diff --git a/.clang-format b/.clang-format
index 866b7e2117..a8bc4967c3 100644
--- a/.clang-format
+++ b/.clang-format
@@ -1,149 +1,9 @@
 ---
 Language:        Cpp
-# BasedOnStyle:  Google
-# Generated with clang-format 7.0.1
-AccessModifierOffset: -1
-AlignAfterOpenBracket: Align
-AlignConsecutiveAssignments: false
-AlignConsecutiveDeclarations: false
-AlignEscapedNewlines: Left
-AlignOperands:   true
-AlignTrailingComments: true
-AllowAllParametersOfDeclarationOnNextLine: true
-AllowShortBlocksOnASingleLine: false
+BasedOnStyle:  Google
 AllowShortCaseLabelsOnASingleLine: true
-AllowShortFunctionsOnASingleLine: All
-AllowShortIfStatementsOnASingleLine: true
-AllowShortLoopsOnASingleLine: true
-AlwaysBreakAfterDefinitionReturnType: None
-AlwaysBreakAfterReturnType: None
-AlwaysBreakBeforeMultilineStrings: true
-AlwaysBreakTemplateDeclarations: true
-BinPackArguments: true
-BinPackParameters: true
-BraceWrapping:
-  AfterClass:      false
-  AfterControlStatement: false
-  AfterEnum:       false
-  AfterFunction:   false
-  AfterNamespace:  false
-  AfterObjCDeclaration: false
-  AfterStruct:     false
-  AfterUnion:      false
-  AfterExternBlock: false
-  BeforeCatch:     false
-  BeforeElse:      false
-  IndentBraces:    false
-  SplitEmptyFunction: true
-  SplitEmptyRecord: true
-  SplitEmptyNamespace: true
-BreakBeforeBinaryOperators: None
-BreakBeforeBraces: Attach
-BreakBeforeInheritanceComma: false
-BreakInheritanceList: BeforeColon
-BreakBeforeTernaryOperators: true
-BreakConstructorInitializersBeforeComma: false
-BreakConstructorInitializers: BeforeColon
-BreakAfterJavaFieldAnnotations: false
-BreakStringLiterals: true
-ColumnLimit:     80
-CommentPragmas:  '^ IWYU pragma:'
-CompactNamespaces: false
 ConstructorInitializerAllOnOneLineOrOnePerLine: false
-ConstructorInitializerIndentWidth: 4
-ContinuationIndentWidth: 4
 Cpp11BracedListStyle: false
 DerivePointerAlignment: false
-DisableFormat:   false
-ExperimentalAutoDetectBinPacking: false
-FixNamespaceComments: true
-ForEachMacros:
-  - foreach
-  - Q_FOREACH
-  - BOOST_FOREACH
-IncludeBlocks:   Preserve
-IncludeCategories:
-  - Regex:           '^<ext/.*\.h>'
-    Priority:        2
-  - Regex:           '^<.*\.h>'
-    Priority:        1
-  - Regex:           '^<.*'
-    Priority:        2
-  - Regex:           '.*'
-    Priority:        3
-IncludeIsMainRegex: '([-_](test|unittest))?$'
-IndentCaseLabels: true
-IndentPPDirectives: None
-IndentWidth:     2
-IndentWrappedFunctionNames: false
-JavaScriptQuotes: Leave
-JavaScriptWrapImports: true
-KeepEmptyLinesAtTheStartOfBlocks: false
-MacroBlockBegin: ''
-MacroBlockEnd:   ''
-MaxEmptyLinesToKeep: 1
-NamespaceIndentation: None
-ObjCBinPackProtocolList: Never
-ObjCBlockIndentWidth: 2
-ObjCSpaceAfterProperty: false
-ObjCSpaceBeforeProtocolList: false
-PenaltyBreakAssignment: 2
-PenaltyBreakBeforeFirstCallParameter: 1
-PenaltyBreakComment: 300
-PenaltyBreakFirstLessLess: 120
-PenaltyBreakTemplateDeclaration: 10
-PenaltyBreakString: 1000
-PenaltyExcessCharacter: 1000000
-PenaltyReturnTypeOnItsOwnLine: 200
 PointerAlignment: Right
-RawStringFormats:
-  - Language:        Cpp
-    Delimiters:
-      - cc
-      - CC
-      - cpp
-      - Cpp
-      - CPP
-      - 'c++'
-      - 'C++'
-    CanonicalDelimiter: ''
-    BasedOnStyle:    google
-  - Language:        TextProto
-    Delimiters:
-      - pb
-      - PB
-      - proto
-      - PROTO
-    EnclosingFunctions:
-      - EqualsProto
-      - EquivToProto
-      - PARSE_PARTIAL_TEXT_PROTO
-      - PARSE_TEST_PROTO
-      - PARSE_TEXT_PROTO
-      - ParseTextOrDie
-      - ParseTextProtoOrDie
-    CanonicalDelimiter: ''
-    BasedOnStyle:    google
-ReflowComments:  true
 SortIncludes:    false
-SortUsingDeclarations: true
-SpaceAfterCStyleCast: false
-SpaceAfterTemplateKeyword: true
-SpaceBeforeAssignmentOperators: true
-SpaceBeforeCpp11BracedList: false
-SpaceBeforeCtorInitializerColon: true
-SpaceBeforeInheritanceColon: true
-SpaceBeforeParens: ControlStatements
-SpaceBeforeRangeBasedForLoopColon: true
-SpaceInEmptyParentheses: false
-SpacesBeforeTrailingComments: 2
-SpacesInAngles:  false
-SpacesInContainerLiterals: true
-SpacesInCStyleCastParentheses: false
-SpacesInParentheses: false
-SpacesInSquareBrackets: false
-Standard:        Auto
-TabWidth:        8
-UseTab:          Never
-...
-
diff --git a/.gitignore b/.gitignore
index 5f26835386..99eeb92d0b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,8 +7,10 @@
 *.o
 *~
 .cproject
+.idea
 .project
 .settings
+.vscode
 /*-*.mk
 /*.asm
 /*.doxy
diff --git a/.mailmap b/.mailmap
index 8d97500994..bb0ddd95b2 100644
--- a/.mailmap
+++ b/.mailmap
@@ -12,6 +12,8 @@ Deb Mukherjee <debargha@google.com>
 Elliott Karpilovsky <elliottk@google.com>
 Erik Niemeyer <erik.a.niemeyer@intel.com> <erik.a.niemeyer@gmail.com>
 Fyodor Kyslov <kyslov@google.com>
+Gregor Jasny <gjasny@gmail.com>
+Gregor Jasny <gjasny@gmail.com> <gjasny@googlemail.com>
 Guillaume Martres <gmartres@google.com> <smarter3@gmail.com>
 Hangyu Kuang <hkuang@google.com>
 Hui Su <huisu@google.com>
@@ -19,10 +21,11 @@ Jacky Chen <jackychen@google.com>
 Jim Bankoski <jimbankoski@google.com>
 Johann Koenig <johannkoenig@google.com>
 Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>
-Johann Koenig <johannkoenig@google.com> <johann.koenig@gmail.com>
 Johann Koenig <johannkoenig@google.com> <johannkoenig@chromium.org>
+Johann <johann@duck.com> <johann.koenig@gmail.com>
 John Koleszar <jkoleszar@google.com>
 Joshua Litt <joshualitt@google.com> <joshualitt@chromium.org>
+Konstantinos Margaritis <konma@vectorcamp.gr> <konstantinos@vectorcamp.gr>
 Marco Paniconi <marpan@google.com>
 Marco Paniconi <marpan@google.com> <marpan@chromium.org>
 Martin Storsjö <martin@martin.st>
diff --git a/AUTHORS b/AUTHORS
index 352c91feda..2db4a113e4 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -3,6 +3,7 @@
 
 Aaron Watry <awatry@gmail.com>
 Abo Talib Mahfoodh <ab.mahfoodh@gmail.com>
+Adam B. Goode <adam.mckee84@gmail.com>
 Adrian Grange <agrange@google.com>
 Ahmad Sharif <asharif@google.com>
 Aidan Welch <aidansw@yahoo.com>
@@ -20,11 +21,14 @@ Andoni Morales Alastruey <ylatuya@gmail.com>
 Andres Mejia <mcitadel@gmail.com>
 Andrew Lewis <andrewlewis@google.com>
 Andrew Russell <anrussell@google.com>
+Andrew Salkeld <andrew.salkeld@arm.com>
 Angie Chen <yunqi@google.com>
 Angie Chiang <angiebird@google.com>
+Anton Venema <anton.venema@liveswitch.com>
 Aron Rosenberg <arosenberg@logitech.com>
 Attila Nagy <attilanagy@google.com>
 Birk Magnussen <birk.magnussen@googlemail.com>
+Bohan Li <bohanli@google.com>
 Brian Foley <bpfoley@google.com>
 Brion Vibber <bvibber@wikimedia.org>
 changjun.yang <changjun.yang@intel.com>
@@ -34,9 +38,11 @@ Chi Yo Tsai <chiyotsai@google.com>
 chm <chm@rock-chips.com>
 Chris Cunningham <chcunningham@chromium.org>
 Christian Duvivier <cduvivier@google.com>
+Chunbo Hua <chunbo.hua@intel.com>
 Clement Courbet <courbet@google.com>
 Daniele Castagna <dcastagna@chromium.org>
 Daniel Kang <ddkang@google.com>
+Daniel Sommermann <dcsommer@gmail.com>
 Dan Zhu <zxdan@google.com>
 Deb Mukherjee <debargha@google.com>
 Deepa K G <deepa.kg@ittiam.com>
@@ -64,15 +70,19 @@ Guillermo Ballester Valor <gbvalor@gmail.com>
 Hangyu Kuang <hkuang@google.com>
 Hanno Böck <hanno@hboeck.de>
 Han Shen <shenhan@google.com>
+Hao Chen <chenhao@loongson.cn>
 Harish Mahendrakar <harish.mahendrakar@ittiam.com>
 Henrik Lundin <hlundin@google.com>
 Hien Ho <hienho@google.com>
+Hirokazu Honda <hiroh@chromium.org>
 Hui Su <huisu@google.com>
+Ilya Kurdyukov <jpegqs@gmail.com>
 Ivan Krasin <krasin@chromium.org>
 Ivan Maltz <ivanmaltz@google.com>
 Jacek Caban <cjacek@gmail.com>
 Jacky Chen <jackychen@google.com>
 James Berry <jamesberry@google.com>
+James Touton <bekenn@gmail.com>
 James Yu <james.yu@linaro.org>
 James Zern <jzern@google.com>
 Jan Gerber <j@mailb.org>
@@ -82,16 +92,22 @@ Jean-Yves Avenard <jyavenard@mozilla.com>
 Jeff Faust <jfaust@google.com>
 Jeff Muizelaar <jmuizelaar@mozilla.com>
 Jeff Petkau <jpet@chromium.org>
+Jeremy Leconte <jleconte@google.com>
 Jerome Jiang <jianj@google.com>
 Jia Jia <jia.jia@linaro.org>
+Jianhui Dai <jianhui.j.dai@intel.com>
 Jian Zhou <zhoujian@google.com>
 Jim Bankoski <jimbankoski@google.com>
+jinbo <jinbo-hf@loongson.cn>
+Jin Bo <jinbo@loongson.cn>
 Jingning Han <jingning@google.com>
+Joel Fernandes <joelaf@google.com>
 Joey Parrish <joeyparrish@google.com>
 Johann Koenig <johannkoenig@google.com>
 John Koleszar <jkoleszar@google.com>
 Johnny Klonaris <google@jawknee.com>
 John Stark <jhnstrk@gmail.com>
+Jonathan Wright <jonathan.wright@arm.com>
 Jon Kunkee <jkunkee@microsoft.com>
 Jorge E. Moreira <jemoreira@google.com>
 Joshua Bleecher Snyder <josh@treelinelabs.com>
@@ -101,6 +117,7 @@ Justin Clift <justin@salasaga.org>
 Justin Lebar <justin.lebar@gmail.com>
 Kaustubh Raste <kaustubh.raste@imgtec.com>
 KO Myung-Hun <komh@chollian.net>
+Konstantinos Margaritis <konma@vectorcamp.gr>
 Kyle Siefring <kylesiefring@gmail.com>
 Lawrence Velázquez <larryv@macports.org>
 Linfeng Zhang <linfengz@google.com>
@@ -108,6 +125,7 @@ Liu Peng <pengliu.mail@gmail.com>
 Lou Quillio <louquillio@google.com>
 Luca Barbato <lu_zero@gentoo.org>
 Luc Trudeau <luc@trud.ca>
+Lu Wang <wanglu@loongson.cn>
 Makoto Kato <makoto.kt@gmail.com>
 Mans Rullgard <mans@mansr.com>
 Marco Paniconi <marpan@google.com>
@@ -121,6 +139,7 @@ Michael Kohler <michaelkohler@live.com>
 Mike Frysinger <vapier@chromium.org>
 Mike Hommey <mhommey@mozilla.com>
 Mikhal Shemer <mikhal@google.com>
+Mikko Koivisto <mikko.koivisto@unikie.com>
 Min Chen <chenm003@gmail.com>
 Minghai Shang <minghai@google.com>
 Min Ye <yeemmi@google.com>
@@ -141,6 +160,7 @@ Pengchong Jin <pengchong@google.com>
 Peter Boström <pbos@chromium.org>
 Peter Collingbourne <pcc@chromium.org>
 Peter de Rivaz <peter.derivaz@gmail.com>
+Peter Kasting <pkasting@chromium.org>
 Philip Jägenstedt <philipj@opera.com>
 Priit Laes <plaes@plaes.org>
 Rafael Ávila de Espíndola <rafael.espindola@gmail.com>
@@ -156,7 +176,9 @@ Rob Bradford <rob@linux.intel.com>
 Ronald S. Bultje <rsbultje@gmail.com>
 Rui Ueyama <ruiu@google.com>
 Sai Deng <sdeng@google.com>
+Salome Thirot <salome.thirot@arm.com>
 Sami Pietilä <samipietila@google.com>
+Sam James <sam@gentoo.org>
 Sarah Parker <sarahparker@google.com>
 Sasi Inguva <isasi@google.com>
 Scott Graham <scottmg@chromium.org>
@@ -195,6 +217,7 @@ xiwei gu <guxiwei-hf@loongson.cn>
 Yaowu Xu <yaowu@google.com>
 Yi Luo <luoyi@google.com>
 Yongzhe Wang <yongzhe@google.com>
+yuanhecai <yuanhecai@loongson.cn>
 Yue Chen <yuec@google.com>
 Yun Liu <yliuyliu@google.com>
 Yunqing Wang <yunqingwang@google.com>
diff --git a/CHANGELOG b/CHANGELOG
index e731fc6121..21070785ed 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,8 +1,150 @@
+20yy-mm-dd v1.14.0 "V Duck"
+  This release drops support for old C compilers, such as Visual Studio 2012
+  and older, that disallow mixing variable declarations and statements (a C99
+  feature).
+
+2023-09-29 v1.13.1 "Ugly Duckling"
+  This release contains two security related fixes. One each for VP8 and VP9.
+
+  - Upgrading:
+    This release is ABI compatible with the previous release.
+
+  - Bug fixes:
+    https://crbug.com/1486441 (CVE-2023-5217)
+    Fix to a crash related to VP9 encoding (#1642, CVE-2023-6349)
+
+2023-01-31 v1.13.0 "Ugly Duckling"
+  This release includes more Neon and AVX2 optimizations, adds a new codec
+  control to set per frame QP, upgrades GoogleTest to v1.12.1, and includes
+  numerous bug fixes.
+
+  - Upgrading:
+    This release is ABI incompatible with the previous release.
+
+    New codec control VP9E_SET_QUANTIZER_ONE_PASS to set per frame QP.
+
+    GoogleTest is upgraded to v1.12.1.
+
+    .clang-format is upgraded to clang-format-11.
+
+    VPX_EXT_RATECTRL_ABI_VERSION was bumped due to incompatible changes to the
+    feature of using external rate control models for vp9.
+
+  - Enhancement:
+    Numerous improvements on Neon optimizations.
+    Numerous improvements on AVX2 optimizations.
+    Additional ARM targets added for Visual Studio.
+
+  - Bug fixes:
+    Fix to calculating internal stats when frame dropped.
+    Fix to segfault for external resize test in vp9.
+    Fix to build system with replacing egrep with grep -E.
+    Fix to a few bugs with external RTC rate control library.
+    Fix to make SVC work with VBR.
+    Fix to key frame setting in VP9 external RC.
+    Fix to -Wimplicit-int (Clang 16).
+    Fix to VP8 external RC for buffer levels.
+    Fix to VP8 external RC for dynamic update of layers.
+    Fix to VP9 auto level.
+    Fix to off-by-one error of max w/h in validate_config.
+    Fix to make SVC work for Profile 1.
+
+2022-06-17 v1.12.0 "Torrent Duck"
+  This release adds optimizations for Loongarch, adds support for vp8 in the
+  real-time rate control library, upgrades GoogleTest to v1.11.0, updates
+  libwebm to libwebm-1.0.0.28-20-g206d268, and includes numerous bug fixes.
+
+  - Upgrading:
+    This release is ABI compatible with the previous release.
+
+    vp8 support in the real-time rate control library.
+    New codec control VP8E_SET_RTC_EXTERNAL_RATECTRL is added.
+
+    Configure support for darwin21 is added.
+
+    GoogleTest is upgraded to v1.11.0.
+
+    libwebm is updated to libwebm-1.0.0.28-20-g206d268.
+
+    Allow SimpleEncode environment to take target level as input to match
+    the level conformance in vp9.
+
+  - Enhancement:
+    Numerous improvements on checking memory allocations.
+    Optimizations for Loongarch.
+    Code clean-up.
+
+  - Bug fixes:
+    Fix to a crash related to {vp8/vp9}_set_roi_map.
+    Fix to compiling failure with -Wformat-nonliteral.
+    Fix to integer overflow with vp9 with high resolution content.
+    Fix to AddNoiseTest failure with ARMv7.
+    Fix to libvpx Null-dereference READ in vp8.
+
+2021-09-27 v1.11.0 "Smew Duck"
+  This maintenance release adds support for VBR mode in VP9 rate control
+  interface, new codec controls to get quantization parameters and loop filter
+  levels, and includes several improvements to NEON and numerous bug fixes.
+
+  - Upgrading:
+    This release is ABI incompatible with the previous release.
+    New codec control is added to get quantization parameters and loop filter
+    levels.
+
+    VBR mode is supported in VP9 rate control library.
+
+  - Enhancement:
+    Numerous improvements for Neon optimizations.
+    Code clean-up and refactoring.
+    Calculation of rd multiplier is changed with BDRATE gains.
+
+  - Bug fixes:
+    Fix to overflow on duration.
+    Fix to several instances of -Wunused-but-set-variable.
+    Fix to avoid chroma resampling for 420mpeg2 input.
+    Fix to overflow in calc_iframe_target_size.
+    Fix to disallow skipping transform and quantization.
+    Fix some -Wsign-compare warnings in simple_encode.
+    Fix input file path in simple_encode_test.
+    Fix valid range for under/over_shoot pct.
+
+2021-03-09 v1.10.0 "Ruddy Duck"
+  This maintenance release adds support for darwin20 and new codec controls, as
+  well as numerous bug fixes.
+
+  - Upgrading:
+    This release is ABI incompatible with the previous release.
+    New codec control is added to disable loopfilter for VP9.
+
+    New encoder control is added to disable feature to increase Q on overshoot
+    detection for CBR.
+
+    Configure support for darwin20 is added.
+
+    New codec control is added for VP9 rate control. The control ID of this
+    interface is VP9E_SET_EXTERNAL_RATE_CONTROL. To make VP9 use a customized
+    external rate control model, users will have to implement each callback
+    function in vpx_rc_funcs_t and register them using libvpx API
+    vpx_codec_control_() with the control ID.
+
+  - Enhancement:
+    Use -std=gnu++11 instead of -std=c++11 for c++ files.
+
+  - Bug fixes:
+    Override assembler with --as option of configure for MSVS.
+    Fix several compilation issues with gcc 4.8.5.
+    Fix to resetting rate control for temporal layers.
+    Fix to the rate control stats of SVC example encoder when number of spatial
+    layers is 1.
+    Fix to reusing motion vectors from the base spatial layer in SVC.
+    2 pass related flags removed from SVC example encoder.
+
 2020-07-29 v1.9.0 "Quacking Duck"
   This release adds support for NV12, a separate library for rate control, as
   well as incremental improvements.
 
   - Upgrading:
+    This release is ABI compatible with the previous release.
     NV12 support is added to this release.
     A new interface is added for VP9 rate control. The new library libvp9rc.a
     must be linked by applications.
@@ -26,12 +168,14 @@
   This release collects incremental improvements to many aspects of the library.
 
   - Upgrading:
+    This release is ABI compatible with the previous release.
     ARCH_* defines have been removed in favor of VPX_ARCH_*.
 
 2019-07-15 v1.8.1 "Orpington Duck"
   This release collects incremental improvements to many aspects of the library.
 
   - Upgrading:
+    This release is ABI incompatible with the previous release.
     VP8E_SET_CPUUSED now accepts values up to 9 for vp9.
     VPX_CTRL_VP9E_SET_MAX_INTER_BITRATE_PCT had a spelling fix (was VP8E).
     The --sdk-path option has been removed. If you were using it to build for
@@ -50,7 +194,8 @@
   This release focused on encoding performance for realtime and VOD use cases.
 
   - Upgrading:
-    This adds and improves several vp9 controls. Most are related to SVC:
+    This release is ABI incompatible with the previous release. This adds and
+    improves several vp9 controls. Most are related to SVC:
       VP9E_SET_SVC_FRAME_DROP_LAYER:
         - Frame dropping in SVC.
       VP9E_SET_SVC_INTER_LAYER_PRED:
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 577c96a6b5..7a73a30317 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -19,10 +19,9 @@ again.
 
 All submissions, including submissions by project members, require review. We
 use a [Gerrit](https://www.gerritcodereview.com) instance hosted at
-https://chromium-review.googlesource.com for this purpose.
-
-See https://www.webmproject.org/code/contribute/submitting-patches for an
-example of a typical gerrit workflow.
+https://chromium-review.googlesource.com for this purpose. See the
+[WebM Project page](https://www.webmproject.org/code/contribute/submitting-patches/)
+for additional details.
 
 ## Community Guidelines
 
diff --git a/README b/README
index fd4c93a465..4c25b15d81 100644
--- a/README
+++ b/README
@@ -1,4 +1,4 @@
-README - 20 July 2020
+v1.13.1 Ugly Duckling
 
 Welcome to the WebM VP8/VP9 Codec SDK!
 
@@ -10,14 +10,14 @@ COMPILING THE APPLICATIONS/LIBRARIES:
   1. Prerequisites
 
     * All x86 targets require the Yasm[1] assembler be installed[2].
-    * All Windows builds require that Cygwin[3] be installed.
-    * Building the documentation requires Doxygen[4]. If you do not
+    * All Windows builds require that Cygwin[3] or MSYS2[4] be installed.
+    * Building the documentation requires Doxygen[5]. If you do not
       have this package, the install-docs option will be disabled.
-    * Downloading the data for the unit tests requires curl[5] and sha1sum.
+    * Downloading the data for the unit tests requires curl[6] and sha1sum.
       sha1sum is provided via the GNU coreutils, installed by default on
       many *nix platforms, as well as MinGW and Cygwin. If coreutils is not
       available, a compatible version of sha1sum can be built from
-      source[6]. These requirements are optional if not running the unit
+      source[7]. These requirements are optional if not running the unit
       tests.
 
     [1]: http://www.tortall.net/projects/yasm
@@ -26,9 +26,10 @@ COMPILING THE APPLICATIONS/LIBRARIES:
          yasm-<version>-<arch>.exe to yasm.exe and place it in:
          Program Files (x86)/Microsoft Visual Studio/2017/<level>/Common7/Tools/
     [3]: http://www.cygwin.com
-    [4]: http://www.doxygen.org
-    [5]: http://curl.haxx.se
-    [6]: http://www.microbrew.org/tools/md5sha1sum/
+    [4]: http://www.msys2.org/
+    [5]: http://www.doxygen.org
+    [6]: http://curl.haxx.se
+    [7]: http://www.microbrew.org/tools/md5sha1sum/
 
   2. Out-of-tree builds
   Out of tree builds are a supported method of building the application. For
@@ -62,9 +63,17 @@ COMPILING THE APPLICATIONS/LIBRARIES:
 
     arm64-android-gcc
     arm64-darwin-gcc
+    arm64-darwin20-gcc
+    arm64-darwin21-gcc
+    arm64-darwin22-gcc
+    arm64-darwin23-gcc
     arm64-linux-gcc
     arm64-win64-gcc
     arm64-win64-vs15
+    arm64-win64-vs16
+    arm64-win64-vs16-clangcl
+    arm64-win64-vs17
+    arm64-win64-vs17-clangcl
     armv7-android-gcc
     armv7-darwin-gcc
     armv7-linux-rvct
@@ -73,8 +82,12 @@ COMPILING THE APPLICATIONS/LIBRARIES:
     armv7-win32-gcc
     armv7-win32-vs14
     armv7-win32-vs15
+    armv7-win32-vs16
+    armv7-win32-vs17
     armv7s-darwin-gcc
     armv8-linux-gcc
+    loongarch32-linux-gcc
+    loongarch64-linux-gcc
     mips32-linux-gcc
     mips64-linux-gcc
     ppc64le-linux-gcc
@@ -101,6 +114,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
     x86-win32-vs14
     x86-win32-vs15
     x86-win32-vs16
+    x86-win32-vs17
     x86_64-android-gcc
     x86_64-darwin9-gcc
     x86_64-darwin10-gcc
@@ -113,6 +127,10 @@ COMPILING THE APPLICATIONS/LIBRARIES:
     x86_64-darwin17-gcc
     x86_64-darwin18-gcc
     x86_64-darwin19-gcc
+    x86_64-darwin20-gcc
+    x86_64-darwin21-gcc
+    x86_64-darwin22-gcc
+    x86_64-darwin23-gcc
     x86_64-iphonesimulator-gcc
     x86_64-linux-gcc
     x86_64-linux-icc
@@ -121,6 +139,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
     x86_64-win64-vs14
     x86_64-win64-vs15
     x86_64-win64-vs16
+    x86_64-win64-vs17
     generic-gnu
 
   The generic-gnu target, in conjunction with the CROSS environment variable,
diff --git a/args.c b/args.c
index a87b138b9d..0a9631e1f4 100644
--- a/args.c
+++ b/args.c
@@ -16,8 +16,10 @@
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/msvc.h"
 
-#if defined(__GNUC__) && __GNUC__
-extern void die(const char *fmt, ...) __attribute__((noreturn));
+#if defined(__GNUC__)
+__attribute__((noreturn)) extern void die(const char *fmt, ...);
+#elif defined(_MSC_VER)
+__declspec(noreturn) extern void die(const char *fmt, ...);
 #else
 extern void die(const char *fmt, ...);
 #endif
@@ -81,6 +83,7 @@ const char *arg_next(struct arg *arg) {
 
 char **argv_dup(int argc, const char **argv) {
   char **new_argv = malloc((argc + 1) * sizeof(*argv));
+  if (!new_argv) return NULL;
 
   memcpy(new_argv, argv, argc * sizeof(*argv));
   new_argv[argc] = NULL;
@@ -132,7 +135,6 @@ unsigned int arg_parse_uint(const struct arg *arg) {
   }
 
   die("Option %s: Invalid character '%c'\n", arg->name, *endptr);
-  return 0;
 }
 
 int arg_parse_int(const struct arg *arg) {
@@ -149,7 +151,6 @@ int arg_parse_int(const struct arg *arg) {
   }
 
   die("Option %s: Invalid character '%c'\n", arg->name, *endptr);
-  return 0;
 }
 
 struct vpx_rational {
@@ -206,7 +207,6 @@ int arg_parse_enum(const struct arg *arg) {
     if (!strcmp(arg->val, listptr->name)) return listptr->val;
 
   die("Option %s: Invalid value '%s'\n", arg->name, arg->val);
-  return 0;
 }
 
 int arg_parse_enum_or_int(const struct arg *arg) {
diff --git a/build/make/Android.mk b/build/make/Android.mk
index 6cb3af027b..ba24f541b1 100644
--- a/build/make/Android.mk
+++ b/build/make/Android.mk
@@ -8,6 +8,8 @@
 ##  be found in the AUTHORS file in the root of the source tree.
 ##
 
+# Ignore this file during non-NDK builds.
+ifdef NDK_ROOT
 #
 # This file is to be used for compiling libvpx for Android using the NDK.
 # In an Android project place a libvpx checkout in the jni directory.
@@ -166,6 +168,9 @@ LOCAL_CFLAGS += \
     -I$(ASM_CNV_PATH)/libvpx
 
 LOCAL_MODULE := libvpx
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-BSD
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_NOTICE_FILE := $(LOCAL_PATH)/../../LICENSE $(LOCAL_PATH)/../../PATENTS
 
 ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes)
   LOCAL_STATIC_LIBRARIES := cpufeatures
@@ -209,3 +214,4 @@ endif
 ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes)
 $(call import-module,android/cpufeatures)
 endif
+endif  # NDK_ROOT
diff --git a/build/make/Makefile b/build/make/Makefile
index be28102295..199ed78058 100644
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -21,9 +21,9 @@ all: .DEFAULT
 clean:: .DEFAULT
 exampletest: .DEFAULT
 install:: .DEFAULT
-test:: .DEFAULT
-test-no-data-check:: .DEFAULT
-testdata:: .DEFAULT
+test: .DEFAULT
+test-no-data-check: .DEFAULT
+testdata: .DEFAULT
 utiltest: .DEFAULT
 exampletest-no-data-check utiltest-no-data-check: .DEFAULT
 test_%: .DEFAULT ;
@@ -111,13 +111,13 @@ exampletest:
 .PHONY: install
 install::
 .PHONY: test
-test::
+test:
 .PHONY: testdata
-testdata::
+testdata:
 .PHONY: utiltest
 utiltest:
 .PHONY: test-no-data-check exampletest-no-data-check utiltest-no-data-check
-test-no-data-check::
+test-no-data-check:
 exampletest-no-data-check utiltest-no-data-check:
 
 # Force to realign stack always on OS/2
@@ -143,10 +143,28 @@ $(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2
 $(BUILD_PFX)%_avx512.c.d: CFLAGS += -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl
 $(BUILD_PFX)%_avx512.c.o: CFLAGS += -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl
 
+# AARCH64
+$(BUILD_PFX)%_neon_dotprod.c.d: CFLAGS += -march=armv8.2-a+dotprod
+$(BUILD_PFX)%_neon_dotprod.c.o: CFLAGS += -march=armv8.2-a+dotprod
+$(BUILD_PFX)%_neon_i8mm.c.d: CFLAGS += -march=armv8.2-a+dotprod+i8mm
+$(BUILD_PFX)%_neon_i8mm.c.o: CFLAGS += -march=armv8.2-a+dotprod+i8mm
+$(BUILD_PFX)%_sve.c.d: CFLAGS += -march=armv8.2-a+dotprod+i8mm+sve
+$(BUILD_PFX)%_sve.c.o: CFLAGS += -march=armv8.2-a+dotprod+i8mm+sve
+
 # POWER
 $(BUILD_PFX)%_vsx.c.d: CFLAGS += -maltivec -mvsx
 $(BUILD_PFX)%_vsx.c.o: CFLAGS += -maltivec -mvsx
 
+# MIPS
+$(BUILD_PFX)%_msa.c.d: CFLAGS += -mmsa
+$(BUILD_PFX)%_msa.c.o: CFLAGS += -mmsa
+
+# LOONGARCH
+$(BUILD_PFX)%_lsx.c.d:  CFLAGS += -mlsx
+$(BUILD_PFX)%_lsx.c.o:  CFLAGS += -mlsx
+$(BUILD_PFX)%_lasx.c.d: CFLAGS += -mlasx
+$(BUILD_PFX)%_lasx.c.o: CFLAGS += -mlasx
+
 $(BUILD_PFX)%.c.d: %.c
 	$(if $(quiet),@echo "    [DEP] $@")
 	$(qexec)mkdir -p $(dir $@)
@@ -294,6 +312,19 @@ $(1):
 	$(qexec)$$(AR) $$(ARFLAGS) $$@ $$^
 endef
 
+# Don't use -Wl,-z,defs with Clang's sanitizers.
+#
+# Clang's AddressSanitizer documentation says "When linking shared libraries,
+# the AddressSanitizer run-time is not linked, so -Wl,-z,defs may cause link
+# errors (don't use it with AddressSanitizer)." See
+# https://clang.llvm.org/docs/AddressSanitizer.html#usage.
+NO_UNDEFINED := -Wl,-z,defs
+ifeq ($(findstring clang,$(CC)),clang)
+    ifneq ($(filter -fsanitize=%,$(LDFLAGS)),)
+        NO_UNDEFINED :=
+    endif
+endif
+
 define so_template
 # Not using a pattern rule here because we don't want to generate empty
 # archives when they are listed as a dependency in files not responsible
@@ -303,7 +334,8 @@ define so_template
 $(1):
 	$(if $(quiet),@echo "    [LD] $$@")
 	$(qexec)$$(LD) -shared $$(LDFLAGS) \
-            -Wl,--no-undefined -Wl,-soname,$$(SONAME) \
+            $(NO_UNDEFINED) \
+            -Wl,-soname,$$(SONAME) \
             -Wl,--version-script,$$(EXPORTS_FILE) -o $$@ \
             $$(filter %.o,$$^) $$(extralibs)
 endef
@@ -455,6 +487,6 @@ INSTALL_TARGETS += .install-docs .install-srcs .install-libs .install-bins
 all: $(BUILD_TARGETS)
 install:: $(INSTALL_TARGETS)
 dist: $(INSTALL_TARGETS)
-test::
+test:
 
 .SUFFIXES:  # Delete default suffix rules
diff --git a/build/make/ads2armasm_ms.pl b/build/make/ads2armasm_ms.pl
index 2a2c470ff8..dd4e0318c4 100755
--- a/build/make/ads2armasm_ms.pl
+++ b/build/make/ads2armasm_ms.pl
@@ -28,7 +28,7 @@
     s/qsubaddx/qsax/i;
     s/qaddsubx/qasx/i;
 
-    thumb::FixThumbInstructions($_, 1);
+    thumb::FixThumbInstructions($_);
 
     s/ldrneb/ldrbne/i;
     s/ldrneh/ldrhne/i;
diff --git a/build/make/ads2gas.pl b/build/make/ads2gas.pl
index b6a8f53eae..c301b7f829 100755
--- a/build/make/ads2gas.pl
+++ b/build/make/ads2gas.pl
@@ -32,7 +32,7 @@
 
 print "@ This file was created from a .asm file\n";
 print "@  using the ads2gas.pl script.\n";
-print "\t.syntax unified\n";
+print ".syntax unified\n";
 if ($thumb) {
     print "\t.thumb\n";
 }
@@ -42,39 +42,11 @@
 
 while (<STDIN>)
 {
-    undef $comment;
-    undef $line;
-    $comment_char = ";";
-    $comment_sub = "@";
-
-    # Handle comments.
-    if (/$comment_char/)
-    {
-      $comment = "";
-      ($line, $comment) = /(.*?)$comment_char(.*)/;
-      $_ = $line;
-    }
-
     # Load and store alignment
     s/@/,:/g;
 
-    # Hexadecimal constants prefaced by 0x
-    s/#&/#0x/g;
-
-    # Convert :OR: to |
-    s/:OR:/ | /g;
-
-    # Convert :AND: to &
-    s/:AND:/ & /g;
-
-    # Convert :NOT: to ~
-    s/:NOT:/ ~ /g;
-
-    # Convert :SHL: to <<
-    s/:SHL:/ << /g;
-
-    # Convert :SHR: to >>
-    s/:SHR:/ >> /g;
+    # Comment character
+    s/;/@/;
 
     # Convert ELSE to .else
     s/\bELSE\b/.else/g;
@@ -82,105 +54,51 @@
     # Convert ENDIF to .endif
     s/\bENDIF\b/.endif/g;
 
-    # Convert ELSEIF to .elseif
-    s/\bELSEIF\b/.elseif/g;
-
-    # Convert LTORG to .ltorg
-    s/\bLTORG\b/.ltorg/g;
-
-    # Convert endfunc to nothing.
-    s/\bendfunc\b//ig;
-
-    # Convert FUNCTION to nothing.
-    s/\bFUNCTION\b//g;
-    s/\bfunction\b//g;
-
-    s/\bENTRY\b//g;
-    s/\bMSARMASM\b/0/g;
-    s/^\s+end\s+$//g;
-
-    # Convert IF :DEF:to .if
-    # gcc doesn't have the ability to do a conditional
-    # if defined variable that is set by IF :DEF: on
-    # armasm, so convert it to a normal .if and then
-    # make sure to define a value elesewhere
-    if (s/\bIF :DEF:\b/.if /g)
-    {
-        s/=/==/g;
-    }
-
     # Convert IF to .if
-    if (s/\bIF\b/.if/g)
-    {
+    if (s/\bIF\b/.if/g) {
         s/=+/==/g;
     }
 
     # Convert INCLUDE to .INCLUDE "file"
-    s/INCLUDE(\s*)(.*)$/.include $1\"$2\"/;
-
-    # Code directive (ARM vs Thumb)
-    s/CODE([0-9][0-9])/.code $1/;
+    s/INCLUDE\s?(.*)$/.include \"$1\"/;
 
     # No AREA required
     # But ALIGNs in AREA must be obeyed
-    s/^\s*AREA.*ALIGN=([0-9])$/.text\n.p2align $1/;
+    s/^(\s*)\bAREA\b.*ALIGN=([0-9])$/$1.text\n$1.p2align $2/;
     # If no ALIGN, strip the AREA and align to 4 bytes
-    s/^\s*AREA.*$/.text\n.p2align 2/;
-
-    # DCD to .word
-    # This one is for incoming symbols
-    s/DCD\s+\|(\w*)\|/.long $1/;
-
-    # DCW to .short
-    s/DCW\s+\|(\w*)\|/.short $1/;
-    s/DCW(.*)/.short $1/;
+    s/^(\s*)\bAREA\b.*$/$1.text\n$1.p2align 2/;
 
-    # Constants defined in scope
-    s/DCD(.*)/.long $1/;
-    s/DCB(.*)/.byte $1/;
-
-    # Make function visible to linker, and make additional symbol with
-    # prepended underscore
+    # Make function visible to linker.
     if ($elf) {
-        s/EXPORT\s+\|([\$\w]*)\|/.global $1 \n\t.type $1, function/;
+        s/(\s*)EXPORT\s+\|([\$\w]*)\|/$1.global $2\n$1.type $2, function/;
     } else {
-        s/EXPORT\s+\|([\$\w]*)\|/.global $1/;
+        s/(\s*)EXPORT\s+\|([\$\w]*)\|/$1.global $2/;
     }
-    s/IMPORT\s+\|([\$\w]*)\|/.global $1/;
-
-    s/EXPORT\s+([\$\w]*)/.global $1/;
-    s/export\s+([\$\w]*)/.global $1/;
 
-    # No vertical bars required; make additional symbol with prepended
-    # underscore
-    s/^\|(\$?\w+)\|/_$1\n\t$1:/g;
+    # No vertical bars on function names
+    s/^\|(\$?\w+)\|/$1/g;
 
     # Labels need trailing colon
-#   s/^(\w+)/$1:/ if !/EQU/;
-    # put the colon at the end of the line in the macro
     s/^([a-zA-Z_0-9\$]+)/$1:/ if !/EQU/;
 
     # ALIGN directive
     s/\bALIGN\b/.balign/g;
 
     if ($thumb) {
-        # ARM code - we force everything to thumb with the declaration in the header
-        s/\sARM//g;
+        # ARM code - we force everything to thumb with the declaration in the
+        # header
+        s/\bARM\b//g;
     } else {
         # ARM code
-        s/\sARM/.arm/g;
+        s/\bARM\b/.arm/g;
     }
 
     # push/pop
     s/(push\s+)(r\d+)/stmdb sp\!, \{$2\}/g;
     s/(pop\s+)(r\d+)/ldmia sp\!, \{$2\}/g;
 
-    # NEON code
-    s/(vld1.\d+\s+)(q\d+)/$1\{$2\}/g;
-    s/(vtbl.\d+\s+[^,]+),([^,]+)/$1,\{$2\}/g;
-
     if ($thumb) {
-        thumb::FixThumbInstructions($_, 0);
+        thumb::FixThumbInstructions($_);
     }
 
     # eabi_attributes numerical equivalents can be found in the
@@ -188,31 +106,31 @@
 
     if ($elf) {
         # REQUIRE8 Stack is required to be 8-byte aligned
-        s/\sREQUIRE8/.eabi_attribute 24, 1 \@Tag_ABI_align_needed/g;
+        s/\bREQUIRE8\b/.eabi_attribute 24, 1 \@Tag_ABI_align_needed/g;
 
         # PRESERVE8 Stack 8-byte align is preserved
-        s/\sPRESERVE8/.eabi_attribute 25, 1 \@Tag_ABI_align_preserved/g;
+        s/\bPRESERVE8\b/.eabi_attribute 25, 1 \@Tag_ABI_align_preserved/g;
     } else {
-        s/\sREQUIRE8//;
-        s/\sPRESERVE8//;
+        s/\bREQUIRE8\b//;
+        s/\bPRESERVE8\b//;
     }
 
     # Use PROC and ENDP to give the symbols a .size directive.
     # This makes them show up properly in debugging tools like gdb and valgrind.
-    if (/\bPROC\b/)
-    {
+    if (/\bPROC\b/) {
         my $proc;
-        /^_([\.0-9A-Z_a-z]\w+)\b/;
+        # Match the function name so it can be stored in $proc
+        /^([\.0-9A-Z_a-z]\w+)\b/;
         $proc = $1;
         push(@proc_stack, $proc) if ($proc);
         s/\bPROC\b/@ $&/;
     }
-    if (/\bENDP\b/)
-    {
+
+    if (/\bENDP\b/) {
         my $proc;
         s/\bENDP\b/@ $&/;
         $proc = pop(@proc_stack);
-        $_ = "\t.size $proc, .-$proc".$_ if ($proc and $elf);
+        $_ = ".size $proc, .-$proc".$_ if ($proc and $elf);
     }
 
     # EQU directive
@@ -220,19 +138,20 @@
 
     # Begin macro definition
     if (/\bMACRO\b/) {
+        # Process next line down, which will be the macro definition
         $_ = <STDIN>;
         s/^/.macro/;
-        s/\$//g;                # remove formal param reference
-        s/;/@/g;                # change comment characters
+        s/\$//g;             # Remove $ from the variables in the declaration
     }
 
-    # For macros, use \ to reference formal params
-    s/\$/\\/g;                  # End macro definition
-    s/\bMEND\b/.endm/;              # No need to tell it where to stop assembling
+    s/\$/\\/g;               # Use \ to reference formal parameters
+    # End macro definition
+
+    s/\bMEND\b/.endm/;       # No need to tell it where to stop assembling
     next if /^\s*END\s*$/;
+    s/[ \t]+$//;
     print;
-    print "$comment_sub$comment\n" if defined $comment;
 }
 
 # Mark that this object doesn't need an executable stack.
-printf ("\t.section\t.note.GNU-stack,\"\",\%\%progbits\n") if $elf;
+printf ("    .section .note.GNU-stack,\"\",\%\%progbits\n") if $elf;
diff --git a/build/make/ads2gas_apple.pl b/build/make/ads2gas_apple.pl
index 848872fa7d..62491c1918 100755
--- a/build/make/ads2gas_apple.pl
+++ b/build/make/ads2gas_apple.pl
@@ -20,17 +20,14 @@
 
 print "@ This file was created from a .asm file\n";
 print "@  using the ads2gas_apple.pl script.\n\n";
-print "\t.syntax unified\n";
+print ".syntax unified\n";
 
-my %register_aliases;
 my %macro_aliases;
 
 my @mapping_list = ("\$0", "\$1", "\$2", "\$3", "\$4", "\$5", "\$6", "\$7", "\$8", "\$9");
 
 my @incoming_array;
 
-my @imported_functions;
-
 # Perl trim function to remove whitespace from the start and end of the string
 sub trim($)
 {
@@ -46,25 +43,7 @@ ($)
     s/@/,:/g;
 
     # Comment character
-    s/;/ @/g;
-
-    # Hexadecimal constants prefaced by 0x
-    s/#&/#0x/g;
-
-    # Convert :OR: to |
-    s/:OR:/ | /g;
-
-    # Convert :AND: to &
-    s/:AND:/ & /g;
-
-    # Convert :NOT: to ~
-    s/:NOT:/ ~ /g;
-
-    # Convert :SHL: to <<
-    s/:SHL:/ << /g;
-
-    # Convert :SHR: to >>
-    s/:SHR:/ >> /g;
+    s/;/@/;
 
     # Convert ELSE to .else
     s/\bELSE\b/.else/g;
@@ -72,131 +51,64 @@ ($)
     # Convert ENDIF to .endif
     s/\bENDIF\b/.endif/g;
 
-    # Convert ELSEIF to .elseif
-    s/\bELSEIF\b/.elseif/g;
-
-    # Convert LTORG to .ltorg
-    s/\bLTORG\b/.ltorg/g;
-
-    # Convert IF :DEF:to .if
-    # gcc doesn't have the ability to do a conditional
-    # if defined variable that is set by IF :DEF: on
-    # armasm, so convert it to a normal .if and then
-    # make sure to define a value elesewhere
-    if (s/\bIF :DEF:\b/.if /g)
-    {
-        s/=/==/g;
-    }
-
     # Convert IF to .if
-    if (s/\bIF\b/.if/g)
-    {
-        s/=/==/g;
+    if (s/\bIF\b/.if/g) {
+        s/=+/==/g;
     }
 
     # Convert INCLUDE to .INCLUDE "file"
-    s/INCLUDE(\s*)(.*)$/.include $1\"$2\"/;
-
-    # Code directive (ARM vs Thumb)
-    s/CODE([0-9][0-9])/.code $1/;
+    s/INCLUDE\s?(.*)$/.include \"$1\"/;
 
     # No AREA required
     # But ALIGNs in AREA must be obeyed
-    s/^\s*AREA.*ALIGN=([0-9])$/.text\n.p2align $1/;
+    s/^(\s*)\bAREA\b.*ALIGN=([0-9])$/$1.text\n$1.p2align $2/;
     # If no ALIGN, strip the AREA and align to 4 bytes
-    s/^\s*AREA.*$/.text\n.p2align 2/;
-
-    # DCD to .word
-    # This one is for incoming symbols
-    s/DCD\s+\|(\w*)\|/.long $1/;
+    s/^(\s*)\bAREA\b.*$/$1.text\n$1.p2align 2/;
 
-    # DCW to .short
-    s/DCW\s+\|(\w*)\|/.short $1/;
-    s/DCW(.*)/.short $1/;
+    # Make function visible to linker.
+    s/EXPORT\s+\|([\$\w]*)\|/.globl _$1/;
 
-    # Constants defined in scope
-    s/DCD(.*)/.long $1/;
-    s/DCB(.*)/.byte $1/;
+    # No vertical bars on function names
+    s/^\|(\$?\w+)\|/$1/g;
 
-    # Make function visible to linker, and make additional symbol with
-    # prepended underscore
-    s/EXPORT\s+\|([\$\w]*)\|/.globl _$1\n\t.globl $1/;
+    # Labels and functions need a leading underscore and trailing colon
+    s/^([a-zA-Z_0-9\$]+)/_$1:/ if !/EQU/;
 
-    # Prepend imported functions with _
-    if (s/IMPORT\s+\|([\$\w]*)\|/.globl $1/)
-    {
-        $function = trim($1);
-        push(@imported_functions, $function);
-    }
-
-    foreach $function (@imported_functions)
-    {
-        s/$function/_$function/;
-    }
-
-    # No vertical bars required; make additional symbol with prepended
-    # underscore
-    s/^\|(\$?\w+)\|/_$1\n\t$1:/g;
-
-    # Labels need trailing colon
-#   s/^(\w+)/$1:/ if !/EQU/;
-    # put the colon at the end of the line in the macro
-    s/^([a-zA-Z_0-9\$]+)/$1:/ if !/EQU/;
+    # Branches need to call the correct, underscored, function
+    s/^(\s+b[egln]?[teq]?\s+)([a-zA-Z_0-9\$]+)/$1 _$2/ if !/EQU/;
 
     # ALIGN directive
     s/\bALIGN\b/.balign/g;
 
     # Strip ARM
-    s/\sARM/@ ARM/g;
+    s/\s+ARM//;
 
     # Strip REQUIRE8
-    #s/\sREQUIRE8/@ REQUIRE8/g;
-    s/\sREQUIRE8/@ /g;
+    s/\s+REQUIRE8//;
 
     # Strip PRESERVE8
-    s/\sPRESERVE8/@ PRESERVE8/g;
+    s/\s+PRESERVE8//;
 
     # Strip PROC and ENDPROC
-    s/\bPROC\b/@/g;
-    s/\bENDP\b/@/g;
+    s/\bPROC\b//g;
+    s/\bENDP\b//g;
 
     # EQU directive
-    s/(.*)EQU(.*)/.set $1, $2/;
+    s/(\S+\s+)EQU(\s+\S+)/.equ $1, $2/;
 
     # Begin macro definition
-    if (/\bMACRO\b/)
-    {
+    if (/\bMACRO\b/) {
         # Process next line down, which will be the macro definition
         $_ = <STDIN>;
-
-        $trimmed = trim($_);
-
-        # remove commas that are separating list
-        $trimmed =~ s/,//g;
-
-        # string to array
-        @incoming_array = split(/\s+/, $trimmed);
-
-        print ".macro @incoming_array[0]\n";
-
-        # remove the first element, as that is the name of the macro
-        shift (@incoming_array);
-
-        @macro_aliases{@incoming_array} = @mapping_list;
-
-        next;
+        s/^/.macro/;
+        s/\$//g;             # Remove $ from the variables in the declaration
     }
 
-    while (($key, $value) = each(%macro_aliases))
-    {
-        $key =~ s/\$/\\\$/;
-        s/$key\b/$value/g;
-    }
+    s/\$/\\/g;               # Use \ to reference formal parameters
+    # End macro definition
 
-    # For macros, use \ to reference formal params
-#   s/\$/\\/g;                  # End macro definition
-    s/\bMEND\b/.endm/;              # No need to tell it where to stop assembling
+    s/\bMEND\b/.endm/;       # No need to tell it where to stop assembling
     next if /^\s*END\s*$/;
-
+    s/[ \t]+$//;
     print;
 }
diff --git a/build/make/configure.sh b/build/make/configure.sh
index 206b54f775..869793a296 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -262,6 +262,9 @@ if [ -z "$source_path" ] || [ "$source_path" = "." ]; then
   source_path="`pwd`"
   disable_feature source_path_used
 fi
+# Makefiles greedily process the '#' character as a comment, even if it is
+# inside quotes. So, this character must be escaped in all paths in Makefiles.
+source_path_mk=$(echo $source_path | sed -e 's;\#;\\\#;g')
 
 if test ! -z "$TMPDIR" ; then
   TMPDIRx="${TMPDIR}"
@@ -446,6 +449,17 @@ EOF
   fi
 }
 
+check_inline_asm() {
+  log check_inline_asm "$@"
+  name="$1"
+  code="$2"
+  shift 2
+  disable_feature $name
+  check_cc "$@" <<EOF && enable_feature $name
+void foo(void) { __asm__ volatile($code); }
+EOF
+}
+
 write_common_config_banner() {
   print_webm_license config.mk "##" ""
   echo '# This file automatically generated by configure. Do not edit!' >> config.mk
@@ -481,11 +495,11 @@ write_common_target_config_mk() {
 
   cat >> $1 << EOF
 # This file automatically generated by configure. Do not edit!
-SRC_PATH="$source_path"
-SRC_PATH_BARE=$source_path
+SRC_PATH="$source_path_mk"
+SRC_PATH_BARE=$source_path_mk
 BUILD_PFX=${BUILD_PFX}
 TOOLCHAIN=${toolchain}
-ASM_CONVERSION=${asm_conversion_cmd:-${source_path}/build/make/ads2gas.pl}
+ASM_CONVERSION=${asm_conversion_cmd:-${source_path_mk}/build/make/ads2gas.pl}
 GEN_VCPROJ=${gen_vcproj_cmd}
 MSVS_ARCH_DIR=${msvs_arch_dir}
 
@@ -507,6 +521,7 @@ AS_SFX    = ${AS_SFX:-.asm}
 EXE_SFX   = ${EXE_SFX}
 VCPROJ_SFX = ${VCPROJ_SFX}
 RTCD_OPTIONS = ${RTCD_OPTIONS}
+LIBWEBM_CXXFLAGS = ${LIBWEBM_CXXFLAGS}
 LIBYUV_CXXFLAGS = ${LIBYUV_CXXFLAGS}
 EOF
 
@@ -763,6 +778,12 @@ process_common_toolchain() {
       *mips32el*)
         tgt_isa=mips32
         ;;
+      loongarch32*)
+        tgt_isa=loongarch32
+        ;;
+      loongarch64*)
+        tgt_isa=loongarch64
+        ;;
     esac
 
     # detect tgt_os
@@ -771,6 +792,10 @@ process_common_toolchain() {
         tgt_isa=x86_64
         tgt_os=`echo $gcctarget | sed 's/.*\(darwin1[0-9]\).*/\1/'`
         ;;
+      *darwin2[0-3]*)
+        tgt_isa=`uname -m`
+        tgt_os=`echo $gcctarget | sed 's/.*\(darwin2[0-9]\).*/\1/'`
+        ;;
       x86_64*mingw32*)
         tgt_os=win64
         ;;
@@ -818,6 +843,10 @@ process_common_toolchain() {
 
   # Enable the architecture family
   case ${tgt_isa} in
+    arm64 | armv8)
+      enable_feature arm
+      enable_feature aarch64
+      ;;
     arm*)
       enable_feature arm
       ;;
@@ -827,10 +856,21 @@ process_common_toolchain() {
     ppc*)
       enable_feature ppc
       ;;
+    loongarch*)
+      soft_enable lsx
+      soft_enable lasx
+      enable_feature loongarch
+      ;;
   esac
 
-  # PIC is probably what we want when building shared libs
+  # Position independent code (PIC) is probably what we want when building
+  # shared libs or position independent executable (PIE) targets.
   enabled shared && soft_enable pic
+  check_cpp << EOF || soft_enable pic
+#if !(__pie__ || __PIE__)
+#error Neither __pie__ or __PIE__ are set
+#endif
+EOF
 
   # Minimum iOS version for all target platforms (darwin and iphonesimulator).
   # Shared library framework builds are only possible on iOS 8 and later.
@@ -845,7 +885,7 @@ process_common_toolchain() {
   # Handle darwin variants. Newer SDKs allow targeting older
   # platforms, so use the newest one available.
   case ${toolchain} in
-    arm*-darwin*)
+    arm*-darwin-*)
       add_cflags "-miphoneos-version-min=${IOS_VERSION_MIN}"
       iphoneos_sdk_dir="$(show_darwin_sdk_path iphoneos)"
       if [ -d "${iphoneos_sdk_dir}" ]; then
@@ -853,7 +893,7 @@ process_common_toolchain() {
         add_ldflags "-isysroot ${iphoneos_sdk_dir}"
       fi
       ;;
-    x86*-darwin*)
+    *-darwin*)
       osx_sdk_dir="$(show_darwin_sdk_path macosx)"
       if [ -d "${osx_sdk_dir}" ]; then
         add_cflags  "-isysroot ${osx_sdk_dir}"
@@ -911,6 +951,10 @@ process_common_toolchain() {
       add_cflags  "-mmacosx-version-min=10.15"
       add_ldflags "-mmacosx-version-min=10.15"
       ;;
+    *-darwin2[0-2]-*)
+      add_cflags  "-arch ${toolchain%%-*}"
+      add_ldflags "-arch ${toolchain%%-*}"
+      ;;
     *-iphonesimulator-*)
       add_cflags  "-miphoneos-version-min=${IOS_VERSION_MIN}"
       add_ldflags "-miphoneos-version-min=${IOS_VERSION_MIN}"
@@ -932,13 +976,26 @@ process_common_toolchain() {
       ;;
   esac
 
-  # Process ARM architecture variants
+  # Process architecture variants
   case ${toolchain} in
     arm*)
-      # on arm, isa versions are supersets
+      soft_enable runtime_cpu_detect
+      # Arm ISA extensions are treated as supersets.
       case ${tgt_isa} in
         arm64|armv8)
-          soft_enable neon
+          for ext in ${ARCH_EXT_LIST_AARCH64}; do
+            # Disable higher order extensions to simplify dependencies.
+            if [ "$disable_exts" = "yes" ]; then
+              if ! disabled $ext; then
+                RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${ext} "
+                disable_feature $ext
+              fi
+            elif disabled $ext; then
+              disable_exts="yes"
+            else
+              soft_enable $ext
+            fi
+          done
           ;;
         armv7|armv7s)
           soft_enable neon
@@ -984,7 +1041,7 @@ EOF
           fi
 
           enabled debug && add_asflags -g
-          asm_conversion_cmd="${source_path}/build/make/ads2gas.pl"
+          asm_conversion_cmd="${source_path_mk}/build/make/ads2gas.pl"
 
           case ${tgt_os} in
             win*)
@@ -1006,7 +1063,7 @@ EOF
           # respective SDKs' limitations. Fortunately, these are all 32-bit ABIs
           # and so can be selected as 'win32'.
           if [ ${tgt_os} = "win32" ]; then
-            asm_conversion_cmd="${source_path}/build/make/ads2armasm_ms.pl"
+            asm_conversion_cmd="${source_path_mk}/build/make/ads2armasm_ms.pl"
             AS_SFX=.S
             msvs_arch_dir=arm-msvs
             disable_feature multithread
@@ -1033,8 +1090,11 @@ EOF
                     enable_feature win_arm64_neon_h_workaround
               else
                 # If a probe is not possible, assume this is the pure Windows
-                # SDK and so the workaround is necessary.
-                enable_feature win_arm64_neon_h_workaround
+                # SDK and so the workaround is necessary when using Visual
+                # Studio < 2019.
+                if [ ${tgt_cc##vs} -lt 16 ]; then
+                  enable_feature win_arm64_neon_h_workaround
+                fi
               fi
             fi
           fi
@@ -1084,7 +1144,7 @@ EOF
           soft_enable unit_tests
           ;;
 
-        darwin*)
+        darwin)
           if ! enabled external_build; then
             XCRUN_FIND="xcrun --sdk iphoneos --find"
             CXX="$(${XCRUN_FIND} clang++)"
@@ -1141,7 +1201,7 @@ EOF
             fi
           fi
 
-          asm_conversion_cmd="${source_path}/build/make/ads2gas_apple.pl"
+          asm_conversion_cmd="${source_path_mk}/build/make/ads2gas_apple.pl"
           ;;
 
         linux*)
@@ -1195,25 +1255,27 @@ EOF
             check_add_asflags -mips64r6 -mabi=64 -mhard-float -mfp64
             check_add_ldflags -mips64r6 -mabi=64 -mfp64
             ;;
+          loongson3*)
+            check_cflags -march=loongson3a && soft_enable mmi \
+              || disable_feature mmi
+            check_cflags -mmsa && soft_enable msa \
+              || disable_feature msa
+            tgt_isa=loongson3a
+            ;;
         esac
 
+        if enabled mmi || enabled msa; then
+          soft_enable runtime_cpu_detect
+        fi
+
         if enabled msa; then
           # TODO(libyuv:793)
           # The new mips functions in libyuv do not build
           # with the toolchains we currently use for testing.
           soft_disable libyuv
-
-          add_cflags -mmsa
-          add_asflags -mmsa
-          add_ldflags -mmsa
         fi
       fi
 
-      if enabled mmi; then
-        tgt_isa=loongson3a
-        check_add_ldflags -march=loongson3a
-      fi
-
       check_add_cflags -march=${tgt_isa}
       check_add_asflags -march=${tgt_isa}
       check_add_asflags -KPIC
@@ -1283,10 +1345,6 @@ EOF
           enabled optimizations && disabled gprof && check_add_cflags -fomit-frame-pointer
           ;;
         vs*)
-          # When building with Microsoft Visual Studio the assembler is
-          # invoked directly. Checking at configure time is unnecessary.
-          # Skip the check by setting AS arbitrarily
-          AS=msvs
           msvs_arch_dir=x86-msvs
           case ${tgt_cc##vs} in
             14)
@@ -1410,6 +1468,15 @@ EOF
           ;;
       esac
       ;;
+    loongarch*)
+      link_with_cc=gcc
+      setup_gnu_toolchain
+
+      enabled lsx && check_inline_asm lsx '"vadd.b $vr0, $vr1, $vr1"'
+      enabled lsx && soft_enable runtime_cpu_detect
+      enabled lasx && check_inline_asm lasx '"xvadd.b $xr0, $xr1, $xr1"'
+      enabled lasx && soft_enable runtime_cpu_detect
+      ;;
     *-gcc|generic-gnu)
       link_with_cc=gcc
       enable_feature gcc
@@ -1471,7 +1538,7 @@ EOF
 
     # Try to find which inline keywords are supported
     check_cc <<EOF && INLINE="inline"
-static inline function() {}
+static inline int function(void) {}
 EOF
 
   # Almost every platform uses pthreads.
@@ -1512,6 +1579,22 @@ EOF
       ;;
   esac
 
+  # only for LOONGARCH platforms
+  case ${toolchain} in
+    loongarch*)
+      if enabled big_endian; then
+        if enabled lsx; then
+          echo "lsx optimizations are available only for little endian platforms"
+          disable_feature lsx
+        fi
+        if enabled lasx; then
+          echo "lasx optimizations are available only for little endian platforms"
+          disable_feature lasx
+        fi
+      fi
+      ;;
+  esac
+
   # glibc needs these
   if enabled linux; then
     add_cflags -D_LARGEFILE_SOURCE
diff --git a/build/make/gen_asm_deps.sh b/build/make/gen_asm_deps.sh
index 6a7bff9ebc..3bd4d125f1 100755
--- a/build/make/gen_asm_deps.sh
+++ b/build/make/gen_asm_deps.sh
@@ -42,7 +42,7 @@ done
 
 [ -n "$srcfile" ] || show_help
 sfx=${sfx:-asm}
-includes=$(LC_ALL=C egrep -i "include +\"?[a-z0-9_/]+\.${sfx}" $srcfile |
+includes=$(LC_ALL=C grep -E -i "include +\"?[a-z0-9_/]+\.${sfx}" $srcfile |
            perl -p -e "s;.*?([a-z0-9_/]+.${sfx}).*;\1;")
 #" restore editor state
 for inc in ${includes}; do
diff --git a/build/make/gen_msvs_sln.sh b/build/make/gen_msvs_sln.sh
index d1adfd749c..0b312850fe 100755
--- a/build/make/gen_msvs_sln.sh
+++ b/build/make/gen_msvs_sln.sh
@@ -25,7 +25,7 @@ files.
 Options:
     --help                      Print this message
     --out=outfile               Redirect output to a file
-    --ver=version               Version (14-16) of visual studio to generate for
+    --ver=version               Version (14-17) of visual studio to generate for
     --target=isa-os-cc          Target specifier
 EOF
     exit 1
@@ -219,6 +219,7 @@ for opt in "$@"; do
         14) vs_year=2015 ;;
         15) vs_year=2017 ;;
         16) vs_year=2019 ;;
+        17) vs_year=2022 ;;
         *) die Unrecognized Visual Studio Version in $opt ;;
       esac
     ;;
@@ -232,7 +233,7 @@ done
 outfile=${outfile:-/dev/stdout}
 mkoutfile=${mkoutfile:-/dev/stdout}
 case "${vs_ver}" in
-    1[4-6])
+    1[4-7])
       # VS has used Format Version 12.00 continuously since vs11.
       sln_vers="12.00"
       sln_vers_str="Visual Studio ${vs_year}"
diff --git a/build/make/gen_msvs_vcxproj.sh b/build/make/gen_msvs_vcxproj.sh
index bb1c31d230..1e1db05bb2 100755
--- a/build/make/gen_msvs_vcxproj.sh
+++ b/build/make/gen_msvs_vcxproj.sh
@@ -141,7 +141,17 @@ for opt in "$@"; do
     case "$opt" in
         --help|-h) show_help
         ;;
-        --target=*) target="${optval}"
+        --target=*)
+            target="${optval}"
+            platform_toolset=$(echo ${target} | awk 'BEGIN{FS="-"}{print $4}')
+            case "$platform_toolset" in
+                clangcl) platform_toolset="ClangCl"
+                ;;
+                "")
+                ;;
+                *) die Unrecognized Visual Studio Platform Toolset in $opt
+                ;;
+            esac
         ;;
         --out=*) outfile="$optval"
         ;;
@@ -157,6 +167,8 @@ for opt in "$@"; do
         ;;
         --lib) proj_kind="lib"
         ;;
+        --as=*) as="${optval}"
+        ;;
         --src-path-bare=*)
             src_path_bare=$(fix_path "$optval")
             src_path_bare=${src_path_bare%/}
@@ -168,7 +180,7 @@ for opt in "$@"; do
         --ver=*)
             vs_ver="$optval"
             case "$optval" in
-                1[4-6])
+                1[4-7])
                 ;;
                 *) die Unrecognized Visual Studio Version in $opt
                 ;;
@@ -247,16 +259,20 @@ libs=${libs// /;}
 case "$target" in
     x86_64*)
         platforms[0]="x64"
-        asm_Debug_cmdline="yasm -Xvc -g cv8 -f win64 ${yasmincs} &quot;%(FullPath)&quot;"
-        asm_Release_cmdline="yasm -Xvc -f win64 ${yasmincs} &quot;%(FullPath)&quot;"
+        asm_Debug_cmdline="${as} -Xvc -gcv8 -f win64 ${yasmincs} &quot;%(FullPath)&quot;"
+        asm_Release_cmdline="${as} -Xvc -f win64 ${yasmincs} &quot;%(FullPath)&quot;"
     ;;
     x86*)
         platforms[0]="Win32"
-        asm_Debug_cmdline="yasm -Xvc -g cv8 -f win32 ${yasmincs} &quot;%(FullPath)&quot;"
-        asm_Release_cmdline="yasm -Xvc -f win32 ${yasmincs} &quot;%(FullPath)&quot;"
+        asm_Debug_cmdline="${as} -Xvc -gcv8 -f win32 ${yasmincs} &quot;%(FullPath)&quot;"
+        asm_Release_cmdline="${as} -Xvc -f win32 ${yasmincs} &quot;%(FullPath)&quot;"
     ;;
     arm64*)
         platforms[0]="ARM64"
+        # As of Visual Studio 2022 17.5.5, clang-cl does not support ARM64EC.
+        if [ "$vs_ver" -ge 17 -a "$platform_toolset" != "ClangCl" ]; then
+            platforms[1]="ARM64EC"
+        fi
         asm_Debug_cmdline="armasm64 -nologo -oldit &quot;%(FullPath)&quot;"
         asm_Release_cmdline="armasm64 -nologo -oldit &quot;%(FullPath)&quot;"
     ;;
@@ -333,14 +349,21 @@ generate_vcxproj() {
             else
                 tag_content ConfigurationType StaticLibrary
             fi
-            if [ "$vs_ver" = "14" ]; then
-                tag_content PlatformToolset v140
-            fi
-            if [ "$vs_ver" = "15" ]; then
-                tag_content PlatformToolset v141
-            fi
-            if [ "$vs_ver" = "16" ]; then
-                tag_content PlatformToolset v142
+            if [ -n "$platform_toolset" ]; then
+                tag_content PlatformToolset "$platform_toolset"
+            else
+                if [ "$vs_ver" = "14" ]; then
+                    tag_content PlatformToolset v140
+                fi
+                if [ "$vs_ver" = "15" ]; then
+                    tag_content PlatformToolset v141
+                fi
+                if [ "$vs_ver" = "16" ]; then
+                    tag_content PlatformToolset v142
+                fi
+                if [ "$vs_ver" = "17" ]; then
+                    tag_content PlatformToolset v143
+                fi
             fi
             tag_content CharacterSet Unicode
             if [ "$config" = "Release" ]; then
diff --git a/build/make/msvs_common.sh b/build/make/msvs_common.sh
index 27ddf7fd91..3989fec0d5 100644
--- a/build/make/msvs_common.sh
+++ b/build/make/msvs_common.sh
@@ -9,7 +9,8 @@
 ##  be found in the AUTHORS file in the root of the source tree.
 ##
 
-if [ "$(uname -o 2>/dev/null)" = "Cygwin" ] \
+shell_name="$(uname -o 2>/dev/null)"
+if [[ "$shell_name" = "Cygwin" || "$shell_name" = "Msys" ]] \
    && cygpath --help >/dev/null 2>&1; then
     FIXPATH='cygpath -m'
 else
diff --git a/build/make/rtcd.pl b/build/make/rtcd.pl
index 7483200411..0b9e16738e 100755
--- a/build/make/rtcd.pl
+++ b/build/make/rtcd.pl
@@ -315,14 +315,26 @@ ()
 
 sub mips() {
   determine_indirection("c", @ALL_ARCHS);
+
+  # Assign the helper variable for each enabled extension
+  foreach my $opt (@ALL_ARCHS) {
+    my $opt_uc = uc $opt;
+    eval "\$have_${opt}=\"flags & HAS_${opt_uc}\"";
+  }
+
   common_top;
 
   print <<EOF;
 #include "vpx_config.h"
 
 #ifdef RTCD_C
+#include "vpx_ports/mips.h"
 static void setup_rtcd_internal(void)
 {
+    int flags = mips_cpu_caps();
+
+    (void)flags;
+
 EOF
 
   set_function_pointers("c", @ALL_ARCHS);
@@ -375,6 +387,37 @@ ()
   common_bottom;
 }
 
+sub loongarch() {
+  determine_indirection("c", @ALL_ARCHS);
+
+  # Assign the helper variable for each enabled extension
+  foreach my $opt (@ALL_ARCHS) {
+    my $opt_uc = uc $opt;
+    eval "\$have_${opt}=\"flags & HAS_${opt_uc}\"";
+  }
+
+  common_top;
+  print <<EOF;
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/loongarch.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = loongarch_cpu_caps();
+
+    (void)flags;
+EOF
+
+  set_function_pointers("c", @ALL_ARCHS);
+
+  print <<EOF;
+}
+#endif
+EOF
+  common_bottom;
+}
+
 sub unoptimized() {
   determine_indirection "c";
   common_top;
@@ -410,35 +453,50 @@ ()
   &require(@REQUIRES);
   x86;
 } elsif ($opts{arch} eq 'mips32' || $opts{arch} eq 'mips64') {
+  my $have_dspr2 = 0;
+  my $have_msa = 0;
+  my $have_mmi = 0;
   @ALL_ARCHS = filter("$opts{arch}");
   open CONFIG_FILE, $opts{config} or
     die "Error opening config file '$opts{config}': $!\n";
   while (<CONFIG_FILE>) {
     if (/HAVE_DSPR2=yes/) {
-      @ALL_ARCHS = filter("$opts{arch}", qw/dspr2/);
-      last;
+      $have_dspr2 = 1;
     }
     if (/HAVE_MSA=yes/) {
-      @ALL_ARCHS = filter("$opts{arch}", qw/msa/);
-      last;
+      $have_msa = 1;
     }
     if (/HAVE_MMI=yes/) {
-      @ALL_ARCHS = filter("$opts{arch}", qw/mmi/);
-      last;
+      $have_mmi = 1;
     }
   }
   close CONFIG_FILE;
+  if ($have_dspr2 == 1) {
+    @ALL_ARCHS = filter("$opts{arch}", qw/dspr2/);
+  } elsif ($have_msa == 1 && $have_mmi == 1) {
+    @ALL_ARCHS = filter("$opts{arch}", qw/mmi msa/);
+  } elsif ($have_msa == 1) {
+    @ALL_ARCHS = filter("$opts{arch}", qw/msa/);
+  } elsif ($have_mmi == 1) {
+    @ALL_ARCHS = filter("$opts{arch}", qw/mmi/);
+  } else {
+    unoptimized;
+  }
   mips;
 } elsif ($opts{arch} =~ /armv7\w?/) {
   @ALL_ARCHS = filter(qw/neon_asm neon/);
   arm;
 } elsif ($opts{arch} eq 'armv8' || $opts{arch} eq 'arm64' ) {
-  @ALL_ARCHS = filter(qw/neon/);
-  &require("neon");
+  @ALL_ARCHS = filter(qw/neon neon_dotprod neon_i8mm sve/);
+  @REQUIRES = filter(qw/neon/);
+  &require(@REQUIRES);
   arm;
 } elsif ($opts{arch} =~ /^ppc/ ) {
   @ALL_ARCHS = filter(qw/vsx/);
   ppc;
+} elsif ($opts{arch} =~ /loongarch/ ) {
+  @ALL_ARCHS = filter(qw/lsx lasx/);
+  loongarch;
 } else {
   unoptimized;
 }
diff --git a/build/make/thumb.pm b/build/make/thumb.pm
index 9c49e2d8b7..ef4b316771 100644
--- a/build/make/thumb.pm
+++ b/build/make/thumb.pm
@@ -11,11 +11,8 @@
 
 package thumb;
 
-sub FixThumbInstructions($$)
+sub FixThumbInstructions($)
 {
-    my $short_branches = $_[1];
-    my $branch_shift_offset = $short_branches ? 1 : 0;
-
     # Write additions with shifts, such as "add r10, r11, lsl #8",
     # in three operand form, "add r10, r10, r11, lsl #8".
     s/(add\s+)(r\d+),\s*(r\d+),\s*(lsl #\d+)/$1$2, $2, $3, $4/g;
diff --git a/configure b/configure
index 32272ce36f..b212e0709d 100755
--- a/configure
+++ b/configure
@@ -99,9 +99,17 @@ EOF
 # alphabetically by architecture, generic-gnu last.
 all_platforms="${all_platforms} arm64-android-gcc"
 all_platforms="${all_platforms} arm64-darwin-gcc"
+all_platforms="${all_platforms} arm64-darwin20-gcc"
+all_platforms="${all_platforms} arm64-darwin21-gcc"
+all_platforms="${all_platforms} arm64-darwin22-gcc"
+all_platforms="${all_platforms} arm64-darwin23-gcc"
 all_platforms="${all_platforms} arm64-linux-gcc"
 all_platforms="${all_platforms} arm64-win64-gcc"
 all_platforms="${all_platforms} arm64-win64-vs15"
+all_platforms="${all_platforms} arm64-win64-vs16"
+all_platforms="${all_platforms} arm64-win64-vs16-clangcl"
+all_platforms="${all_platforms} arm64-win64-vs17"
+all_platforms="${all_platforms} arm64-win64-vs17-clangcl"
 all_platforms="${all_platforms} armv7-android-gcc"   #neon Cortex-A8
 all_platforms="${all_platforms} armv7-darwin-gcc"    #neon Cortex-A8
 all_platforms="${all_platforms} armv7-linux-rvct"    #neon Cortex-A8
@@ -110,8 +118,12 @@ all_platforms="${all_platforms} armv7-none-rvct"     #neon Cortex-A8
 all_platforms="${all_platforms} armv7-win32-gcc"
 all_platforms="${all_platforms} armv7-win32-vs14"
 all_platforms="${all_platforms} armv7-win32-vs15"
+all_platforms="${all_platforms} armv7-win32-vs16"
+all_platforms="${all_platforms} armv7-win32-vs17"
 all_platforms="${all_platforms} armv7s-darwin-gcc"
 all_platforms="${all_platforms} armv8-linux-gcc"
+all_platforms="${all_platforms} loongarch32-linux-gcc"
+all_platforms="${all_platforms} loongarch64-linux-gcc"
 all_platforms="${all_platforms} mips32-linux-gcc"
 all_platforms="${all_platforms} mips64-linux-gcc"
 all_platforms="${all_platforms} ppc64le-linux-gcc"
@@ -138,6 +150,7 @@ all_platforms="${all_platforms} x86-win32-gcc"
 all_platforms="${all_platforms} x86-win32-vs14"
 all_platforms="${all_platforms} x86-win32-vs15"
 all_platforms="${all_platforms} x86-win32-vs16"
+all_platforms="${all_platforms} x86-win32-vs17"
 all_platforms="${all_platforms} x86_64-android-gcc"
 all_platforms="${all_platforms} x86_64-darwin9-gcc"
 all_platforms="${all_platforms} x86_64-darwin10-gcc"
@@ -150,6 +163,10 @@ all_platforms="${all_platforms} x86_64-darwin16-gcc"
 all_platforms="${all_platforms} x86_64-darwin17-gcc"
 all_platforms="${all_platforms} x86_64-darwin18-gcc"
 all_platforms="${all_platforms} x86_64-darwin19-gcc"
+all_platforms="${all_platforms} x86_64-darwin20-gcc"
+all_platforms="${all_platforms} x86_64-darwin21-gcc"
+all_platforms="${all_platforms} x86_64-darwin22-gcc"
+all_platforms="${all_platforms} x86_64-darwin23-gcc"
 all_platforms="${all_platforms} x86_64-iphonesimulator-gcc"
 all_platforms="${all_platforms} x86_64-linux-gcc"
 all_platforms="${all_platforms} x86_64-linux-icc"
@@ -158,6 +175,7 @@ all_platforms="${all_platforms} x86_64-win64-gcc"
 all_platforms="${all_platforms} x86_64-win64-vs14"
 all_platforms="${all_platforms} x86_64-win64-vs15"
 all_platforms="${all_platforms} x86_64-win64-vs16"
+all_platforms="${all_platforms} x86_64-win64-vs17"
 all_platforms="${all_platforms} generic-gnu"
 
 # all_targets is a list of all targets that can be configured
@@ -229,11 +247,21 @@ CODEC_FAMILIES="
 
 ARCH_LIST="
     arm
+    aarch64
     mips
     x86
     x86_64
     ppc
+    loongarch
 "
+
+ARCH_EXT_LIST_AARCH64="
+    neon
+    neon_dotprod
+    neon_i8mm
+    sve
+"
+
 ARCH_EXT_LIST_X86="
     mmx
     sse
@@ -248,11 +276,13 @@ ARCH_EXT_LIST_X86="
 
 ARCH_EXT_LIST_LOONGSON="
     mmi
+    lsx
+    lasx
 "
 
 ARCH_EXT_LIST="
-    neon
     neon_asm
+    ${ARCH_EXT_LIST_AARCH64}
 
     mips32
     dspr2
@@ -276,6 +306,7 @@ EXPERIMENT_LIST="
     emulate_hardware
     non_greedy_mv
     rate_ctrl
+    collect_component_timing
 "
 CONFIG_LIST="
     dependency_tracking
@@ -325,7 +356,6 @@ CONFIG_LIST="
     multi_res_encoding
     temporal_denoising
     vp9_temporal_denoising
-    consistent_recode
     coefficient_range_checking
     vp9_highbitdepth
     better_hw_compatibility
@@ -389,7 +419,6 @@ CMDLINE_SELECT="
     multi_res_encoding
     temporal_denoising
     vp9_temporal_denoising
-    consistent_recode
     coefficient_range_checking
     better_hw_compatibility
     vp9_highbitdepth
@@ -616,9 +645,11 @@ process_toolchain() {
     if enabled gcc; then
         enabled werror && check_add_cflags -Werror
         check_add_cflags -Wall
-        check_add_cflags -Wdeclaration-after-statement
         check_add_cflags -Wdisabled-optimization
+        check_add_cflags -Wextra-semi
+        check_add_cflags -Wextra-semi-stmt
         check_add_cflags -Wfloat-conversion
+        check_add_cflags -Wformat=2
         check_add_cflags -Wparentheses-equality
         check_add_cflags -Wpointer-arith
         check_add_cflags -Wtype-limits
@@ -627,8 +658,9 @@ process_toolchain() {
         check_add_cflags -Wimplicit-function-declaration
         check_add_cflags -Wmissing-declarations
         check_add_cflags -Wmissing-prototypes
+        check_add_cflags -Wshadow
         check_add_cflags -Wuninitialized
-        check_add_cflags -Wunreachable-code-loop-increment
+        check_add_cflags -Wunreachable-code-aggressive
         check_add_cflags -Wunused
         check_add_cflags -Wextra
         # check_add_cflags also adds to cxxflags. gtest does not do well with
@@ -639,9 +671,8 @@ process_toolchain() {
         if enabled mips || [ -z "${INLINE}" ]; then
           enabled extra_warnings || check_add_cflags -Wno-unused-function
         fi
-        # Enforce c89 for c files. Don't be too strict about it though. Allow
-        # gnu extensions like "//" for comments.
-        check_cflags -std=gnu89 && add_cflags_only -std=gnu89
+        # Enforce C99 for C files. Allow GNU extensions.
+        check_cflags -std=gnu99 && add_cflags_only -std=gnu99
         # Avoid this warning for third_party C++ sources. Some reorganization
         # would be needed to apply this only to test/*.cc.
         check_cflags -Wshorten-64-to-32 && add_cflags_only -Wshorten-64-to-32
@@ -652,11 +683,22 @@ process_toolchain() {
           check_add_cxxflags -Wno-psabi
         fi
 
-        # disable some warnings specific to libyuv.
+        # Enforce C++11 compatibility.
+        check_add_cxxflags -Wc++14-extensions
+        check_add_cxxflags -Wc++17-extensions
+        check_add_cxxflags -Wc++20-extensions
+        check_add_cxxflags -Wnon-virtual-dtor
+
+        # disable some warnings specific to libyuv / libwebm.
         check_cxxflags -Wno-missing-declarations \
           && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-missing-declarations"
         check_cxxflags -Wno-missing-prototypes \
           && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-missing-prototypes"
+        check_cxxflags -Wno-pass-failed \
+          && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-pass-failed"
+        check_cxxflags -Wno-shadow \
+          && LIBWEBM_CXXFLAGS="${LIBWEBM_CXXFLAGS} -Wno-shadow" \
+          && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-shadow"
         check_cxxflags -Wno-unused-parameter \
           && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-unused-parameter"
     fi
@@ -729,33 +771,33 @@ process_toolchain() {
             soft_enable libyuv
         ;;
         *-android-*)
-            check_add_cxxflags -std=c++11 && soft_enable webm_io
+            check_add_cxxflags -std=gnu++11 && soft_enable webm_io
             soft_enable libyuv
             # GTestLog must be modified to use Android logging utilities.
         ;;
         *-darwin-*)
-            check_add_cxxflags -std=c++11
+            check_add_cxxflags -std=gnu++11
             # iOS/ARM builds do not work with gtest. This does not match
             # x86 targets.
         ;;
         *-iphonesimulator-*)
-            check_add_cxxflags -std=c++11 && soft_enable webm_io
+            check_add_cxxflags -std=gnu++11 && soft_enable webm_io
             soft_enable libyuv
         ;;
         *-win*)
             # Some mingw toolchains don't have pthread available by default.
             # Treat these more like visual studio where threading in gtest
             # would be disabled for the same reason.
-            check_add_cxxflags -std=c++11 && soft_enable unit_tests \
+            check_add_cxxflags -std=gnu++11 && soft_enable unit_tests \
               && soft_enable webm_io
             check_cxx "$@" <<EOF && soft_enable libyuv
 int z;
 EOF
         ;;
         *)
-            enabled pthread_h && check_add_cxxflags -std=c++11 \
+            enabled pthread_h && check_add_cxxflags -std=gnu++11 \
               && soft_enable unit_tests
-            check_add_cxxflags -std=c++11 && soft_enable webm_io
+            check_add_cxxflags -std=gnu++11 && soft_enable webm_io
             check_cxx "$@" <<EOF && soft_enable libyuv
 int z;
 EOF
diff --git a/examples.mk b/examples.mk
index a28e529359..22726a3d41 100644
--- a/examples.mk
+++ b/examples.mk
@@ -57,6 +57,7 @@ LIBWEBM_PARSER_SRCS = third_party/libwebm/mkvparser/mkvparser.cc \
 # Add compile flags and include path for libwebm sources.
 ifeq ($(CONFIG_WEBM_IO),yes)
   CXXFLAGS     += -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS
+  $(BUILD_PFX)third_party/libwebm/%.cc.o: CXXFLAGS += $(LIBWEBM_CXXFLAGS)
   INC_PATH-yes += $(SRC_PATH_BARE)/third_party/libwebm
 endif
 
@@ -81,8 +82,6 @@ ifeq ($(CONFIG_LIBYUV),yes)
   $(BUILD_PFX)third_party/libyuv/%.cc.o: CXXFLAGS += ${LIBYUV_CXXFLAGS}
 endif
 ifeq ($(CONFIG_WEBM_IO),yes)
-  vpxdec.SRCS                 += $(LIBWEBM_COMMON_SRCS)
-  vpxdec.SRCS                 += $(LIBWEBM_MUXER_SRCS)
   vpxdec.SRCS                 += $(LIBWEBM_PARSER_SRCS)
   vpxdec.SRCS                 += webmdec.cc webmdec.h
 endif
@@ -376,6 +375,7 @@ $(1): $($(1:.$(VCPROJ_SFX)=).SRCS) vpx.$(VCPROJ_SFX)
             --ver=$$(CONFIG_VS_VERSION)\
             --proj-guid=$$($$(@:.$(VCPROJ_SFX)=).GUID)\
             --src-path-bare="$(SRC_PATH_BARE)" \
+            --as=$$(AS) \
             $$(if $$(CONFIG_STATIC_MSVCRT),--static-crt) \
             --out=$$@ $$(INTERNAL_CFLAGS) $$(CFLAGS) \
             $$(INTERNAL_LDFLAGS) $$(LDFLAGS) -l$$(CODEC_LIB) $$^
diff --git a/examples/postproc.c b/examples/postproc.c
index be999b429e..b53c15ea15 100644
--- a/examples/postproc.c
+++ b/examples/postproc.c
@@ -109,7 +109,7 @@ int main(int argc, char **argv) {
                                 0 };
       if (vpx_codec_control(&codec, VP8_SET_POSTPROC, &pp))
         die_codec(&codec, "Failed to turn on postproc.");
-    };
+    }
 
     // Decode the frame with 15ms deadline
     if (vpx_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 15000))
diff --git a/examples/resize_util.c b/examples/resize_util.c
index 7e529b2e20..5fb63e1660 100644
--- a/examples/resize_util.c
+++ b/examples/resize_util.c
@@ -52,6 +52,7 @@ int main(int argc, char *argv[]) {
   uint8_t *inbuf_v, *outbuf_v;
   int f, frames;
   int width, height, target_width, target_height;
+  int failed = 0;
 
   exec_name = argv[0];
 
@@ -82,6 +83,7 @@ int main(int argc, char *argv[]) {
   }
   fpout = fopen(fout, "wb");
   if (fpout == NULL) {
+    fclose(fpin);
     printf("Can't open file %s to write\n", fout);
     usage();
     return 1;
@@ -100,6 +102,11 @@ int main(int argc, char *argv[]) {
 
   inbuf = (uint8_t *)malloc(width * height * 3 / 2);
   outbuf = (uint8_t *)malloc(target_width * target_height * 3 / 2);
+  if (!(inbuf && outbuf)) {
+    printf("Failed to allocate buffers.\n");
+    failed = 1;
+    goto Error;
+  }
   inbuf_u = inbuf + width * height;
   inbuf_v = inbuf_u + width * height / 4;
   outbuf_u = outbuf + target_width * target_height;
@@ -114,10 +121,11 @@ int main(int argc, char *argv[]) {
     f++;
   }
   printf("%d frames processed\n", f);
+Error:
   fclose(fpin);
   fclose(fpout);
 
   free(inbuf);
   free(outbuf);
-  return 0;
+  return failed;
 }
diff --git a/examples/svc_encodeframe.c b/examples/svc_encodeframe.c
index a73ee8ed66..1dd731765c 100644
--- a/examples/svc_encodeframe.c
+++ b/examples/svc_encodeframe.c
@@ -21,6 +21,7 @@
 #include <stdlib.h>
 #include <string.h>
 #define VPX_DISABLE_CTRL_TYPECHECKS 1
+#include "../tools_common.h"
 #include "./vpx_config.h"
 #include "./svc_context.h"
 #include "vpx/vp8cx.h"
@@ -95,8 +96,9 @@ static const SvcInternal_t *get_const_svc_internal(const SvcContext *svc_ctx) {
   return (const SvcInternal_t *)svc_ctx->internal;
 }
 
-static int svc_log(SvcContext *svc_ctx, SVC_LOG_LEVEL level, const char *fmt,
-                   ...) {
+static VPX_TOOLS_FORMAT_PRINTF(3, 4) int svc_log(SvcContext *svc_ctx,
+                                                 SVC_LOG_LEVEL level,
+                                                 const char *fmt, ...) {
   char buf[512];
   int retval = 0;
   va_list ap;
@@ -264,7 +266,7 @@ static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) {
   if (alt_ref_enabled > REF_FRAMES - svc_ctx->spatial_layers) {
     svc_log(svc_ctx, SVC_LOG_ERROR,
             "svc: auto alt ref: Maxinum %d(REF_FRAMES - layers) layers could"
-            "enabled auto alt reference frame, but % layers are enabled\n",
+            "enabled auto alt reference frame, but %d layers are enabled\n",
             REF_FRAMES - svc_ctx->spatial_layers, alt_ref_enabled);
     res = VPX_CODEC_INVALID_PARAM;
   }
@@ -277,7 +279,7 @@ vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx, const char *options) {
   if (svc_ctx == NULL || options == NULL || si == NULL) {
     return VPX_CODEC_INVALID_PARAM;
   }
-  strncpy(si->options, options, sizeof(si->options));
+  strncpy(si->options, options, sizeof(si->options) - 1);
   si->options[sizeof(si->options) - 1] = '\0';
   return VPX_CODEC_OK;
 }
@@ -379,7 +381,7 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
                              vpx_codec_iface_t *iface,
                              vpx_codec_enc_cfg_t *enc_cfg) {
   vpx_codec_err_t res;
-  int i, sl, tl;
+  int sl, tl;
   SvcInternal_t *const si = get_svc_internal(svc_ctx);
   if (svc_ctx == NULL || codec_ctx == NULL || iface == NULL ||
       enc_cfg == NULL) {
@@ -431,7 +433,7 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
   }
   for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) {
     for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) {
-      i = sl * svc_ctx->temporal_layers + tl;
+      const int i = sl * svc_ctx->temporal_layers + tl;
       si->svc_params.max_quantizers[i] = MAX_QUANTIZER;
       si->svc_params.min_quantizers[i] = 0;
       if (enc_cfg->rc_end_usage == VPX_CBR &&
@@ -456,10 +458,11 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
     svc_ctx->temporal_layers = VPX_TS_MAX_LAYERS;
 
   if (svc_ctx->temporal_layers * svc_ctx->spatial_layers > VPX_MAX_LAYERS) {
-    svc_log(svc_ctx, SVC_LOG_ERROR,
-            "spatial layers * temporal layers exceeds the maximum number of "
-            "allowed layers of %d\n",
-            svc_ctx->spatial_layers * svc_ctx->temporal_layers, VPX_MAX_LAYERS);
+    svc_log(
+        svc_ctx, SVC_LOG_ERROR,
+        "spatial layers * temporal layers (%d) exceeds the maximum number of "
+        "allowed layers of %d\n",
+        svc_ctx->spatial_layers * svc_ctx->temporal_layers, VPX_MAX_LAYERS);
     return VPX_CODEC_INVALID_PARAM;
   }
   res = assign_layer_bitrates(svc_ctx, enc_cfg);
@@ -500,7 +503,7 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
 
   for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) {
     for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) {
-      i = sl * svc_ctx->temporal_layers + tl;
+      const int i = sl * svc_ctx->temporal_layers + tl;
       if (enc_cfg->rc_end_usage == VPX_CBR &&
           enc_cfg->g_pass == VPX_RC_ONE_PASS) {
         si->svc_params.max_quantizers[i] = enc_cfg->rc_max_quantizer;
@@ -549,11 +552,8 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
   iter = NULL;
   while ((cx_pkt = vpx_codec_get_cx_data(codec_ctx, &iter))) {
     switch (cx_pkt->kind) {
-      case VPX_CODEC_PSNR_PKT: {
-      }
-        ++si->psnr_pkt_received;
-        break;
-      default: { break; }
+      case VPX_CODEC_PSNR_PKT: ++si->psnr_pkt_received; break;
+      default: break;
     }
   }
 
diff --git a/examples/twopass_encoder.c b/examples/twopass_encoder.c
index 3d950b2c4b..07a10d9cf3 100644
--- a/examples/twopass_encoder.c
+++ b/examples/twopass_encoder.c
@@ -84,6 +84,7 @@ static int get_frame_stats(vpx_codec_ctx_t *ctx, const vpx_image_t *img,
       const uint8_t *const pkt_buf = pkt->data.twopass_stats.buf;
       const size_t pkt_size = pkt->data.twopass_stats.sz;
       stats->buf = realloc(stats->buf, stats->sz + pkt_size);
+      if (!stats->buf) die("Failed to reallocate stats buffer.");
       memcpy((uint8_t *)stats->buf + stats->sz, pkt_buf, pkt_size);
       stats->sz += pkt_size;
     }
@@ -221,7 +222,7 @@ int main(int argc, char **argv) {
     die("Invalid frame size: %dx%d", w, h);
 
   if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, w, h, 1))
-    die("Failed to allocate image", w, h);
+    die("Failed to allocate image (%dx%d)", w, h);
 
   printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface()));
 
diff --git a/examples/vp8_multi_resolution_encoder.c b/examples/vp8_multi_resolution_encoder.c
index e72f8a0197..62d96de557 100644
--- a/examples/vp8_multi_resolution_encoder.c
+++ b/examples/vp8_multi_resolution_encoder.c
@@ -352,7 +352,7 @@ int main(int argc, char **argv) {
   framerate = (int)strtol(argv[3], NULL, 0);
 
   if (width < 16 || width % 2 || height < 16 || height % 2)
-    die("Invalid resolution: %ldx%ld", width, height);
+    die("Invalid resolution: %dx%d", width, height);
 
   /* Open input video file for encoding */
   if (!(infile = fopen(argv[4], "rb")))
@@ -380,7 +380,7 @@ int main(int argc, char **argv) {
         (int)strtol(argv[2 * NUM_ENCODERS + 5 + i], NULL, 0);
     if (num_temporal_layers[i] < 1 || num_temporal_layers[i] > 3)
       die("Invalid temporal layers: %d, Must be 1, 2, or 3. \n",
-          num_temporal_layers);
+          num_temporal_layers[i]);
   }
 
   /* Open file to write out each spatially downsampled input stream. */
@@ -468,7 +468,7 @@ int main(int argc, char **argv) {
   /* Allocate image for each encoder */
   for (i = 0; i < NUM_ENCODERS; i++)
     if (!vpx_img_alloc(&raw[i], VPX_IMG_FMT_I420, cfg[i].g_w, cfg[i].g_h, 32))
-      die("Failed to allocate image", cfg[i].g_w, cfg[i].g_h);
+      die("Failed to allocate image (%dx%d)", cfg[i].g_w, cfg[i].g_h);
 
   if (raw[0].stride[VPX_PLANE_Y] == (int)raw[0].d_w)
     read_frame_p = mulres_read_frame;
diff --git a/examples/vp9_spatial_svc_encoder.c b/examples/vp9_spatial_svc_encoder.c
index b987989a86..998e4fb20d 100644
--- a/examples/vp9_spatial_svc_encoder.c
+++ b/examples/vp9_spatial_svc_encoder.c
@@ -32,6 +32,7 @@
 #include "vp9/encoder/vp9_encoder.h"
 #include "./y4minput.h"
 
+#define OUTPUT_FRAME_STATS 0
 #define OUTPUT_RC_STATS 1
 
 #define SIMULCAST_MODE 0
@@ -66,12 +67,6 @@ static const arg_def_t kf_dist_arg =
     ARG_DEF("k", "kf-dist", 1, "number of frames between keyframes");
 static const arg_def_t scale_factors_arg =
     ARG_DEF("r", "scale-factors", 1, "scale factors (lowest to highest layer)");
-static const arg_def_t passes_arg =
-    ARG_DEF("p", "passes", 1, "Number of passes (1/2)");
-static const arg_def_t pass_arg =
-    ARG_DEF(NULL, "pass", 1, "Pass to execute (1/2)");
-static const arg_def_t fpf_name_arg =
-    ARG_DEF(NULL, "fpf", 1, "First pass statistics file name");
 static const arg_def_t min_q_arg =
     ARG_DEF(NULL, "min-q", 1, "Minimum quantizer");
 static const arg_def_t max_q_arg =
@@ -125,9 +120,6 @@ static const arg_def_t *svc_args[] = { &frames_arg,
                                        &spatial_layers_arg,
                                        &kf_dist_arg,
                                        &scale_factors_arg,
-                                       &passes_arg,
-                                       &pass_arg,
-                                       &fpf_name_arg,
                                        &min_q_arg,
                                        &max_q_arg,
                                        &min_bitrate_arg,
@@ -173,8 +165,6 @@ typedef struct {
   uint32_t frames_to_skip;
   struct VpxInputContext input_ctx;
   stats_io_t rc_stats;
-  int passes;
-  int pass;
   int tune_content;
   int inter_layer_pred;
 } AppInput;
@@ -197,9 +187,6 @@ static void parse_command_line(int argc, const char **argv_,
   char **argi = NULL;
   char **argj = NULL;
   vpx_codec_err_t res;
-  int passes = 0;
-  int pass = 0;
-  const char *fpf_file_name = NULL;
   unsigned int min_bitrate = 0;
   unsigned int max_bitrate = 0;
   char string_options[1024] = { 0 };
@@ -236,6 +223,10 @@ static void parse_command_line(int argc, const char **argv_,
 
   // process command line options
   argv = argv_dup(argc - 1, argv_ + 1);
+  if (!argv) {
+    fprintf(stderr, "Error allocating argument list\n");
+    exit(EXIT_FAILURE);
+  }
   for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
     arg.argv_step = 1;
 
@@ -289,18 +280,6 @@ static void parse_command_line(int argc, const char **argv_,
               sizeof(string_options) - strlen(string_options) - 1);
       strncat(string_options, arg.val,
               sizeof(string_options) - strlen(string_options) - 1);
-    } else if (arg_match(&arg, &passes_arg, argi)) {
-      passes = arg_parse_uint(&arg);
-      if (passes < 1 || passes > 2) {
-        die("Error: Invalid number of passes (%d)\n", passes);
-      }
-    } else if (arg_match(&arg, &pass_arg, argi)) {
-      pass = arg_parse_uint(&arg);
-      if (pass < 1 || pass > 2) {
-        die("Error: Invalid pass selected (%d)\n", pass);
-      }
-    } else if (arg_match(&arg, &fpf_name_arg, argi)) {
-      fpf_file_name = arg.val;
     } else if (arg_match(&arg, &min_q_arg, argi)) {
       strncat(string_options, " min-quantizers=",
               sizeof(string_options) - strlen(string_options) - 1);
@@ -337,7 +316,6 @@ static void parse_command_line(int argc, const char **argv_,
           break;
         default:
           die("Error: Invalid bit depth selected (%d)\n", enc_cfg->g_bit_depth);
-          break;
       }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
     } else if (arg_match(&arg, &dropframe_thresh_arg, argi)) {
@@ -355,35 +333,7 @@ static void parse_command_line(int argc, const char **argv_,
   if (strlen(string_options) > 0)
     vpx_svc_set_options(svc_ctx, string_options + 1);
 
-  if (passes == 0 || passes == 1) {
-    if (pass) {
-      fprintf(stderr, "pass is ignored since there's only one pass\n");
-    }
-    enc_cfg->g_pass = VPX_RC_ONE_PASS;
-  } else {
-    if (pass == 0) {
-      die("pass must be specified when passes is 2\n");
-    }
-
-    if (fpf_file_name == NULL) {
-      die("fpf must be specified when passes is 2\n");
-    }
-
-    if (pass == 1) {
-      enc_cfg->g_pass = VPX_RC_FIRST_PASS;
-      if (!stats_open_file(&app_input->rc_stats, fpf_file_name, 0)) {
-        fatal("Failed to open statistics store");
-      }
-    } else {
-      enc_cfg->g_pass = VPX_RC_LAST_PASS;
-      if (!stats_open_file(&app_input->rc_stats, fpf_file_name, 1)) {
-        fatal("Failed to open statistics store");
-      }
-      enc_cfg->rc_twopass_stats_in = stats_get(&app_input->rc_stats);
-    }
-    app_input->passes = passes;
-    app_input->pass = pass;
-  }
+  enc_cfg->g_pass = VPX_RC_ONE_PASS;
 
   if (enc_cfg->rc_target_bitrate > 0) {
     if (min_bitrate > 0) {
@@ -411,6 +361,8 @@ static void parse_command_line(int argc, const char **argv_,
   if (app_input->input_ctx.file_type == FILE_TYPE_Y4M) {
     enc_cfg->g_w = app_input->input_ctx.width;
     enc_cfg->g_h = app_input->input_ctx.height;
+    enc_cfg->g_timebase.den = app_input->input_ctx.framerate.numerator;
+    enc_cfg->g_timebase.num = app_input->input_ctx.framerate.denominator;
   }
 
   if (enc_cfg->g_w < 16 || enc_cfg->g_w % 2 || enc_cfg->g_h < 16 ||
@@ -633,7 +585,8 @@ static void set_frame_flags_bypass_mode_ex0(
       ref_frame_config->alt_fb_idx[sl] = 0;
     } else if (tl == 1) {
       ref_frame_config->lst_fb_idx[sl] = sl;
-      ref_frame_config->gld_fb_idx[sl] = num_spatial_layers + sl - 1;
+      ref_frame_config->gld_fb_idx[sl] =
+          (sl == 0) ? 0 : num_spatial_layers + sl - 1;
       ref_frame_config->alt_fb_idx[sl] = num_spatial_layers + sl;
     }
     // Set the reference and update flags.
@@ -828,12 +781,15 @@ static void svc_output_rc_stats(
   vpx_codec_control(codec, VP9E_GET_SVC_LAYER_ID, layer_id);
   parse_superframe_index(cx_pkt->data.frame.buf, cx_pkt->data.frame.sz,
                          sizes_parsed, &count);
-  if (enc_cfg->ss_number_layers == 1) sizes[0] = cx_pkt->data.frame.sz;
-  for (sl = 0; sl < enc_cfg->ss_number_layers; ++sl) {
-    sizes[sl] = 0;
-    if (cx_pkt->data.frame.spatial_layer_encoded[sl]) {
-      sizes[sl] = sizes_parsed[num_layers_encoded];
-      num_layers_encoded++;
+  if (enc_cfg->ss_number_layers == 1) {
+    sizes[0] = cx_pkt->data.frame.sz;
+  } else {
+    for (sl = 0; sl < enc_cfg->ss_number_layers; ++sl) {
+      sizes[sl] = 0;
+      if (cx_pkt->data.frame.spatial_layer_encoded[sl]) {
+        sizes[sl] = sizes_parsed[num_layers_encoded];
+        num_layers_encoded++;
+      }
     }
   }
   for (sl = 0; sl < enc_cfg->ss_number_layers; ++sl) {
@@ -924,7 +880,9 @@ int main(int argc, const char **argv) {
   int pts = 0;            /* PTS starts at 0 */
   int frame_duration = 1; /* 1 timebase tick per frame */
   int end_of_stream = 0;
+#if OUTPUT_FRAME_STATS
   int frames_received = 0;
+#endif
 #if OUTPUT_RC_STATS
   VpxVideoWriter *outfile[VPX_SS_MAX_LAYERS] = { NULL };
   struct RateControlStats rc;
@@ -1001,13 +959,11 @@ int main(int argc, const char **argv) {
   info.time_base.numerator = enc_cfg.g_timebase.num;
   info.time_base.denominator = enc_cfg.g_timebase.den;
 
-  if (!(app_input.passes == 2 && app_input.pass == 1)) {
-    // We don't save the bitstream for the 1st pass on two pass rate control
-    writer =
-        vpx_video_writer_open(app_input.output_filename, kContainerIVF, &info);
-    if (!writer)
-      die("Failed to open %s for writing\n", app_input.output_filename);
-  }
+  writer =
+      vpx_video_writer_open(app_input.output_filename, kContainerIVF, &info);
+  if (!writer)
+    die("Failed to open %s for writing\n", app_input.output_filename);
+
 #if OUTPUT_RC_STATS
   // Write out spatial layer stream.
   // TODO(marpan/jianj): allow for writing each spatial and temporal stream.
@@ -1050,6 +1006,9 @@ int main(int argc, const char **argv) {
 
   vpx_codec_control(&encoder, VP9E_SET_TUNE_CONTENT, app_input.tune_content);
 
+  vpx_codec_control(&encoder, VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, 0);
+  vpx_codec_control(&encoder, VP9E_SET_DISABLE_LOOPFILTER, 0);
+
   svc_drop_frame.framedrop_mode = FULL_SUPERFRAME_DROP;
   for (sl = 0; sl < (unsigned int)svc_ctx.spatial_layers; ++sl)
     svc_drop_frame.framedrop_thresh[sl] = enc_cfg.rc_dropframe_thresh;
@@ -1169,14 +1128,14 @@ int main(int argc, const char **argv) {
             }
 #endif
           }
-          /*
+#if OUTPUT_FRAME_STATS
           printf("SVC frame: %d, kf: %d, size: %d, pts: %d\n", frames_received,
                  !!(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY),
                  (int)cx_pkt->data.frame.sz, (int)cx_pkt->data.frame.pts);
-          */
+          ++frames_received;
+#endif
           if (enc_cfg.ss_number_layers == 1 && enc_cfg.ts_number_layers == 1)
             si->bytes_sum[0] += (int)cx_pkt->data.frame.sz;
-          ++frames_received;
 #if CONFIG_VP9_DECODER && !SIMULCAST_MODE
           if (vpx_codec_decode(&decoder, cx_pkt->data.frame.buf,
                                (unsigned int)cx_pkt->data.frame.sz, NULL, 0))
@@ -1189,7 +1148,9 @@ int main(int argc, const char **argv) {
                       cx_pkt->data.twopass_stats.sz);
           break;
         }
-        default: { break; }
+        default: {
+          break;
+        }
       }
 
 #if CONFIG_VP9_DECODER && !SIMULCAST_MODE
@@ -1224,7 +1185,6 @@ int main(int argc, const char **argv) {
 #endif
   if (vpx_codec_destroy(&encoder))
     die_codec(&encoder, "Failed to destroy codec");
-  if (app_input.passes == 2) stats_close(&app_input.rc_stats, 1);
   if (writer) {
     vpx_video_writer_close(writer);
   }
diff --git a/examples/vpx_dec_fuzzer.cc b/examples/vpx_dec_fuzzer.cc
index 34c0e11238..5eba9d74da 100644
--- a/examples/vpx_dec_fuzzer.cc
+++ b/examples/vpx_dec_fuzzer.cc
@@ -41,7 +41,7 @@
    $make -j32
 
  * Build vp9 fuzzer
-   $ $CXX $CXXFLAGS -std=c++11 -DDECODER=vp9 \
+   $ $CXX $CXXFLAGS -std=gnu++11 -DDECODER=vp9 \
    -fsanitize=fuzzer -I../libvpx -I. -Wl,--start-group \
    ../libvpx/examples/vpx_dec_fuzzer.cc -o ./vpx_dec_fuzzer_vp9 \
    ./libvpx.a -Wl,--end-group
diff --git a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c
index 872751cefe..a80027822a 100644
--- a/examples/vpx_temporal_svc_encoder.c
+++ b/examples/vpx_temporal_svc_encoder.c
@@ -30,7 +30,7 @@
 
 #define ROI_MAP 0
 
-#define zero(Dest) memset(&(Dest), 0, sizeof(Dest));
+#define zero(Dest) memset(&(Dest), 0, sizeof(Dest))
 
 static const char *exec_name;
 
@@ -240,6 +240,38 @@ static void set_roi_map(const char *enc_name, vpx_codec_enc_cfg_t *cfg,
     }
   }
 }
+
+static void set_roi_skip_map(vpx_codec_enc_cfg_t *cfg, vpx_roi_map_t *roi,
+                             int *skip_map, int *prev_mask_map, int frame_num) {
+  const int block_size = 8;
+  unsigned int i, j;
+  roi->rows = (cfg->g_h + block_size - 1) / block_size;
+  roi->cols = (cfg->g_w + block_size - 1) / block_size;
+  zero(roi->skip);
+  zero(roi->delta_q);
+  zero(roi->delta_lf);
+  memset(roi->ref_frame, -1, sizeof(roi->ref_frame));
+  roi->ref_frame[1] = 1;
+  // Use segment 3 for skip.
+  roi->skip[3] = 1;
+  roi->roi_map =
+      (uint8_t *)calloc(roi->rows * roi->cols, sizeof(*roi->roi_map));
+  for (i = 0; i < roi->rows; ++i) {
+    for (j = 0; j < roi->cols; ++j) {
+      const int idx = i * roi->cols + j;
+      // Use segment 3 for skip.
+      // prev_mask_map keeps track of blocks that have been stably on segment 3
+      // for the past 10 frames. Only skip when the block is on segment 3 in
+      // both current map and prev_mask_map.
+      if (skip_map[idx] == 1 && prev_mask_map[idx] == 1) roi->roi_map[idx] = 3;
+      // Reset it every 10 frames so it doesn't propagate for too many frames.
+      if (frame_num % 10 == 0)
+        prev_mask_map[idx] = skip_map[idx];
+      else if (prev_mask_map[idx] == 1 && skip_map[idx] == 0)
+        prev_mask_map[idx] = 0;
+    }
+  }
+}
 #endif
 
 // Temporal scaling parameters:
@@ -574,6 +606,23 @@ static void set_temporal_layer_pattern(int layering_mode,
   }
 }
 
+#if ROI_MAP
+static void read_mask(FILE *mask_file, int *seg_map) {
+  int mask_rows, mask_cols, i, j;
+  int *map_start = seg_map;
+  fscanf(mask_file, "%d %d\n", &mask_cols, &mask_rows);
+  for (i = 0; i < mask_rows; i++) {
+    for (j = 0; j < mask_cols; j++) {
+      fscanf(mask_file, "%d ", &seg_map[j]);
+      // reverse the bit
+      seg_map[j] = 1 - seg_map[j];
+    }
+    seg_map += mask_cols;
+  }
+  seg_map = map_start;
+}
+#endif
+
 int main(int argc, char **argv) {
   VpxVideoWriter *outfile[VPX_TS_MAX_LAYERS] = { NULL };
   vpx_codec_ctx_t codec;
@@ -613,7 +662,14 @@ int main(int argc, char **argv) {
   double sum_bitrate = 0.0;
   double sum_bitrate2 = 0.0;
   double framerate = 30.0;
-
+#if ROI_MAP
+  FILE *mask_file = NULL;
+  int block_size = 8;
+  int mask_rows = 0;
+  int mask_cols = 0;
+  int *mask_map;
+  int *prev_mask_map;
+#endif
   zero(rc.layer_target_bitrate);
   memset(&layer_id, 0, sizeof(vpx_svc_layer_id_t));
   memset(&input_ctx, 0, sizeof(input_ctx));
@@ -657,9 +713,15 @@ int main(int argc, char **argv) {
     die("Invalid layering mode (0..12) %s", argv[12]);
   }
 
+#if ROI_MAP
+  if (argc != min_args + mode_to_num_layers[layering_mode] + 1) {
+    die("Invalid number of arguments");
+  }
+#else
   if (argc != min_args + mode_to_num_layers[layering_mode]) {
     die("Invalid number of arguments");
   }
+#endif
 
   input_ctx.filename = argv[1];
   open_input_file(&input_ctx);
@@ -687,14 +749,14 @@ int main(int argc, char **argv) {
             &raw,
             bit_depth == VPX_BITS_8 ? VPX_IMG_FMT_I420 : VPX_IMG_FMT_I42016,
             width, height, 32)) {
-      die("Failed to allocate image", width, height);
+      die("Failed to allocate image (%dx%d)", width, height);
     }
   }
 #else
   // Y4M reader has its own allocation.
   if (input_ctx.file_type != FILE_TYPE_Y4M) {
     if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, width, height, 32)) {
-      die("Failed to allocate image", width, height);
+      die("Failed to allocate image (%dx%d)", width, height);
     }
   }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
@@ -817,6 +879,13 @@ int main(int argc, char **argv) {
 #endif  // CONFIG_VP9_HIGHBITDEPTH
     die("Failed to initialize encoder");
 
+#if ROI_MAP
+  mask_rows = (cfg.g_h + block_size - 1) / block_size;
+  mask_cols = (cfg.g_w + block_size - 1) / block_size;
+  mask_map = (int *)calloc(mask_rows * mask_cols, sizeof(*mask_map));
+  prev_mask_map = (int *)calloc(mask_rows * mask_cols, sizeof(*mask_map));
+#endif
+
   if (strncmp(encoder->name, "vp8", 3) == 0) {
     vpx_codec_control(&codec, VP8E_SET_CPUUSED, -speed);
     vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kVp8DenoiserOff);
@@ -827,10 +896,11 @@ int main(int argc, char **argv) {
     if (vpx_codec_control(&codec, VP8E_SET_ROI_MAP, &roi))
       die_codec(&codec, "Failed to set ROI map");
 #endif
-
   } else if (strncmp(encoder->name, "vp9", 3) == 0) {
     vpx_svc_extra_cfg_t svc_params;
     memset(&svc_params, 0, sizeof(svc_params));
+    vpx_codec_control(&codec, VP9E_SET_POSTENCODE_DROP, 0);
+    vpx_codec_control(&codec, VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, 0);
     vpx_codec_control(&codec, VP8E_SET_CPUUSED, speed);
     vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3);
     vpx_codec_control(&codec, VP9E_SET_GF_CBR_BOOST_PCT, 0);
@@ -840,12 +910,8 @@ int main(int argc, char **argv) {
     vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
     vpx_codec_control(&codec, VP9E_SET_TUNE_CONTENT, 0);
     vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, get_msb(cfg.g_threads));
-#if ROI_MAP
-    set_roi_map(encoder->name, &cfg, &roi);
-    if (vpx_codec_control(&codec, VP9E_SET_ROI_MAP, &roi))
-      die_codec(&codec, "Failed to set ROI map");
-    vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 0);
-#endif
+    vpx_codec_control(&codec, VP9E_SET_DISABLE_LOOPFILTER, 0);
+
     if (cfg.g_threads > 1)
       vpx_codec_control(&codec, VP9E_SET_ROW_MT, 1);
     else
@@ -878,6 +944,9 @@ int main(int argc, char **argv) {
     struct vpx_usec_timer timer;
     vpx_codec_iter_t iter = NULL;
     const vpx_codec_cx_pkt_t *pkt;
+#if ROI_MAP
+    char mask_file_name[255];
+#endif
     // Update the temporal layer_id. No spatial layers in this test.
     layer_id.spatial_layer_id = 0;
     layer_id.temporal_layer_id =
@@ -891,6 +960,19 @@ int main(int argc, char **argv) {
     }
     flags = layer_flags[frame_cnt % flag_periodicity];
     if (layering_mode == 0) flags = 0;
+#if ROI_MAP
+    snprintf(mask_file_name, sizeof(mask_file_name), "%s%05d.txt",
+             argv[argc - 1], frame_cnt);
+    mask_file = fopen(mask_file_name, "r");
+    if (mask_file != NULL) {
+      read_mask(mask_file, mask_map);
+      fclose(mask_file);
+      // set_roi_map(encoder->name, &cfg, &roi);
+      set_roi_skip_map(&cfg, &roi, mask_map, prev_mask_map, frame_cnt);
+      if (vpx_codec_control(&codec, VP9E_SET_ROI_MAP, &roi))
+        die_codec(&codec, "Failed to set ROI map");
+    }
+#endif
     frame_avail = read_frame(&input_ctx, &raw);
     if (frame_avail) ++rc.layer_input_frames[layer_id.temporal_layer_id];
     vpx_usec_timer_start(&timer);
@@ -928,6 +1010,7 @@ int main(int argc, char **argv) {
           // Update for short-time encoding bitrate states, for moving window
           // of size rc->window, shifted by rc->window / 2.
           // Ignore first window segment, due to key frame.
+          if (rc.window_size == 0) rc.window_size = 15;
           if (frame_cnt > rc.window_size) {
             sum_bitrate += 0.001 * 8.0 * pkt->data.frame.sz * framerate;
             if (frame_cnt % rc.window_size == 0) {
@@ -959,6 +1042,10 @@ int main(int argc, char **argv) {
     ++frame_cnt;
     pts += frame_duration;
   }
+#if ROI_MAP
+  free(mask_map);
+  free(prev_mask_map);
+#endif
   close_input_file(&input_ctx);
   printout_rate_control_summary(&rc, &cfg, frame_cnt);
   printf("\n");
diff --git a/libs.doxy_template b/libs.doxy_template
index 1eacc8fe2d..1ee442af3e 100644
--- a/libs.doxy_template
+++ b/libs.doxy_template
@@ -654,12 +654,6 @@ VERBATIM_HEADERS       = YES
 
 ALPHABETICAL_INDEX     = NO
 
-# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
-# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
-# in which this list will be split (can be a number in the range [1..20])
-
-COLS_IN_ALPHA_INDEX    = 5
-
 # In case all classes in a project start with a common prefix, all
 # classes will be put under the same header in the alphabetical index.
 # The IGNORE_PREFIX tag can be used to specify one or more prefixes that
@@ -1099,32 +1093,10 @@ ALLEXTERNALS           = NO
 
 EXTERNAL_GROUPS        = YES
 
-# The PERL_PATH should be the absolute path and name of the perl script
-# interpreter (i.e. the result of `which perl').
-
-PERL_PATH              = /usr/bin/perl
-
 #---------------------------------------------------------------------------
 # Configuration options related to the dot tool
 #---------------------------------------------------------------------------
 
-# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
-# generate a inheritance diagram (in HTML, RTF and la_te_x) for classes with base
-# or super classes. Setting the tag to NO turns the diagrams off. Note that
-# this option is superseded by the HAVE_DOT option below. This is only a
-# fallback. It is recommended to install and use dot, since it yields more
-# powerful graphs.
-
-CLASS_DIAGRAMS         = YES
-
-# You can define message sequence charts within doxygen comments using the \msc
-# command. Doxygen will then run the mscgen tool (see http://www.mcternan.me.uk/mscgen/) to
-# produce the chart and insert it in the documentation. The MSCGEN_PATH tag allows you to
-# specify the directory where the mscgen tool resides. If left empty the tool is assumed to
-# be found in the default search path.
-
-MSCGEN_PATH            =
-
 # If set to YES, the inheritance and collaboration graphs will hide
 # inheritance and usage relations if the target is undocumented
 # or is not a class.
@@ -1138,10 +1110,14 @@ HIDE_UNDOC_RELATIONS   = YES
 
 HAVE_DOT               = NO
 
-# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
-# will generate a graph for each documented class showing the direct and
-# indirect inheritance relations. Setting this tag to YES will force the
-# the CLASS_DIAGRAMS tag to NO.
+# If the CLASS_GRAPH tag is set to YES (or GRAPH) then doxygen will generate a
+# graph for each documented class showing the direct and indirect inheritance
+# relations. In case HAVE_DOT is set as well dot will be used to draw the graph,
+# otherwise the built-in generator will be used. If the CLASS_GRAPH tag is set
+# to TEXT the direct and indirect inheritance relations will be shown as texts /
+# links.
+# Possible values are: NO, YES, TEXT and GRAPH.
+# The default value is: YES.
 
 CLASS_GRAPH            = YES
 
diff --git a/libs.mk b/libs.mk
index acddb8c637..ff1c569c3b 100644
--- a/libs.mk
+++ b/libs.mk
@@ -63,6 +63,7 @@ ifeq ($(CONFIG_VP8_ENCODER),yes)
   CODEC_SRCS-yes += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_CX_SRCS))
   CODEC_EXPORTS-yes += $(addprefix $(VP8_PREFIX),$(VP8_CX_EXPORTS))
   INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8cx.h
+  INSTALL-LIBS-yes += include/vpx/vpx_ext_ratectrl.h
   INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP8_PREFIX)/%
   CODEC_DOC_SECTIONS += vp8 vp8_encoder
 endif
@@ -87,18 +88,34 @@ ifeq ($(CONFIG_VP9_ENCODER),yes)
   CODEC_SRCS-yes += $(addprefix $(VP9_PREFIX),$(call enabled,VP9_CX_SRCS))
   CODEC_EXPORTS-yes += $(addprefix $(VP9_PREFIX),$(VP9_CX_EXPORTS))
   CODEC_SRCS-yes += $(VP9_PREFIX)vp9cx.mk vpx/vp8.h vpx/vp8cx.h
+  CODEC_SRCS-yes += vpx/vpx_ext_ratectrl.h
   INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8cx.h
+  INSTALL-LIBS-yes += include/vpx/vpx_ext_ratectrl.h
   INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP9_PREFIX)/%
-  CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h
+  CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h vpx/vpx_ext_ratectrl.h
   CODEC_DOC_SECTIONS += vp9 vp9_encoder
+endif
 
-  RC_RTC_SRCS := $(addprefix $(VP9_PREFIX),$(call enabled,VP9_CX_SRCS))
-  RC_RTC_SRCS += $(VP9_PREFIX)vp9cx.mk vpx/vp8.h vpx/vp8cx.h
+RC_RTC_SRCS := vpx/vp8.h vpx/vp8cx.h
+RC_RTC_SRCS += vpx/vpx_ext_ratectrl.h
+RC_RTC_SRCS += vpx/internal/vpx_ratectrl_rtc.h
+ifeq ($(CONFIG_VP9_ENCODER),yes)
+  VP9_PREFIX=vp9/
+  RC_RTC_SRCS += $(addprefix $(VP9_PREFIX),$(call enabled,VP9_CX_SRCS))
+  RC_RTC_SRCS += $(VP9_PREFIX)vp9cx.mk
   RC_RTC_SRCS += $(VP9_PREFIX)ratectrl_rtc.cc
   RC_RTC_SRCS += $(VP9_PREFIX)ratectrl_rtc.h
   INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(VP9_PREFIX)ratectrl_rtc.cc
   INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(VP9_PREFIX)ratectrl_rtc.h
 endif
+ifeq ($(CONFIG_VP8_ENCODER),yes)
+  VP8_PREFIX=vp8/
+  RC_RTC_SRCS += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_CX_SRCS))
+  RC_RTC_SRCS += $(VP8_PREFIX)vp8_ratectrl_rtc.cc
+  RC_RTC_SRCS += $(VP8_PREFIX)vp8_ratectrl_rtc.h
+  INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(VP8_PREFIX)vp8_ratectrl_rtc.cc
+  INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(VP8_PREFIX)vp8_ratectrl_rtc.h
+endif
 
 ifeq ($(CONFIG_VP9_DECODER),yes)
   VP9_PREFIX=vp9/
@@ -122,7 +139,7 @@ endif
 ifeq ($(CONFIG_MSVS),yes)
 CODEC_LIB=$(if $(CONFIG_STATIC_MSVCRT),vpxmt,vpxmd)
 GTEST_LIB=$(if $(CONFIG_STATIC_MSVCRT),gtestmt,gtestmd)
-RC_RTC_LIB=$(if $(CONFIG_STATIC_MSVCRT),vp9rcmt,vp9rcmd)
+RC_RTC_LIB=$(if $(CONFIG_STATIC_MSVCRT),vpxrcmt,vpxrcmd)
 # This variable uses deferred expansion intentionally, since the results of
 # $(wildcard) may change during the course of the Make.
 VS_PLATFORMS = $(foreach d,$(wildcard */Release/$(CODEC_LIB).lib),$(word 1,$(subst /, ,$(d))))
@@ -161,6 +178,7 @@ INSTALL-LIBS-yes += include/vpx/vpx_image.h
 INSTALL-LIBS-yes += include/vpx/vpx_integer.h
 INSTALL-LIBS-$(CONFIG_DECODERS) += include/vpx/vpx_decoder.h
 INSTALL-LIBS-$(CONFIG_ENCODERS) += include/vpx/vpx_encoder.h
+INSTALL-LIBS-$(CONFIG_ENCODERS) += include/vpx/vpx_tpl.h
 ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
 ifeq ($(CONFIG_MSVS),yes)
 INSTALL-LIBS-yes                  += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/$(CODEC_LIB).lib)
@@ -228,6 +246,7 @@ vpx.$(VCPROJ_SFX): $(CODEC_SRCS) vpx.def
             --ver=$(CONFIG_VS_VERSION) \
             --src-path-bare="$(SRC_PATH_BARE)" \
             --out=$@ $(CFLAGS) \
+            --as=$(AS) \
             $(filter $(SRC_PATH_BARE)/vp8/%.c, $(VCPROJ_SRCS)) \
             $(filter $(SRC_PATH_BARE)/vp8/%.h, $(VCPROJ_SRCS)) \
             $(filter $(SRC_PATH_BARE)/vp9/%.c, $(VCPROJ_SRCS)) \
@@ -244,20 +263,21 @@ PROJECTS-yes += vpx.$(VCPROJ_SFX)
 vpx.$(VCPROJ_SFX): vpx_config.asm
 vpx.$(VCPROJ_SFX): $(RTCD)
 
-vp9rc.$(VCPROJ_SFX): \
+vpxrc.$(VCPROJ_SFX): \
     VCPROJ_SRCS=$(filter-out $(addprefix %, $(ASM_INCLUDES)), $^)
 
-vp9rc.$(VCPROJ_SFX): $(RC_RTC_SRCS)
+vpxrc.$(VCPROJ_SFX): $(RC_RTC_SRCS)
 	@echo "    [CREATE] $@"
 	$(qexec)$(GEN_VCPROJ) \
             $(if $(CONFIG_SHARED),--dll,--lib) \
             --target=$(TOOLCHAIN) \
             $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
-            --name=vp9rc \
+            --name=vpxrc \
             --proj-guid=C26FF952-9494-4838-9A3F-7F3D4F613385 \
             --ver=$(CONFIG_VS_VERSION) \
             --src-path-bare="$(SRC_PATH_BARE)" \
             --out=$@ $(CFLAGS) \
+            --as=$(AS) \
             $(filter $(SRC_PATH_BARE)/vp9/%.c, $(VCPROJ_SRCS)) \
             $(filter $(SRC_PATH_BARE)/vp9/%.cc, $(VCPROJ_SRCS)) \
             $(filter $(SRC_PATH_BARE)/vp9/%.h, $(VCPROJ_SRCS)) \
@@ -269,10 +289,10 @@ vp9rc.$(VCPROJ_SFX): $(RC_RTC_SRCS)
               $(VCPROJ_SRCS)) \
             --src-path-bare="$(SRC_PATH_BARE)" \
 
-PROJECTS-yes += vp9rc.$(VCPROJ_SFX)
+PROJECTS-yes += vpxrc.$(VCPROJ_SFX)
 
-vp9rc.$(VCPROJ_SFX): vpx_config.asm
-vp9rc.$(VCPROJ_SFX): $(RTCD)
+vpxrc.$(VCPROJ_SFX): vpx_config.asm
+vpxrc.$(VCPROJ_SFX): $(RTCD)
 
 endif # ifeq ($(CONFIG_MSVS),yes)
 else # ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
@@ -281,9 +301,21 @@ OBJS-yes += $(LIBVPX_OBJS)
 LIBS-$(if yes,$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a
 $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS)
 
-SO_VERSION_MAJOR := 6
-SO_VERSION_MINOR := 3
-SO_VERSION_PATCH := 0
+# Updating version info.
+# https://www.gnu.org/software/libtool/manual/libtool.html#Updating-version-info
+# For libtool: c=<current>, a=<age>, r=<revision>
+# libtool generates .so file as .so.[c-a].a.r, while -version-info c:r:a is
+# passed to libtool.
+#
+# libvpx library file is generated as libvpx.so.<MAJOR>.<MINOR>.<PATCH>
+# MAJOR = c-a, MINOR = a, PATCH = r
+#
+# To determine SO_VERSION_{MAJOR,MINOR,PATCH}, calculate c,a,r with current
+# SO_VERSION_* then follow the rules in the link to detemine the new version
+# (c1, a1, r1) and set MAJOR to [c1-a1], MINOR to a1 and PATCH to r1
+SO_VERSION_MAJOR := 8
+SO_VERSION_MINOR := 0
+SO_VERSION_PATCH := 1
 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS))
 LIBVPX_SO               := libvpx.$(SO_VERSION_MAJOR).dylib
 SHARED_LIB_SUF          := .dylib
@@ -380,12 +412,11 @@ INSTALL-LIBS-yes += $(LIBSUBDIR)/pkgconfig/vpx.pc
 INSTALL_MAPS += $(LIBSUBDIR)/pkgconfig/%.pc %.pc
 CLEAN-OBJS += vpx.pc
 
-ifeq ($(CONFIG_VP9_ENCODER),yes)
-  RC_RTC_OBJS=$(call objs,$(RC_RTC_SRCS))
+ifeq ($(CONFIG_ENCODERS),yes)
   RC_RTC_OBJS=$(call objs,$(RC_RTC_SRCS))
   OBJS-yes += $(RC_RTC_OBJS)
-  LIBS-yes += $(BUILD_PFX)libvp9rc.a $(BUILD_PFX)libvp9rc_g.a
-  $(BUILD_PFX)libvp9rc_g.a: $(RC_RTC_OBJS)
+  LIBS-yes += $(BUILD_PFX)libvpxrc.a $(BUILD_PFX)libvpxrc_g.a
+  $(BUILD_PFX)libvpxrc_g.a: $(RC_RTC_OBJS)
 endif
 
 ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_RATE_CTRL),yesyes)
@@ -416,13 +447,13 @@ ifeq ($(VPX_ARCH_X86)$(VPX_ARCH_X86_64),yes)
 # YASM
 $(BUILD_PFX)vpx_config.asm: $(BUILD_PFX)vpx_config.h
 	@echo "    [CREATE] $@"
-	@egrep "#define [A-Z0-9_]+ [01]" $< \
+	@LC_ALL=C grep -E "#define [A-Z0-9_]+ [01]" $< \
 	    | awk '{print $$2 " equ " $$3}' > $@
 else
 ADS2GAS=$(if $(filter yes,$(CONFIG_GCC)),| $(ASM_CONVERSION))
 $(BUILD_PFX)vpx_config.asm: $(BUILD_PFX)vpx_config.h
 	@echo "    [CREATE] $@"
-	@egrep "#define [A-Z0-9_]+ [01]" $< \
+	@LC_ALL=C grep -E "#define [A-Z0-9_]+ [01]" $< \
 	    | awk '{print $$2 " EQU " $$3}' $(ADS2GAS) > $@
 	@echo "        END" $(ADS2GAS) >> $@
 CLEAN-OBJS += $(BUILD_PFX)vpx_config.asm
@@ -475,10 +506,12 @@ TEST_INTRA_PRED_SPEED_SRCS=$(call addprefix_clean,test/,\
                            $(call enabled,TEST_INTRA_PRED_SPEED_SRCS))
 TEST_INTRA_PRED_SPEED_OBJS := $(sort $(call objs,$(TEST_INTRA_PRED_SPEED_SRCS)))
 
+ifeq ($(CONFIG_ENCODERS),yes)
 RC_INTERFACE_TEST_BIN=./test_rc_interface$(EXE_SFX)
 RC_INTERFACE_TEST_SRCS=$(call addprefix_clean,test/,\
                        $(call enabled,RC_INTERFACE_TEST_SRCS))
 RC_INTERFACE_TEST_OBJS := $(sort $(call objs,$(RC_INTERFACE_TEST_SRCS)))
+endif
 
 SIMPLE_ENCODE_TEST_BIN=./test_simple_encode$(EXE_SFX)
 SIMPLE_ENCODE_TEST_SRCS=$(call addprefix_clean,test/,\
@@ -490,20 +523,21 @@ libvpx_test_srcs.txt:
 	@echo $(LIBVPX_TEST_SRCS) | xargs -n1 echo | LC_ALL=C sort -u > $@
 CLEAN-OBJS += libvpx_test_srcs.txt
 
+# Attempt to download the file using curl, retrying once if it fails for a
+# partial file (18).
 $(LIBVPX_TEST_DATA): $(SRC_PATH_BARE)/test/test-data.sha1
 	@echo "    [DOWNLOAD] $@"
-	# Attempt to download the file using curl, retrying once if it fails for a
-	# partial file (18).
 	$(qexec)( \
 	  trap 'rm -f $@' INT TERM; \
-	  curl="curl --retry 1 -L -o $@ $(call libvpx_test_data_url,$(@F))"; \
-	  $$curl; \
-	  case "$$?" in \
-	    18) $$curl -C -;; \
+	  curl="curl -S -s --retry 1 -L -o $@ $(call libvpx_test_data_url,$(@F))"; \
+	  $$curl; ret=$$?; \
+	  case "$$ret" in \
+	    18) $$curl -C - ;; \
+	    *) exit $$ret ;; \
 	  esac \
 	)
 
-testdata:: $(LIBVPX_TEST_DATA)
+testdata: $(LIBVPX_TEST_DATA)
 	$(qexec)[ -x "$$(which sha1sum)" ] && sha1sum=sha1sum;\
           [ -x "$$(which shasum)" ] && sha1sum=shasum;\
           [ -x "$$(which sha1)" ] && sha1sum=sha1;\
@@ -512,7 +546,7 @@ testdata:: $(LIBVPX_TEST_DATA)
             echo "Checking test data:";\
             for f in $(call enabled,LIBVPX_TEST_DATA); do\
                 grep $$f $(SRC_PATH_BARE)/test/test-data.sha1 |\
-                    (cd $(LIBVPX_TEST_DATA_PATH); $${sha1sum} -c);\
+                    (cd "$(LIBVPX_TEST_DATA_PATH)"; $${sha1sum} -c);\
             done; \
         else\
             echo "Skipping test data integrity check, sha1sum not found.";\
@@ -531,6 +565,7 @@ gtest.$(VCPROJ_SFX): $(SRC_PATH_BARE)/third_party/googletest/src/src/gtest-all.c
             --proj-guid=EC00E1EC-AF68-4D92-A255-181690D1C9B1 \
             --ver=$(CONFIG_VS_VERSION) \
             --src-path-bare="$(SRC_PATH_BARE)" \
+            --as=$(AS) \
             -D_VARIADIC_MAX=10 \
             --out=gtest.$(VCPROJ_SFX) $(SRC_PATH_BARE)/third_party/googletest/src/src/gtest-all.cc \
             -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" -I"$(SRC_PATH_BARE)/third_party/googletest/src"
@@ -547,6 +582,7 @@ test_libvpx.$(VCPROJ_SFX): $(LIBVPX_TEST_SRCS) vpx.$(VCPROJ_SFX) gtest.$(VCPROJ_
             --proj-guid=CD837F5F-52D8-4314-A370-895D614166A7 \
             --ver=$(CONFIG_VS_VERSION) \
             --src-path-bare="$(SRC_PATH_BARE)" \
+            --as=$(AS) \
             $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
             --out=$@ $(INTERNAL_CFLAGS) $(CFLAGS) \
             -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \
@@ -569,16 +605,18 @@ test_intra_pred_speed.$(VCPROJ_SFX): $(TEST_INTRA_PRED_SPEED_SRCS) vpx.$(VCPROJ_
             --proj-guid=CD837F5F-52D8-4314-A370-895D614166A7 \
             --ver=$(CONFIG_VS_VERSION) \
             --src-path-bare="$(SRC_PATH_BARE)" \
+            --as=$(AS) \
             $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
             --out=$@ $(INTERNAL_CFLAGS) $(CFLAGS) \
             -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \
             -L. -l$(CODEC_LIB) -l$(GTEST_LIB) $^
 endif  # TEST_INTRA_PRED_SPEED
 
+ifeq ($(CONFIG_ENCODERS),yes)
 ifneq ($(strip $(RC_INTERFACE_TEST_OBJS)),)
 PROJECTS-$(CONFIG_MSVS) += test_rc_interface.$(VCPROJ_SFX)
 test_rc_interface.$(VCPROJ_SFX): $(RC_INTERFACE_TEST_SRCS) vpx.$(VCPROJ_SFX) \
-	vp9rc.$(VCPROJ_SFX) gtest.$(VCPROJ_SFX)
+	vpxrc.$(VCPROJ_SFX) gtest.$(VCPROJ_SFX)
 	@echo "    [CREATE] $@"
 	$(qexec)$(GEN_VCPROJ) \
             --exe \
@@ -587,13 +625,15 @@ test_rc_interface.$(VCPROJ_SFX): $(RC_INTERFACE_TEST_SRCS) vpx.$(VCPROJ_SFX) \
             -D_VARIADIC_MAX=10 \
             --proj-guid=30458F88-1BC6-4689-B41C-50F3737AAB27 \
             --ver=$(CONFIG_VS_VERSION) \
+            --as=$(AS) \
             --src-path-bare="$(SRC_PATH_BARE)" \
             $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
             --out=$@ $(INTERNAL_CFLAGS) $(CFLAGS) \
             -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \
             -L. -l$(CODEC_LIB) -l$(RC_RTC_LIB) -l$(GTEST_LIB) $^
 endif  # RC_INTERFACE_TEST
-endif
+endif  # CONFIG_ENCODERS
+endif  # CONFIG_MSVS
 else
 
 include $(SRC_PATH_BARE)/third_party/googletest/gtest.mk
@@ -634,17 +674,19 @@ $(eval $(call linkerxx_template,$(TEST_INTRA_PRED_SPEED_BIN), \
               -L. -lvpx -lgtest $(extralibs) -lm))
 endif  # TEST_INTRA_PRED_SPEED
 
+ifeq ($(CONFIG_ENCODERS),yes)
 ifneq ($(strip $(RC_INTERFACE_TEST_OBJS)),)
 $(RC_INTERFACE_TEST_OBJS) $(RC_INTERFACE_TEST_OBJS:.o=.d): \
   CXXFLAGS += $(GTEST_INCLUDES)
 OBJS-yes += $(RC_INTERFACE_TEST_OBJS)
 BINS-yes += $(RC_INTERFACE_TEST_BIN)
 
-$(RC_INTERFACE_TEST_BIN): $(TEST_LIBS) libvp9rc.a
+$(RC_INTERFACE_TEST_BIN): $(TEST_LIBS) libvpxrc.a
 $(eval $(call linkerxx_template,$(RC_INTERFACE_TEST_BIN), \
               $(RC_INTERFACE_TEST_OBJS) \
-              -L. -lvpx -lgtest -lvp9rc $(extralibs) -lm))
+              -L. -lvpx -lgtest -lvpxrc $(extralibs) -lm))
 endif  # RC_INTERFACE_TEST
+endif  # CONFIG_ENCODERS
 
 ifneq ($(strip $(SIMPLE_ENCODE_TEST_OBJS)),)
 $(SIMPLE_ENCODE_TEST_OBJS) $(SIMPLE_ENCODE_TEST_OBJS:.o=.d): \
@@ -658,7 +700,7 @@ $(eval $(call linkerxx_template,$(SIMPLE_ENCODE_TEST_BIN), \
               -L. -lsimple_encode -lvpx -lgtest $(extralibs) -lm))
 endif  # SIMPLE_ENCODE_TEST
 
-endif  # CONFIG_UNIT_TESTS
+endif  # CONFIG_EXTERNAL_BUILD
 
 # Install test sources only if codec source is included
 INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(patsubst $(SRC_PATH_BARE)/%,%,\
@@ -668,22 +710,22 @@ INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(TEST_INTRA_PRED_SPEED_SRCS)
 INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(RC_INTERFACE_TEST_SRCS)
 
 define test_shard_template
-test:: test_shard.$(1)
-test-no-data-check:: test_shard_ndc.$(1)
+test: test_shard.$(1)
+test-no-data-check: test_shard_ndc.$(1)
 test_shard.$(1) test_shard_ndc.$(1): $(LIBVPX_TEST_BIN)
 	@set -e; \
 	 export GTEST_SHARD_INDEX=$(1); \
 	 export GTEST_TOTAL_SHARDS=$(2); \
 	 $(LIBVPX_TEST_BIN)
 test_shard.$(1): testdata
-.PHONY: test_shard.$(1)
+.PHONY: test_shard.$(1) test_shard_ndc.$(1)
 endef
 
 NUM_SHARDS := 10
 SHARDS := 0 1 2 3 4 5 6 7 8 9
 $(foreach s,$(SHARDS),$(eval $(call test_shard_template,$(s),$(NUM_SHARDS))))
 
-endif
+endif  # CONFIG_UNIT_TESTS
 
 ##
 ## documentation directives
@@ -723,10 +765,10 @@ TEST_BIN_PATH := $(addsuffix /$(TGT_OS:win64=x64)/Release, $(TEST_BIN_PATH))
 endif
 utiltest utiltest-no-data-check:
 	$(qexec)$(SRC_PATH_BARE)/test/vpxdec.sh \
-		--test-data-path $(LIBVPX_TEST_DATA_PATH) \
+		--test-data-path "$(LIBVPX_TEST_DATA_PATH)" \
 		--bin-path $(TEST_BIN_PATH)
 	$(qexec)$(SRC_PATH_BARE)/test/vpxenc.sh \
-		--test-data-path $(LIBVPX_TEST_DATA_PATH) \
+		--test-data-path "$(LIBVPX_TEST_DATA_PATH)" \
 		--bin-path $(TEST_BIN_PATH)
 utiltest: testdata
 else
@@ -750,7 +792,7 @@ EXAMPLES_BIN_PATH := $(TGT_OS:win64=x64)/Release
 endif
 exampletest exampletest-no-data-check: examples
 	$(qexec)$(SRC_PATH_BARE)/test/examples.sh \
-		--test-data-path $(LIBVPX_TEST_DATA_PATH) \
+		--test-data-path "$(LIBVPX_TEST_DATA_PATH)" \
 		--bin-path $(EXAMPLES_BIN_PATH)
 exampletest: testdata
 else
diff --git a/md5_utils.c b/md5_utils.c
index c4106525f2..abd8d43c39 100644
--- a/md5_utils.c
+++ b/md5_utils.c
@@ -151,8 +151,8 @@ void MD5Final(md5byte digest[16], struct MD5Context *ctx) {
  * reflect the addition of 16 longwords of new data.  MD5Update blocks
  * the data and converts bytes into longwords for this routine.
  */
-VPX_NO_UNSIGNED_OVERFLOW_CHECK void MD5Transform(UWORD32 buf[4],
-                                                 UWORD32 const in[16]) {
+VPX_NO_UNSIGNED_OVERFLOW_CHECK VPX_NO_UNSIGNED_SHIFT_CHECK void MD5Transform(
+    UWORD32 buf[4], UWORD32 const in[16]) {
   UWORD32 a, b, c, d;
 
   a = buf[0];
diff --git a/rate_hist.c b/rate_hist.c
index 6cf8ce7bb0..947950d481 100644
--- a/rate_hist.c
+++ b/rate_hist.c
@@ -193,40 +193,42 @@ static int merge_hist_buckets(struct hist_bucket *bucket, int max_buckets,
 
 static void show_histogram(const struct hist_bucket *bucket, int buckets,
                            int total, int scale) {
-  const char *pat1, *pat2;
+  int width1, width2;
   int i;
 
+  if (!buckets) return;
   assert(bucket != NULL);
+  assert(buckets > 0);
 
   switch ((int)(log(bucket[buckets - 1].high) / log(10)) + 1) {
     case 1:
     case 2:
-      pat1 = "%4d %2s: ";
-      pat2 = "%4d-%2d: ";
+      width1 = 4;
+      width2 = 2;
       break;
     case 3:
-      pat1 = "%5d %3s: ";
-      pat2 = "%5d-%3d: ";
+      width1 = 5;
+      width2 = 3;
       break;
     case 4:
-      pat1 = "%6d %4s: ";
-      pat2 = "%6d-%4d: ";
+      width1 = 6;
+      width2 = 4;
       break;
     case 5:
-      pat1 = "%7d %5s: ";
-      pat2 = "%7d-%5d: ";
+      width1 = 7;
+      width2 = 5;
       break;
     case 6:
-      pat1 = "%8d %6s: ";
-      pat2 = "%8d-%6d: ";
+      width1 = 8;
+      width2 = 6;
       break;
     case 7:
-      pat1 = "%9d %7s: ";
-      pat2 = "%9d-%7d: ";
+      width1 = 9;
+      width2 = 7;
       break;
     default:
-      pat1 = "%12d %10s: ";
-      pat2 = "%12d-%10d: ";
+      width1 = 12;
+      width2 = 10;
       break;
   }
 
@@ -241,9 +243,10 @@ static void show_histogram(const struct hist_bucket *bucket, int buckets,
     assert(len <= HIST_BAR_MAX);
 
     if (bucket[i].low == bucket[i].high)
-      fprintf(stderr, pat1, bucket[i].low, "");
+      fprintf(stderr, "%*d %*s: ", width1, bucket[i].low, width2, "");
     else
-      fprintf(stderr, pat2, bucket[i].low, bucket[i].high);
+      fprintf(stderr, "%*d-%*d: ", width1, bucket[i].low, width2,
+              bucket[i].high);
 
     for (j = 0; j < HIST_BAR_MAX; j++) fprintf(stderr, j < len ? "=" : " ");
     fprintf(stderr, "\t%5d (%6.2f%%)\n", bucket[i].count, pct);
diff --git a/test/acm_random.h b/test/acm_random.h
index 3458340a12..e3520c47de 100644
--- a/test/acm_random.h
+++ b/test/acm_random.h
@@ -28,43 +28,38 @@ class ACMRandom {
   explicit ACMRandom(int seed) : random_(seed) {}
 
   void Reset(int seed) { random_.Reseed(seed); }
-  uint16_t Rand16(void) {
+  uint16_t Rand16() {
     const uint32_t value =
         random_.Generate(testing::internal::Random::kMaxRange);
     return (value >> 15) & 0xffff;
   }
 
-  int32_t Rand20Signed(void) {
+  int32_t Rand20Signed() {
     // Use 20 bits: values between 524287 and -524288.
     const uint32_t value = random_.Generate(1048576);
     return static_cast<int32_t>(value) - 524288;
   }
 
-  int16_t Rand16Signed(void) {
+  int16_t Rand16Signed() {
     // Use 16 bits: values between 32767 and -32768.
     return static_cast<int16_t>(random_.Generate(65536));
   }
 
-  int16_t Rand13Signed(void) {
-    // Use 13 bits: values between 4095 and -4096.
-    const uint32_t value = random_.Generate(8192);
-    return static_cast<int16_t>(value) - 4096;
-  }
-
-  int16_t Rand9Signed(void) {
-    // Use 9 bits: values between 255 (0x0FF) and -256 (0x100).
-    const uint32_t value = random_.Generate(512);
-    return static_cast<int16_t>(value) - 256;
+  uint16_t Rand12() {
+    const uint32_t value =
+        random_.Generate(testing::internal::Random::kMaxRange);
+    // There's a bit more entropy in the upper bits of this implementation.
+    return (value >> 19) & 0xfff;
   }
 
-  uint8_t Rand8(void) {
+  uint8_t Rand8() {
     const uint32_t value =
         random_.Generate(testing::internal::Random::kMaxRange);
     // There's a bit more entropy in the upper bits of this implementation.
     return (value >> 23) & 0xff;
   }
 
-  uint8_t Rand8Extremes(void) {
+  uint8_t Rand8Extremes() {
     // Returns a random value near 0 or near 255, to better exercise
     // saturation behavior.
     const uint8_t r = Rand8();
@@ -82,7 +77,7 @@ class ACMRandom {
 
   int operator()(int n) { return PseudoUniform(n); }
 
-  static int DeterministicSeed(void) { return 0xbaba; }
+  static int DeterministicSeed() { return 0xbaba; }
 
  private:
   testing::internal::Random random_;
diff --git a/test/active_map_refresh_test.cc b/test/active_map_refresh_test.cc
index 208d7c6d79..ad067346a7 100644
--- a/test/active_map_refresh_test.cc
+++ b/test/active_map_refresh_test.cc
@@ -62,16 +62,16 @@ class ActiveMapRefreshTest
       public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
  protected:
   ActiveMapRefreshTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~ActiveMapRefreshTest() {}
+  ~ActiveMapRefreshTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
     cpu_used_ = GET_PARAM(2);
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     ::libvpx_test::Y4mVideoSource *y4m_video =
         static_cast<libvpx_test::Y4mVideoSource *>(video);
     if (video->frame() == 0) {
@@ -80,7 +80,7 @@ class ActiveMapRefreshTest
     } else if (video->frame() >= 2 && video->img()) {
       vpx_image_t *current = video->img();
       vpx_image_t *previous = y4m_holder_->img();
-      ASSERT_TRUE(previous != NULL);
+      ASSERT_NE(previous, nullptr);
       vpx_active_map_t map = vpx_active_map_t();
       const int width = static_cast<int>(current->d_w);
       const int height = static_cast<int>(current->d_h);
diff --git a/test/active_map_test.cc b/test/active_map_test.cc
index e08d9a0367..d222c00b74 100644
--- a/test/active_map_test.cc
+++ b/test/active_map_test.cc
@@ -19,24 +19,26 @@ namespace {
 
 class ActiveMapTest
     : public ::libvpx_test::EncoderTest,
-      public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
+      public ::libvpx_test::CodecTestWith3Params<libvpx_test::TestMode, int,
+                                                 int> {
  protected:
   static const int kWidth = 208;
   static const int kHeight = 144;
 
   ActiveMapTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~ActiveMapTest() {}
+  ~ActiveMapTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
     cpu_used_ = GET_PARAM(2);
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
+      encoder->Control(VP9E_SET_AQ_MODE, GET_PARAM(3));
     } else if (video->frame() == 3) {
       vpx_active_map_t map = vpx_active_map_t();
       /* clang-format off */
@@ -62,7 +64,7 @@ class ActiveMapTest
       vpx_active_map_t map = vpx_active_map_t();
       map.cols = (kWidth + 15) / 16;
       map.rows = (kHeight + 15) / 16;
-      map.active_map = NULL;
+      map.active_map = nullptr;
       encoder->Control(VP8E_SET_ACTIVEMAP, &map);
     }
   }
@@ -87,5 +89,5 @@ TEST_P(ActiveMapTest, Test) {
 
 VP9_INSTANTIATE_TEST_SUITE(ActiveMapTest,
                            ::testing::Values(::libvpx_test::kRealTime),
-                           ::testing::Range(0, 9));
+                           ::testing::Range(5, 10), ::testing::Values(0, 3));
 }  // namespace
diff --git a/test/add_noise_test.cc b/test/add_noise_test.cc
index 1c4403ef6a..4fc4e81e63 100644
--- a/test/add_noise_test.cc
+++ b/test/add_noise_test.cc
@@ -23,7 +23,6 @@ namespace {
 
 static const int kNoiseSize = 3072;
 
-// TODO(jimbankoski): make width and height integers not unsigned.
 typedef void (*AddNoiseFunc)(uint8_t *start, const int8_t *noise,
                              int blackclamp, int whiteclamp, int width,
                              int height, int pitch);
@@ -33,8 +32,8 @@ typedef std::tuple<double, AddNoiseFunc> AddNoiseTestFPParam;
 class AddNoiseTest : public ::testing::Test,
                      public ::testing::WithParamInterface<AddNoiseTestFPParam> {
  public:
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
-  virtual ~AddNoiseTest() {}
+  void TearDown() override { libvpx_test::ClearSystemState(); }
+  ~AddNoiseTest() override = default;
 };
 
 double stddev6(char a, char b, char c, char d, char e, char f) {
@@ -53,7 +52,7 @@ TEST_P(AddNoiseTest, CheckNoiseAdded) {
   const int clamp = vpx_setup_noise(GET_PARAM(0), noise, kNoiseSize);
   uint8_t *const s =
       reinterpret_cast<uint8_t *>(vpx_calloc(image_size, sizeof(*s)));
-  ASSERT_TRUE(s != NULL);
+  ASSERT_NE(s, nullptr);
   memset(s, 99, image_size * sizeof(*s));
 
   ASM_REGISTER_STATE_CHECK(
@@ -106,8 +105,8 @@ TEST_P(AddNoiseTest, CheckCvsAssembly) {
 
   uint8_t *const s = reinterpret_cast<uint8_t *>(vpx_calloc(image_size, 1));
   uint8_t *const d = reinterpret_cast<uint8_t *>(vpx_calloc(image_size, 1));
-  ASSERT_TRUE(s != NULL);
-  ASSERT_TRUE(d != NULL);
+  ASSERT_NE(s, nullptr);
+  ASSERT_NE(d, nullptr);
 
   memset(s, 99, image_size);
   memset(d, 99, image_size);
diff --git a/test/alt_ref_aq_segment_test.cc b/test/alt_ref_aq_segment_test.cc
index 00a00e27c5..3b1a26ed16 100644
--- a/test/alt_ref_aq_segment_test.cc
+++ b/test/alt_ref_aq_segment_test.cc
@@ -20,9 +20,9 @@ class AltRefAqSegmentTest
       public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
  protected:
   AltRefAqSegmentTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~AltRefAqSegmentTest() {}
+  ~AltRefAqSegmentTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
     set_cpu_used_ = GET_PARAM(2);
@@ -30,8 +30,8 @@ class AltRefAqSegmentTest
     alt_ref_aq_mode_ = 0;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
       encoder->Control(VP9E_SET_ALT_REF_AQ, alt_ref_aq_mode_);
diff --git a/test/altref_test.cc b/test/altref_test.cc
index 69bcef774e..903230fde9 100644
--- a/test/altref_test.cc
+++ b/test/altref_test.cc
@@ -24,24 +24,24 @@ class AltRefTest : public ::libvpx_test::EncoderTest,
                    public ::libvpx_test::CodecTestWithParam<int> {
  protected:
   AltRefTest() : EncoderTest(GET_PARAM(0)), altref_count_(0) {}
-  virtual ~AltRefTest() {}
+  ~AltRefTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(libvpx_test::kTwoPassGood);
   }
 
-  virtual void BeginPassHook(unsigned int /*pass*/) { altref_count_ = 0; }
+  void BeginPassHook(unsigned int /*pass*/) override { altref_count_ = 0; }
 
-  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
-                                  libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                          libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
       encoder->Control(VP8E_SET_CPUUSED, 3);
     }
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     if (pkt->data.frame.flags & VPX_FRAME_IS_INVISIBLE) ++altref_count_;
   }
 
@@ -75,17 +75,17 @@ class AltRefForcedKeyTestLarge
   AltRefForcedKeyTestLarge()
       : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
         cpu_used_(GET_PARAM(2)), forced_kf_frame_num_(1), frame_num_(0) {}
-  virtual ~AltRefForcedKeyTestLarge() {}
+  ~AltRefForcedKeyTestLarge() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(encoding_mode_);
     cfg_.rc_end_usage = VPX_VBR;
     cfg_.g_threads = 0;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
       encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
@@ -100,7 +100,7 @@ class AltRefForcedKeyTestLarge
         (video->frame() == forced_kf_frame_num_) ? VPX_EFLAG_FORCE_KF : 0;
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     if (frame_num_ == forced_kf_frame_num_) {
       ASSERT_TRUE(!!(pkt->data.frame.flags & VPX_FRAME_IS_KEY))
           << "Frame #" << frame_num_ << " isn't a keyframe!";
diff --git a/test/android/Android.mk b/test/android/Android.mk
index 7318de2fc4..9a7533ebba 100644
--- a/test/android/Android.mk
+++ b/test/android/Android.mk
@@ -10,6 +10,9 @@
 # The test app itself runs on the command line through adb shell
 # The paths are really messed up as the libvpx make file
 # expects to be made from a parent directory.
+
+# Ignore this file during non-NDK builds.
+ifdef NDK_ROOT
 CUR_WD := $(call my-dir)
 BINDINGS_DIR := $(CUR_WD)/../../..
 LOCAL_PATH := $(CUR_WD)/../../..
@@ -34,6 +37,9 @@ LOCAL_C_INCLUDES := $(LOCAL_PATH)/third_party/googletest/src/
 LOCAL_C_INCLUDES += $(LOCAL_PATH)/third_party/googletest/src/include/
 LOCAL_EXPORT_C_INCLUDES := $(LOCAL_PATH)/third_party/googletest/src/include/
 LOCAL_SRC_FILES := ./third_party/googletest/src/src/gtest-all.cc
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-BSD
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_NOTICE_FILE := $(LOCAL_PATH)/../../LICENSE $(LOCAL_PATH)/../../PATENTS
 include $(BUILD_STATIC_LIBRARY)
 
 #libvpx_test
@@ -48,6 +54,9 @@ else
   LOCAL_STATIC_LIBRARIES += vpx
 endif
 
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-BSD
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_NOTICE_FILE := $(LOCAL_PATH)/../../LICENSE $(LOCAL_PATH)/../../PATENTS
 include $(LOCAL_PATH)/test/test.mk
 LOCAL_C_INCLUDES := $(BINDINGS_DIR)
 FILTERED_SRC := $(sort $(filter %.cc %.c, $(LIBVPX_TEST_SRCS-yes)))
@@ -55,3 +64,4 @@ LOCAL_SRC_FILES := $(addprefix ./test/, $(FILTERED_SRC))
 # some test files depend on *_rtcd.h, ensure they're generated first.
 $(eval $(call rtcd_dep_template))
 include $(BUILD_EXECUTABLE)
+endif  # NDK_ROOT
diff --git a/test/android/README b/test/android/README
index f67fea50c7..0cd30779d4 100644
--- a/test/android/README
+++ b/test/android/README
@@ -16,7 +16,7 @@ Note: Both adb and ndk-build are available at:
 
 3) Run get_files.py to download the test files:
 python get_files.py -i /path/to/test-data.sha1 -o /path/to/put/files \
-  -u http://downloads.webmproject.org/test_data/libvpx
+  -u https://storage.googleapis.com/downloads.webmproject.org/test_data/libvpx
 
 4) Transfer files to device using adb. Ensure you have proper permissions for
 the target
diff --git a/test/aq_segment_test.cc b/test/aq_segment_test.cc
index 2cbc991d0c..955e1dafc0 100644
--- a/test/aq_segment_test.cc
+++ b/test/aq_segment_test.cc
@@ -20,17 +20,17 @@ class AqSegmentTest
       public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
  protected:
   AqSegmentTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~AqSegmentTest() {}
+  ~AqSegmentTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
     set_cpu_used_ = GET_PARAM(2);
     aq_mode_ = 0;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
       encoder->Control(VP9E_SET_AQ_MODE, aq_mode_);
diff --git a/test/avg_test.cc b/test/avg_test.cc
index 601fb19012..ede9c0ba8c 100644
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -35,12 +35,12 @@ template <typename Pixel>
 class AverageTestBase : public ::testing::Test {
  public:
   AverageTestBase(int width, int height)
-      : width_(width), height_(height), source_data_(NULL), source_stride_(0),
-        bit_depth_(8) {}
+      : width_(width), height_(height), source_data_(nullptr),
+        source_stride_(0), bit_depth_(8) {}
 
-  virtual void TearDown() {
+  void TearDown() override {
     vpx_free(source_data_);
-    source_data_ = NULL;
+    source_data_ = nullptr;
     libvpx_test::ClearSystemState();
   }
 
@@ -49,10 +49,10 @@ class AverageTestBase : public ::testing::Test {
   static const int kDataAlignment = 16;
   static const int kDataBlockSize = 64 * 128;
 
-  virtual void SetUp() {
+  void SetUp() override {
     source_data_ = reinterpret_cast<Pixel *>(
         vpx_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0])));
-    ASSERT_TRUE(source_data_ != NULL);
+    ASSERT_NE(source_data_, nullptr);
     source_stride_ = (width_ + 31) & ~31;
     bit_depth_ = 8;
     rnd_.Reset(ACMRandom::DeterministicSeed());
@@ -152,6 +152,7 @@ class AverageTestHBD : public AverageTestBase<uint16_t>,
 };
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
+#if HAVE_NEON || HAVE_SSE2 || HAVE_MSA
 typedef void (*IntProRowFunc)(int16_t hbuf[16], uint8_t const *ref,
                               const int ref_stride, const int height);
 
@@ -161,16 +162,17 @@ class IntProRowTest : public AverageTestBase<uint8_t>,
                       public ::testing::WithParamInterface<IntProRowParam> {
  public:
   IntProRowTest()
-      : AverageTestBase(16, GET_PARAM(0)), hbuf_asm_(NULL), hbuf_c_(NULL) {
+      : AverageTestBase(16, GET_PARAM(0)), hbuf_asm_(nullptr),
+        hbuf_c_(nullptr) {
     asm_func_ = GET_PARAM(1);
     c_func_ = GET_PARAM(2);
   }
 
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     source_data_ = reinterpret_cast<uint8_t *>(
         vpx_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0])));
-    ASSERT_TRUE(source_data_ != NULL);
+    ASSERT_NE(source_data_, nullptr);
 
     hbuf_asm_ = reinterpret_cast<int16_t *>(
         vpx_memalign(kDataAlignment, sizeof(*hbuf_asm_) * 16));
@@ -178,18 +180,19 @@ class IntProRowTest : public AverageTestBase<uint8_t>,
         vpx_memalign(kDataAlignment, sizeof(*hbuf_c_) * 16));
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     vpx_free(source_data_);
-    source_data_ = NULL;
+    source_data_ = nullptr;
     vpx_free(hbuf_c_);
-    hbuf_c_ = NULL;
+    hbuf_c_ = nullptr;
     vpx_free(hbuf_asm_);
-    hbuf_asm_ = NULL;
+    hbuf_asm_ = nullptr;
   }
 
   void RunComparison() {
-    ASM_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, 0, height_));
-    ASM_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, 0, height_));
+    ASM_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, width_, height_));
+    ASM_REGISTER_STATE_CHECK(
+        asm_func_(hbuf_asm_, source_data_, width_, height_));
     EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
         << "Output mismatch";
   }
@@ -200,6 +203,7 @@ class IntProRowTest : public AverageTestBase<uint8_t>,
   int16_t *hbuf_asm_;
   int16_t *hbuf_c_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(IntProRowTest);
 
 typedef int16_t (*IntProColFunc)(uint8_t const *ref, const int width);
 
@@ -226,6 +230,8 @@ class IntProColTest : public AverageTestBase<uint8_t>,
   int16_t sum_asm_;
   int16_t sum_c_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(IntProColTest);
+#endif  // HAVE_NEON || HAVE_SSE2 || HAVE_MSA
 
 typedef int (*SatdFunc)(const tran_low_t *coeffs, int length);
 typedef std::tuple<int, SatdFunc> SatdTestParam;
@@ -233,16 +239,16 @@ typedef std::tuple<int, SatdFunc> SatdTestParam;
 class SatdTest : public ::testing::Test,
                  public ::testing::WithParamInterface<SatdTestParam> {
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     satd_size_ = GET_PARAM(0);
     satd_func_ = GET_PARAM(1);
     rnd_.Reset(ACMRandom::DeterministicSeed());
     src_ = reinterpret_cast<tran_low_t *>(
         vpx_memalign(16, sizeof(*src_) * satd_size_));
-    ASSERT_TRUE(src_ != NULL);
+    ASSERT_NE(src_, nullptr);
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     libvpx_test::ClearSystemState();
     vpx_free(src_);
   }
@@ -271,7 +277,7 @@ class SatdTest : public ::testing::Test,
 
 class SatdLowbdTest : public SatdTest {
  protected:
-  virtual void FillRandom() {
+  void FillRandom() override {
     for (int i = 0; i < satd_size_; ++i) {
       const int16_t tmp = rnd_.Rand16Signed();
       src_[i] = (tran_low_t)tmp;
@@ -287,7 +293,7 @@ class BlockErrorTestFP
     : public ::testing::Test,
       public ::testing::WithParamInterface<BlockErrorTestFPParam> {
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     txfm_size_ = GET_PARAM(0);
     block_error_func_ = GET_PARAM(1);
     rnd_.Reset(ACMRandom::DeterministicSeed());
@@ -295,11 +301,11 @@ class BlockErrorTestFP
         vpx_memalign(16, sizeof(*coeff_) * txfm_size_));
     dqcoeff_ = reinterpret_cast<tran_low_t *>(
         vpx_memalign(16, sizeof(*dqcoeff_) * txfm_size_));
-    ASSERT_TRUE(coeff_ != NULL);
-    ASSERT_TRUE(dqcoeff_ != NULL);
+    ASSERT_NE(coeff_, nullptr);
+    ASSERT_NE(dqcoeff_, nullptr);
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     libvpx_test::ClearSystemState();
     vpx_free(coeff_);
     vpx_free(dqcoeff_);
@@ -378,6 +384,7 @@ TEST_P(AverageTestHBD, Random) {
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
+#if HAVE_NEON || HAVE_SSE2 || HAVE_MSA
 TEST_P(IntProRowTest, MinValue) {
   FillConstant(0);
   RunComparison();
@@ -407,6 +414,7 @@ TEST_P(IntProColTest, Random) {
   FillRandom();
   RunComparison();
 }
+#endif
 
 TEST_P(SatdLowbdTest, MinValue) {
   const int kMin = -32640;
@@ -456,7 +464,7 @@ TEST_P(SatdLowbdTest, DISABLED_Speed) {
 #if CONFIG_VP9_HIGHBITDEPTH
 class SatdHighbdTest : public SatdTest {
  protected:
-  virtual void FillRandom() {
+  void FillRandom() override {
     for (int i = 0; i < satd_size_; ++i) {
       src_[i] = rnd_.Rand20Signed();
     }
@@ -575,6 +583,13 @@ INSTANTIATE_TEST_SUITE_P(
                       make_tuple(16, 16, 1, 4, &vpx_highbd_avg_4x4_sse2)));
 #endif  // HAVE_SSE2
 
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, AverageTestHBD,
+    ::testing::Values(make_tuple(16, 16, 1, 8, &vpx_highbd_avg_8x8_neon),
+                      make_tuple(16, 16, 1, 4, &vpx_highbd_avg_4x4_neon)));
+#endif  // HAVE_NEON
+
 INSTANTIATE_TEST_SUITE_P(C, SatdHighbdTest,
                          ::testing::Values(make_tuple(16, &vpx_satd_c),
                                            make_tuple(64, &vpx_satd_c),
@@ -687,16 +702,21 @@ INSTANTIATE_TEST_SUITE_P(NEON, SatdLowbdTest,
                                            make_tuple(256, &vpx_satd_neon),
                                            make_tuple(1024, &vpx_satd_neon)));
 
-// TODO(jianj): Remove the highbitdepth flag once the SIMD functions are
-// in place.
-#if !CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+    NEON, SatdHighbdTest,
+    ::testing::Values(make_tuple(16, &vpx_highbd_satd_neon),
+                      make_tuple(64, &vpx_highbd_satd_neon),
+                      make_tuple(256, &vpx_highbd_satd_neon),
+                      make_tuple(1024, &vpx_highbd_satd_neon)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 INSTANTIATE_TEST_SUITE_P(
     NEON, BlockErrorTestFP,
     ::testing::Values(make_tuple(16, &vp9_block_error_fp_neon),
                       make_tuple(64, &vp9_block_error_fp_neon),
                       make_tuple(256, &vp9_block_error_fp_neon),
                       make_tuple(1024, &vp9_block_error_fp_neon)));
-#endif  // !CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_NEON
 
 #if HAVE_MSA
diff --git a/test/bench.h b/test/bench.h
index 57ca9118ba..203e4d247e 100644
--- a/test/bench.h
+++ b/test/bench.h
@@ -16,6 +16,8 @@
 
 class AbstractBench {
  public:
+  virtual ~AbstractBench() = default;
+
   void RunNTimes(int n);
   void PrintMedian(const char *title);
 
diff --git a/test/blockiness_test.cc b/test/blockiness_test.cc
index 1ad444a044..5a45bc0b7f 100644
--- a/test/blockiness_test.cc
+++ b/test/blockiness_test.cc
@@ -44,19 +44,19 @@ class BlockinessTestBase : public ::testing::Test {
 
   static void TearDownTestSuite() {
     vpx_free(source_data_);
-    source_data_ = NULL;
+    source_data_ = nullptr;
     vpx_free(reference_data_);
-    reference_data_ = NULL;
+    reference_data_ = nullptr;
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
   // Handle frames up to 640x480
   static const int kDataAlignment = 16;
   static const int kDataBufferSize = 640 * 480;
 
-  virtual void SetUp() {
+  void SetUp() override {
     source_stride_ = (width_ + 31) & ~31;
     reference_stride_ = width_ * 2;
     rnd_.Reset(ACMRandom::DeterministicSeed());
@@ -154,8 +154,8 @@ class BlockinessVP9Test
 };
 #endif  // CONFIG_VP9_ENCODER
 
-uint8_t *BlockinessTestBase::source_data_ = NULL;
-uint8_t *BlockinessTestBase::reference_data_ = NULL;
+uint8_t *BlockinessTestBase::source_data_ = nullptr;
+uint8_t *BlockinessTestBase::reference_data_ = nullptr;
 
 #if CONFIG_VP9_ENCODER
 TEST_P(BlockinessVP9Test, SourceBlockierThanReference) {
diff --git a/test/borders_test.cc b/test/borders_test.cc
index 3c1f69a923..2726bd557d 100644
--- a/test/borders_test.cc
+++ b/test/borders_test.cc
@@ -22,15 +22,15 @@ class BordersTest
       public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
  protected:
   BordersTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~BordersTest() {}
+  ~BordersTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, 1);
       encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
@@ -40,7 +40,7 @@ class BordersTest
     }
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) {
     }
   }
diff --git a/test/buffer.h b/test/buffer.h
index b003d2f0d0..023939cedf 100644
--- a/test/buffer.h
+++ b/test/buffer.h
@@ -31,7 +31,7 @@ class Buffer {
       : width_(width), height_(height), top_padding_(top_padding),
         left_padding_(left_padding), right_padding_(right_padding),
         bottom_padding_(bottom_padding), alignment_(0), padding_value_(0),
-        stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(NULL) {}
+        stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(nullptr) {}
 
   Buffer(int width, int height, int top_padding, int left_padding,
          int right_padding, int bottom_padding, unsigned int alignment)
@@ -39,19 +39,19 @@ class Buffer {
         left_padding_(left_padding), right_padding_(right_padding),
         bottom_padding_(bottom_padding), alignment_(alignment),
         padding_value_(0), stride_(0), raw_size_(0), num_elements_(0),
-        raw_buffer_(NULL) {}
+        raw_buffer_(nullptr) {}
 
   Buffer(int width, int height, int padding)
       : width_(width), height_(height), top_padding_(padding),
         left_padding_(padding), right_padding_(padding),
         bottom_padding_(padding), alignment_(0), padding_value_(0), stride_(0),
-        raw_size_(0), num_elements_(0), raw_buffer_(NULL) {}
+        raw_size_(0), num_elements_(0), raw_buffer_(nullptr) {}
 
   Buffer(int width, int height, int padding, unsigned int alignment)
       : width_(width), height_(height), top_padding_(padding),
         left_padding_(padding), right_padding_(padding),
         bottom_padding_(padding), alignment_(alignment), padding_value_(0),
-        stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(NULL) {}
+        stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(nullptr) {}
 
   ~Buffer() {
     if (alignment_) {
@@ -103,7 +103,7 @@ class Buffer {
   bool CheckValues(const Buffer<T> &a) const;
 
   bool Init() {
-    if (raw_buffer_ != NULL) return false;
+    if (raw_buffer_ != nullptr) return false;
     EXPECT_GT(width_, 0);
     EXPECT_GT(height_, 0);
     EXPECT_GE(top_padding_, 0);
@@ -126,7 +126,7 @@ class Buffer {
     } else {
       raw_buffer_ = new (std::nothrow) T[num_elements_];
     }
-    EXPECT_TRUE(raw_buffer_ != NULL);
+    EXPECT_NE(raw_buffer_, nullptr);
     SetPadding(std::numeric_limits<T>::max());
     return !::testing::Test::HasFailure();
   }
@@ -150,7 +150,7 @@ class Buffer {
 
 template <typename T>
 T *Buffer<T>::TopLeftPixel() const {
-  if (!raw_buffer_) return NULL;
+  if (!raw_buffer_) return nullptr;
   return raw_buffer_ + (top_padding_ * stride_) + left_padding_;
 }
 
diff --git a/test/byte_alignment_test.cc b/test/byte_alignment_test.cc
index 9feb7bdce7..ba6fffc524 100644
--- a/test/byte_alignment_test.cc
+++ b/test/byte_alignment_test.cc
@@ -55,23 +55,24 @@ const ByteAlignmentTestParam kBaTestParams[] = {
 class ByteAlignmentTest
     : public ::testing::TestWithParam<ByteAlignmentTestParam> {
  protected:
-  ByteAlignmentTest() : video_(NULL), decoder_(NULL), md5_file_(NULL) {}
+  ByteAlignmentTest()
+      : video_(nullptr), decoder_(nullptr), md5_file_(nullptr) {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     video_ = new libvpx_test::WebMVideoSource(kVP9TestFile);
-    ASSERT_TRUE(video_ != NULL);
+    ASSERT_NE(video_, nullptr);
     video_->Init();
     video_->Begin();
 
     const vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
     decoder_ = new libvpx_test::VP9Decoder(cfg, 0);
-    ASSERT_TRUE(decoder_ != NULL);
+    ASSERT_NE(decoder_, nullptr);
 
     OpenMd5File(kVP9Md5File);
   }
 
-  virtual void TearDown() {
-    if (md5_file_ != NULL) fclose(md5_file_);
+  void TearDown() override {
+    if (md5_file_ != nullptr) fclose(md5_file_);
 
     delete decoder_;
     delete video_;
@@ -90,7 +91,7 @@ class ByteAlignmentTest
   }
 
   vpx_codec_err_t DecodeRemainingFrames(int byte_alignment_to_check) {
-    for (; video_->cxdata() != NULL; video_->Next()) {
+    for (; video_->cxdata() != nullptr; video_->Next()) {
       const vpx_codec_err_t res =
           decoder_->DecodeFrame(video_->cxdata(), video_->frame_size());
       if (res != VPX_CODEC_OK) return res;
@@ -113,7 +114,7 @@ class ByteAlignmentTest
     const vpx_image_t *img;
 
     // Get decompressed data
-    while ((img = dec_iter.Next()) != NULL) {
+    while ((img = dec_iter.Next()) != nullptr) {
       if (byte_alignment_to_check == kLegacyByteAlignment) {
         CheckByteAlignment(img->planes[0], kLegacyYPlaneByteAlignment);
       } else {
@@ -128,12 +129,12 @@ class ByteAlignmentTest
   // TODO(fgalligan): Move the MD5 testing code into another class.
   void OpenMd5File(const std::string &md5_file_name_) {
     md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_);
-    ASSERT_TRUE(md5_file_ != NULL)
+    ASSERT_NE(md5_file_, nullptr)
         << "MD5 file open failed. Filename: " << md5_file_name_;
   }
 
   void CheckMd5(const vpx_image_t &img) {
-    ASSERT_TRUE(md5_file_ != NULL);
+    ASSERT_NE(md5_file_, nullptr);
     char expected_md5[33];
     char junk[128];
 
diff --git a/test/codec_factory.h b/test/codec_factory.h
index 77ce49de9f..c7e8f54847 100644
--- a/test/codec_factory.h
+++ b/test/codec_factory.h
@@ -40,7 +40,7 @@ class CodecFactory {
                                  const vpx_codec_flags_t flags) const = 0;
 
   virtual Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg,
-                                 unsigned long deadline,
+                                 vpx_enc_deadline_t deadline,
                                  const unsigned long init_flags,
                                  TwopassStatsStore *stats) const = 0;
 
@@ -84,27 +84,27 @@ class VP8Decoder : public Decoder {
       : Decoder(cfg, flag) {}
 
  protected:
-  virtual vpx_codec_iface_t *CodecInterface() const {
+  vpx_codec_iface_t *CodecInterface() const override {
 #if CONFIG_VP8_DECODER
     return &vpx_codec_vp8_dx_algo;
 #else
-    return NULL;
+    return nullptr;
 #endif
   }
 };
 
 class VP8Encoder : public Encoder {
  public:
-  VP8Encoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline,
+  VP8Encoder(vpx_codec_enc_cfg_t cfg, vpx_enc_deadline_t deadline,
              const unsigned long init_flags, TwopassStatsStore *stats)
       : Encoder(cfg, deadline, init_flags, stats) {}
 
  protected:
-  virtual vpx_codec_iface_t *CodecInterface() const {
+  vpx_codec_iface_t *CodecInterface() const override {
 #if CONFIG_VP8_ENCODER
     return &vpx_codec_vp8_cx_algo;
 #else
-    return NULL;
+    return nullptr;
 #endif
   }
 };
@@ -113,25 +113,24 @@ class VP8CodecFactory : public CodecFactory {
  public:
   VP8CodecFactory() : CodecFactory() {}
 
-  virtual Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg) const {
+  Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg) const override {
     return CreateDecoder(cfg, 0);
   }
 
-  virtual Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg,
-                                 const vpx_codec_flags_t flags) const {
+  Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg,
+                         const vpx_codec_flags_t flags) const override {
 #if CONFIG_VP8_DECODER
     return new VP8Decoder(cfg, flags);
 #else
     (void)cfg;
     (void)flags;
-    return NULL;
+    return nullptr;
 #endif
   }
 
-  virtual Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg,
-                                 unsigned long deadline,
-                                 const unsigned long init_flags,
-                                 TwopassStatsStore *stats) const {
+  Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg, vpx_enc_deadline_t deadline,
+                         const unsigned long init_flags,
+                         TwopassStatsStore *stats) const override {
 #if CONFIG_VP8_ENCODER
     return new VP8Encoder(cfg, deadline, init_flags, stats);
 #else
@@ -139,12 +138,12 @@ class VP8CodecFactory : public CodecFactory {
     (void)deadline;
     (void)init_flags;
     (void)stats;
-    return NULL;
+    return nullptr;
 #endif
   }
 
-  virtual vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg,
-                                               int usage) const {
+  vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg,
+                                       int usage) const override {
 #if CONFIG_VP8_ENCODER
     return vpx_codec_enc_config_default(&vpx_codec_vp8_cx_algo, cfg, usage);
 #else
@@ -180,27 +179,27 @@ class VP9Decoder : public Decoder {
       : Decoder(cfg, flag) {}
 
  protected:
-  virtual vpx_codec_iface_t *CodecInterface() const {
+  vpx_codec_iface_t *CodecInterface() const override {
 #if CONFIG_VP9_DECODER
     return &vpx_codec_vp9_dx_algo;
 #else
-    return NULL;
+    return nullptr;
 #endif
   }
 };
 
 class VP9Encoder : public Encoder {
  public:
-  VP9Encoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline,
+  VP9Encoder(vpx_codec_enc_cfg_t cfg, vpx_enc_deadline_t deadline,
              const unsigned long init_flags, TwopassStatsStore *stats)
       : Encoder(cfg, deadline, init_flags, stats) {}
 
  protected:
-  virtual vpx_codec_iface_t *CodecInterface() const {
+  vpx_codec_iface_t *CodecInterface() const override {
 #if CONFIG_VP9_ENCODER
     return &vpx_codec_vp9_cx_algo;
 #else
-    return NULL;
+    return nullptr;
 #endif
   }
 };
@@ -209,25 +208,24 @@ class VP9CodecFactory : public CodecFactory {
  public:
   VP9CodecFactory() : CodecFactory() {}
 
-  virtual Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg) const {
+  Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg) const override {
     return CreateDecoder(cfg, 0);
   }
 
-  virtual Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg,
-                                 const vpx_codec_flags_t flags) const {
+  Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg,
+                         const vpx_codec_flags_t flags) const override {
 #if CONFIG_VP9_DECODER
     return new VP9Decoder(cfg, flags);
 #else
     (void)cfg;
     (void)flags;
-    return NULL;
+    return nullptr;
 #endif
   }
 
-  virtual Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg,
-                                 unsigned long deadline,
-                                 const unsigned long init_flags,
-                                 TwopassStatsStore *stats) const {
+  Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg, vpx_enc_deadline_t deadline,
+                         const unsigned long init_flags,
+                         TwopassStatsStore *stats) const override {
 #if CONFIG_VP9_ENCODER
     return new VP9Encoder(cfg, deadline, init_flags, stats);
 #else
@@ -235,12 +233,12 @@ class VP9CodecFactory : public CodecFactory {
     (void)deadline;
     (void)init_flags;
     (void)stats;
-    return NULL;
+    return nullptr;
 #endif
   }
 
-  virtual vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg,
-                                               int usage) const {
+  vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg,
+                                       int usage) const override {
 #if CONFIG_VP9_ENCODER
     return vpx_codec_enc_config_default(&vpx_codec_vp9_cx_algo, cfg, usage);
 #else
diff --git a/test/comp_avg_pred_test.cc b/test/comp_avg_pred_test.cc
index b33bc3fba9..3234cc9a25 100644
--- a/test/comp_avg_pred_test.cc
+++ b/test/comp_avg_pred_test.cc
@@ -22,42 +22,51 @@ namespace {
 using ::libvpx_test::ACMRandom;
 using ::libvpx_test::Buffer;
 
-typedef void (*AvgPredFunc)(uint8_t *a, const uint8_t *b, int w, int h,
-                            const uint8_t *c, int c_stride);
-
-uint8_t avg_with_rounding(uint8_t a, uint8_t b) { return (a + b + 1) >> 1; }
+template <typename Pixel>
+Pixel avg_with_rounding(Pixel a, Pixel b) {
+  return (a + b + 1) >> 1;
+}
 
-void reference_pred(const Buffer<uint8_t> &pred, const Buffer<uint8_t> &ref,
-                    int width, int height, Buffer<uint8_t> *avg) {
-  ASSERT_TRUE(avg->TopLeftPixel() != NULL);
-  ASSERT_TRUE(pred.TopLeftPixel() != NULL);
-  ASSERT_TRUE(ref.TopLeftPixel() != NULL);
+template <typename Pixel>
+void reference_pred(const Buffer<Pixel> &pred, const Buffer<Pixel> &ref,
+                    int width, int height, Buffer<Pixel> *avg) {
+  ASSERT_NE(avg->TopLeftPixel(), nullptr);
+  ASSERT_NE(pred.TopLeftPixel(), nullptr);
+  ASSERT_NE(ref.TopLeftPixel(), nullptr);
 
   for (int y = 0; y < height; ++y) {
     for (int x = 0; x < width; ++x) {
       avg->TopLeftPixel()[y * avg->stride() + x] =
-          avg_with_rounding(pred.TopLeftPixel()[y * pred.stride() + x],
-                            ref.TopLeftPixel()[y * ref.stride() + x]);
+          avg_with_rounding<Pixel>(pred.TopLeftPixel()[y * pred.stride() + x],
+                                   ref.TopLeftPixel()[y * ref.stride() + x]);
     }
   }
 }
 
+using AvgPredFunc = void (*)(uint8_t *a, const uint8_t *b, int w, int h,
+                             const uint8_t *c, int c_stride);
+
+template <int bitdepth, typename Pixel>
 class AvgPredTest : public ::testing::TestWithParam<AvgPredFunc> {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     avg_pred_func_ = GetParam();
     rnd_.Reset(ACMRandom::DeterministicSeed());
   }
 
+  void TestSizeCombinations();
+  void TestCompareReferenceRandom();
+  void TestSpeed();
+
  protected:
   AvgPredFunc avg_pred_func_;
   ACMRandom rnd_;
 };
 
-TEST_P(AvgPredTest, SizeCombinations) {
+template <int bitdepth, typename Pixel>
+void AvgPredTest<bitdepth, Pixel>::TestSizeCombinations() {
   // This is called as part of the sub pixel variance. As such it must be one of
   // the variance block sizes.
-
   for (int width_pow = 2; width_pow <= 6; ++width_pow) {
     for (int height_pow = width_pow - 1; height_pow <= width_pow + 1;
          ++height_pow) {
@@ -70,23 +79,30 @@ TEST_P(AvgPredTest, SizeCombinations) {
         const int width = 1 << width_pow;
         const int height = 1 << height_pow;
         // Only the reference buffer may have a stride not equal to width.
-        Buffer<uint8_t> ref =
-            Buffer<uint8_t>(width, height, ref_padding ? 8 : 0);
+        Buffer<Pixel> ref = Buffer<Pixel>(width, height, ref_padding ? 8 : 0);
         ASSERT_TRUE(ref.Init());
-        Buffer<uint8_t> pred = Buffer<uint8_t>(width, height, 0, 16);
+        Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 32);
         ASSERT_TRUE(pred.Init());
-        Buffer<uint8_t> avg_ref = Buffer<uint8_t>(width, height, 0, 16);
+        Buffer<Pixel> avg_ref = Buffer<Pixel>(width, height, 0, 32);
         ASSERT_TRUE(avg_ref.Init());
-        Buffer<uint8_t> avg_chk = Buffer<uint8_t>(width, height, 0, 16);
+        Buffer<Pixel> avg_chk = Buffer<Pixel>(width, height, 0, 32);
         ASSERT_TRUE(avg_chk.Init());
+        const int bitdepth_mask = (1 << bitdepth) - 1;
+        for (int h = 0; h < height; ++h) {
+          for (int w = 0; w < width; ++w) {
+            ref.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask;
+          }
+        }
+        for (int h = 0; h < height; ++h) {
+          for (int w = 0; w < width; ++w) {
+            pred.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask;
+          }
+        }
 
-        ref.Set(&rnd_, &ACMRandom::Rand8);
-        pred.Set(&rnd_, &ACMRandom::Rand8);
-
-        reference_pred(pred, ref, width, height, &avg_ref);
-        ASM_REGISTER_STATE_CHECK(
-            avg_pred_func_(avg_chk.TopLeftPixel(), pred.TopLeftPixel(), width,
-                           height, ref.TopLeftPixel(), ref.stride()));
+        reference_pred<Pixel>(pred, ref, width, height, &avg_ref);
+        ASM_REGISTER_STATE_CHECK(avg_pred_func_(
+            (uint8_t *)avg_chk.TopLeftPixel(), (uint8_t *)pred.TopLeftPixel(),
+            width, height, (uint8_t *)ref.TopLeftPixel(), ref.stride()));
 
         EXPECT_TRUE(avg_chk.CheckValues(avg_ref));
         if (HasFailure()) {
@@ -99,26 +115,36 @@ TEST_P(AvgPredTest, SizeCombinations) {
   }
 }
 
-TEST_P(AvgPredTest, CompareReferenceRandom) {
+template <int bitdepth, typename Pixel>
+void AvgPredTest<bitdepth, Pixel>::TestCompareReferenceRandom() {
   const int width = 64;
   const int height = 32;
-  Buffer<uint8_t> ref = Buffer<uint8_t>(width, height, 8);
+  Buffer<Pixel> ref = Buffer<Pixel>(width, height, 8);
   ASSERT_TRUE(ref.Init());
-  Buffer<uint8_t> pred = Buffer<uint8_t>(width, height, 0, 16);
+  Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 32);
   ASSERT_TRUE(pred.Init());
-  Buffer<uint8_t> avg_ref = Buffer<uint8_t>(width, height, 0, 16);
+  Buffer<Pixel> avg_ref = Buffer<Pixel>(width, height, 0, 32);
   ASSERT_TRUE(avg_ref.Init());
-  Buffer<uint8_t> avg_chk = Buffer<uint8_t>(width, height, 0, 16);
+  Buffer<Pixel> avg_chk = Buffer<Pixel>(width, height, 0, 32);
   ASSERT_TRUE(avg_chk.Init());
 
   for (int i = 0; i < 500; ++i) {
-    ref.Set(&rnd_, &ACMRandom::Rand8);
-    pred.Set(&rnd_, &ACMRandom::Rand8);
+    const int bitdepth_mask = (1 << bitdepth) - 1;
+    for (int h = 0; h < height; ++h) {
+      for (int w = 0; w < width; ++w) {
+        ref.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask;
+      }
+    }
+    for (int h = 0; h < height; ++h) {
+      for (int w = 0; w < width; ++w) {
+        pred.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask;
+      }
+    }
 
-    reference_pred(pred, ref, width, height, &avg_ref);
-    ASM_REGISTER_STATE_CHECK(avg_pred_func_(avg_chk.TopLeftPixel(),
-                                            pred.TopLeftPixel(), width, height,
-                                            ref.TopLeftPixel(), ref.stride()));
+    reference_pred<Pixel>(pred, ref, width, height, &avg_ref);
+    ASM_REGISTER_STATE_CHECK(avg_pred_func_(
+        (uint8_t *)avg_chk.TopLeftPixel(), (uint8_t *)pred.TopLeftPixel(),
+        width, height, (uint8_t *)ref.TopLeftPixel(), ref.stride()));
     EXPECT_TRUE(avg_chk.CheckValues(avg_ref));
     if (HasFailure()) {
       printf("Width: %d Height: %d\n", width, height);
@@ -128,7 +154,8 @@ TEST_P(AvgPredTest, CompareReferenceRandom) {
   }
 }
 
-TEST_P(AvgPredTest, DISABLED_Speed) {
+template <int bitdepth, typename Pixel>
+void AvgPredTest<bitdepth, Pixel>::TestSpeed() {
   for (int width_pow = 2; width_pow <= 6; ++width_pow) {
     for (int height_pow = width_pow - 1; height_pow <= width_pow + 1;
          ++height_pow) {
@@ -138,22 +165,30 @@ TEST_P(AvgPredTest, DISABLED_Speed) {
       for (int ref_padding = 0; ref_padding < 2; ref_padding++) {
         const int width = 1 << width_pow;
         const int height = 1 << height_pow;
-        Buffer<uint8_t> ref =
-            Buffer<uint8_t>(width, height, ref_padding ? 8 : 0);
+        Buffer<Pixel> ref = Buffer<Pixel>(width, height, ref_padding ? 8 : 0);
         ASSERT_TRUE(ref.Init());
-        Buffer<uint8_t> pred = Buffer<uint8_t>(width, height, 0, 16);
+        Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 32);
         ASSERT_TRUE(pred.Init());
-        Buffer<uint8_t> avg = Buffer<uint8_t>(width, height, 0, 16);
+        Buffer<Pixel> avg = Buffer<Pixel>(width, height, 0, 32);
         ASSERT_TRUE(avg.Init());
-
-        ref.Set(&rnd_, &ACMRandom::Rand8);
-        pred.Set(&rnd_, &ACMRandom::Rand8);
+        const int bitdepth_mask = (1 << bitdepth) - 1;
+        for (int h = 0; h < height; ++h) {
+          for (int w = 0; w < width; ++w) {
+            ref.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask;
+          }
+        }
+        for (int h = 0; h < height; ++h) {
+          for (int w = 0; w < width; ++w) {
+            pred.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask;
+          }
+        }
 
         vpx_usec_timer timer;
         vpx_usec_timer_start(&timer);
-        for (int i = 0; i < 10000000 / (width * height); ++i) {
-          avg_pred_func_(avg.TopLeftPixel(), pred.TopLeftPixel(), width, height,
-                         ref.TopLeftPixel(), ref.stride());
+        for (int i = 0; i < 100000000 / (width * height); ++i) {
+          avg_pred_func_((uint8_t *)avg.TopLeftPixel(),
+                         (uint8_t *)pred.TopLeftPixel(), width, height,
+                         (uint8_t *)ref.TopLeftPixel(), ref.stride());
         }
         vpx_usec_timer_mark(&timer);
 
@@ -166,21 +201,75 @@ TEST_P(AvgPredTest, DISABLED_Speed) {
   }
 }
 
-INSTANTIATE_TEST_SUITE_P(C, AvgPredTest,
+using AvgPredTestLBD = AvgPredTest<8, uint8_t>;
+
+TEST_P(AvgPredTestLBD, SizeCombinations) { TestSizeCombinations(); }
+
+TEST_P(AvgPredTestLBD, CompareReferenceRandom) { TestCompareReferenceRandom(); }
+
+TEST_P(AvgPredTestLBD, DISABLED_Speed) { TestSpeed(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AvgPredTestLBD,
                          ::testing::Values(&vpx_comp_avg_pred_c));
 
 #if HAVE_SSE2
-INSTANTIATE_TEST_SUITE_P(SSE2, AvgPredTest,
+INSTANTIATE_TEST_SUITE_P(SSE2, AvgPredTestLBD,
                          ::testing::Values(&vpx_comp_avg_pred_sse2));
 #endif  // HAVE_SSE2
 
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AvgPredTestLBD,
+                         ::testing::Values(&vpx_comp_avg_pred_avx2));
+#endif  // HAVE_AVX2
+
 #if HAVE_NEON
-INSTANTIATE_TEST_SUITE_P(NEON, AvgPredTest,
+INSTANTIATE_TEST_SUITE_P(NEON, AvgPredTestLBD,
                          ::testing::Values(&vpx_comp_avg_pred_neon));
 #endif  // HAVE_NEON
 
 #if HAVE_VSX
-INSTANTIATE_TEST_SUITE_P(VSX, AvgPredTest,
+INSTANTIATE_TEST_SUITE_P(VSX, AvgPredTestLBD,
                          ::testing::Values(&vpx_comp_avg_pred_vsx));
 #endif  // HAVE_VSX
+
+#if HAVE_LSX
+INSTANTIATE_TEST_SUITE_P(LSX, AvgPredTestLBD,
+                         ::testing::Values(&vpx_comp_avg_pred_lsx));
+#endif  // HAVE_LSX
+
+#if CONFIG_VP9_HIGHBITDEPTH
+using HighbdAvgPredFunc = void (*)(uint16_t *a, const uint16_t *b, int w, int h,
+                                   const uint16_t *c, int c_stride);
+
+template <HighbdAvgPredFunc fn>
+void highbd_wrapper(uint8_t *a, const uint8_t *b, int w, int h,
+                    const uint8_t *c, int c_stride) {
+  fn((uint16_t *)a, (const uint16_t *)b, w, h, (const uint16_t *)c, c_stride);
+}
+
+using AvgPredTestHBD = AvgPredTest<12, uint16_t>;
+
+TEST_P(AvgPredTestHBD, SizeCombinations) { TestSizeCombinations(); }
+
+TEST_P(AvgPredTestHBD, CompareReferenceRandom) { TestCompareReferenceRandom(); }
+
+TEST_P(AvgPredTestHBD, DISABLED_Speed) { TestSpeed(); }
+
+INSTANTIATE_TEST_SUITE_P(
+    C, AvgPredTestHBD,
+    ::testing::Values(&highbd_wrapper<vpx_highbd_comp_avg_pred_c>));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, AvgPredTestHBD,
+    ::testing::Values(&highbd_wrapper<vpx_highbd_comp_avg_pred_sse2>));
+#endif  // HAVE_SSE2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, AvgPredTestHBD,
+    ::testing::Values(&highbd_wrapper<vpx_highbd_comp_avg_pred_neon>));
+#endif  // HAVE_NEON
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 }  // namespace
diff --git a/test/config_test.cc b/test/config_test.cc
index 8f4c60e113..729b01151b 100644
--- a/test/config_test.cc
+++ b/test/config_test.cc
@@ -22,24 +22,24 @@ class ConfigTest
   ConfigTest()
       : EncoderTest(GET_PARAM(0)), frame_count_in_(0), frame_count_out_(0),
         frame_count_max_(0) {}
-  virtual ~ConfigTest() {}
+  ~ConfigTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
   }
 
-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
     frame_count_in_ = 0;
     frame_count_out_ = 0;
   }
 
-  virtual void PreEncodeFrameHook(libvpx_test::VideoSource * /*video*/) {
+  void PreEncodeFrameHook(libvpx_test::VideoSource * /*video*/) override {
     ++frame_count_in_;
     abort_ |= (frame_count_in_ >= frame_count_max_);
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {
+  void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) override {
     ++frame_count_out_;
   }
 
diff --git a/test/consistency_test.cc b/test/consistency_test.cc
index f31fd8c923..5e872e70a8 100644
--- a/test/consistency_test.cc
+++ b/test/consistency_test.cc
@@ -54,25 +54,25 @@ class ConsistencyTestBase : public ::testing::Test {
   static void ClearSsim() { memset(ssim_array_, 0, kDataBufferSize / 16); }
   static void TearDownTestSuite() {
     vpx_free(source_data_[0]);
-    source_data_[0] = NULL;
+    source_data_[0] = nullptr;
     vpx_free(reference_data_[0]);
-    reference_data_[0] = NULL;
+    reference_data_[0] = nullptr;
     vpx_free(source_data_[1]);
-    source_data_[1] = NULL;
+    source_data_[1] = nullptr;
     vpx_free(reference_data_[1]);
-    reference_data_[1] = NULL;
+    reference_data_[1] = nullptr;
 
     delete[] ssim_array_;
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
   // Handle frames up to 640x480
   static const int kDataAlignment = 16;
   static const int kDataBufferSize = 640 * 480;
 
-  virtual void SetUp() {
+  void SetUp() override {
     source_stride_ = (width_ + 31) & ~31;
     reference_stride_ = width_ * 2;
     rnd_.Reset(ACMRandom::DeterministicSeed());
@@ -145,9 +145,9 @@ class ConsistencyVP9Test
 };
 #endif  // CONFIG_VP9_ENCODER
 
-uint8_t *ConsistencyTestBase::source_data_[2] = { NULL, NULL };
-uint8_t *ConsistencyTestBase::reference_data_[2] = { NULL, NULL };
-Ssimv *ConsistencyTestBase::ssim_array_ = NULL;
+uint8_t *ConsistencyTestBase::source_data_[2] = { nullptr, nullptr };
+uint8_t *ConsistencyTestBase::reference_data_[2] = { nullptr, nullptr };
+Ssimv *ConsistencyTestBase::ssim_array_ = nullptr;
 
 #if CONFIG_VP9_ENCODER
 TEST_P(ConsistencyVP9Test, ConsistencyIsZero) {
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 6eef26f93a..ffd5c41c63 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -244,7 +244,7 @@ void highbd_filter_block2d_8_c(const uint16_t *src_ptr,
 
   // Vertical pass (transposed intermediate -> dst).
   {
-    uint16_t *src_ptr = intermediate_buffer;
+    src_ptr = intermediate_buffer;
     const int dst_next_row_stride = dst_stride - output_width;
     unsigned int i, j;
     for (i = 0; i < output_height; ++i) {
@@ -361,22 +361,22 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
 #endif
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
   static void TearDownTestSuite() {
     vpx_free(input_ - 1);
-    input_ = NULL;
+    input_ = nullptr;
     vpx_free(output_);
-    output_ = NULL;
+    output_ = nullptr;
     vpx_free(output_ref_);
-    output_ref_ = NULL;
+    output_ref_ = nullptr;
 #if CONFIG_VP9_HIGHBITDEPTH
     vpx_free(input16_ - 1);
-    input16_ = NULL;
+    input16_ = nullptr;
     vpx_free(output16_);
-    output16_ = NULL;
+    output16_ = nullptr;
     vpx_free(output16_ref_);
-    output16_ref_ = NULL;
+    output16_ref_ = nullptr;
 #endif
   }
 
@@ -403,7 +403,7 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
             i % kOuterBlockSize >= (BorderLeft() + Width()));
   }
 
-  virtual void SetUp() {
+  void SetUp() override {
     UUT_ = GET_PARAM(2);
 #if CONFIG_VP9_HIGHBITDEPTH
     if (UUT_->use_highbd_ != 0) {
@@ -541,13 +541,13 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
 #endif
 };
 
-uint8_t *ConvolveTest::input_ = NULL;
-uint8_t *ConvolveTest::output_ = NULL;
-uint8_t *ConvolveTest::output_ref_ = NULL;
+uint8_t *ConvolveTest::input_ = nullptr;
+uint8_t *ConvolveTest::output_ = nullptr;
+uint8_t *ConvolveTest::output_ref_ = nullptr;
 #if CONFIG_VP9_HIGHBITDEPTH
-uint16_t *ConvolveTest::input16_ = NULL;
-uint16_t *ConvolveTest::output16_ = NULL;
-uint16_t *ConvolveTest::output16_ref_ = NULL;
+uint16_t *ConvolveTest::input16_ = nullptr;
+uint16_t *ConvolveTest::output16_ = nullptr;
+uint16_t *ConvolveTest::output16_ref_ = nullptr;
 #endif
 
 TEST_P(ConvolveTest, GuardBlocks) { CheckGuardBlocks(); }
@@ -562,7 +562,7 @@ TEST_P(ConvolveTest, DISABLED_Copy_Speed) {
 
   vpx_usec_timer_start(&timer);
   for (int n = 0; n < kNumTests; ++n) {
-    UUT_->copy_[0](in, kInputStride, out, kOutputStride, NULL, 0, 0, 0, 0,
+    UUT_->copy_[0](in, kInputStride, out, kOutputStride, nullptr, 0, 0, 0, 0,
                    width, height);
   }
   vpx_usec_timer_mark(&timer);
@@ -582,7 +582,7 @@ TEST_P(ConvolveTest, DISABLED_Avg_Speed) {
 
   vpx_usec_timer_start(&timer);
   for (int n = 0; n < kNumTests; ++n) {
-    UUT_->copy_[1](in, kInputStride, out, kOutputStride, NULL, 0, 0, 0, 0,
+    UUT_->copy_[1](in, kInputStride, out, kOutputStride, nullptr, 0, 0, 0, 0,
                    width, height);
   }
   vpx_usec_timer_mark(&timer);
@@ -780,7 +780,8 @@ TEST_P(ConvolveTest, Copy) {
   uint8_t *const out = output();
 
   ASM_REGISTER_STATE_CHECK(UUT_->copy_[0](in, kInputStride, out, kOutputStride,
-                                          NULL, 0, 0, 0, 0, Width(), Height()));
+                                          nullptr, 0, 0, 0, 0, Width(),
+                                          Height()));
 
   CheckGuardBlocks();
 
@@ -799,7 +800,8 @@ TEST_P(ConvolveTest, Avg) {
   CopyOutputToRef();
 
   ASM_REGISTER_STATE_CHECK(UUT_->copy_[1](in, kInputStride, out, kOutputStride,
-                                          NULL, 0, 0, 0, 0, Width(), Height()));
+                                          nullptr, 0, 0, 0, 0, Width(),
+                                          Height()));
 
   CheckGuardBlocks();
 
@@ -955,9 +957,9 @@ TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) {
                 UUT_->h8_[i](in, kInputStride, out, kOutputStride, filters,
                              filter_x, 16, 0, 16, Width(), Height()));
           else
-            ASM_REGISTER_STATE_CHECK(UUT_->copy_[i](in, kInputStride, out,
-                                                    kOutputStride, NULL, 0, 0,
-                                                    0, 0, Width(), Height()));
+            ASM_REGISTER_STATE_CHECK(
+                UUT_->copy_[i](in, kInputStride, out, kOutputStride, nullptr, 0,
+                               0, 0, 0, Width(), Height()));
 
           CheckGuardBlocks();
 
@@ -1053,9 +1055,9 @@ TEST_P(ConvolveTest, FilterExtremes) {
                   UUT_->h8_[0](in, kInputStride, out, kOutputStride, filters,
                                filter_x, 16, 0, 16, Width(), Height()));
             else
-              ASM_REGISTER_STATE_CHECK(UUT_->copy_[0](in, kInputStride, out,
-                                                      kOutputStride, NULL, 0, 0,
-                                                      0, 0, Width(), Height()));
+              ASM_REGISTER_STATE_CHECK(
+                  UUT_->copy_[0](in, kInputStride, out, kOutputStride, nullptr,
+                                 0, 0, 0, 0, Width(), Height()));
 
             for (int y = 0; y < Height(); ++y) {
               for (int x = 0; x < Width(); ++x)
@@ -1421,6 +1423,36 @@ INSTANTIATE_TEST_SUITE_P(NEON, ConvolveTest,
                          ::testing::ValuesIn(kArrayConvolve_neon));
 #endif  // HAVE_NEON
 
+#if HAVE_NEON_DOTPROD
+const ConvolveFunctions convolve8_neon_dotprod(
+    vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_neon_dotprod,
+    vpx_convolve8_avg_horiz_neon_dotprod, vpx_convolve8_vert_neon_dotprod,
+    vpx_convolve8_avg_vert_neon_dotprod, vpx_convolve8_neon_dotprod,
+    vpx_convolve8_avg_neon_dotprod, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c,
+    vpx_scaled_vert_c, vpx_scaled_avg_vert_c, vpx_scaled_2d_c,
+    vpx_scaled_avg_2d_c, 0);
+
+const ConvolveParam kArrayConvolve_neon_dotprod[] = { ALL_SIZES(
+    convolve8_neon_dotprod) };
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, ConvolveTest,
+                         ::testing::ValuesIn(kArrayConvolve_neon_dotprod));
+#endif  // HAVE_NEON_DOTPROD
+
+#if HAVE_NEON_I8MM
+const ConvolveFunctions convolve8_neon_i8mm(
+    vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_neon_i8mm,
+    vpx_convolve8_avg_horiz_neon_i8mm, vpx_convolve8_vert_neon_i8mm,
+    vpx_convolve8_avg_vert_neon_i8mm, vpx_convolve8_neon_i8mm,
+    vpx_convolve8_avg_neon_i8mm, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c,
+    vpx_scaled_vert_c, vpx_scaled_avg_vert_c, vpx_scaled_2d_c,
+    vpx_scaled_avg_2d_c, 0);
+
+const ConvolveParam kArrayConvolve_neon_i8mm[] = { ALL_SIZES(
+    convolve8_neon_i8mm) };
+INSTANTIATE_TEST_SUITE_P(NEON_I8MM, ConvolveTest,
+                         ::testing::ValuesIn(kArrayConvolve_neon_i8mm));
+#endif  // HAVE_NEON_I8MM
+
 #if HAVE_DSPR2
 const ConvolveFunctions convolve8_dspr2(
     vpx_convolve_copy_dspr2, vpx_convolve_avg_dspr2, vpx_convolve8_horiz_dspr2,
@@ -1447,6 +1479,19 @@ INSTANTIATE_TEST_SUITE_P(MSA, ConvolveTest,
                          ::testing::ValuesIn(kArrayConvolve8_msa));
 #endif  // HAVE_MSA
 
+#if HAVE_LSX
+const ConvolveFunctions convolve8_lsx(
+    vpx_convolve_copy_lsx, vpx_convolve_avg_lsx, vpx_convolve8_horiz_lsx,
+    vpx_convolve8_avg_horiz_lsx, vpx_convolve8_vert_lsx,
+    vpx_convolve8_avg_vert_lsx, vpx_convolve8_lsx, vpx_convolve8_avg_lsx,
+    vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c,
+    vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
+
+const ConvolveParam kArrayConvolve8_lsx[] = { ALL_SIZES(convolve8_lsx) };
+INSTANTIATE_TEST_SUITE_P(LSX, ConvolveTest,
+                         ::testing::ValuesIn(kArrayConvolve8_lsx));
+#endif  // HAVE_LSX
+
 #if HAVE_VSX
 const ConvolveFunctions convolve8_vsx(
     vpx_convolve_copy_vsx, vpx_convolve_avg_vsx, vpx_convolve8_horiz_vsx,
diff --git a/test/cpu_speed_test.cc b/test/cpu_speed_test.cc
index a7623f09ac..22f4552963 100644
--- a/test/cpu_speed_test.cc
+++ b/test/cpu_speed_test.cc
@@ -26,9 +26,9 @@ class CpuSpeedTest
       : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
         set_cpu_used_(GET_PARAM(2)), min_psnr_(kMaxPSNR),
         tune_content_(VP9E_CONTENT_DEFAULT) {}
-  virtual ~CpuSpeedTest() {}
+  ~CpuSpeedTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(encoding_mode_);
     if (encoding_mode_ != ::libvpx_test::kRealTime) {
@@ -40,10 +40,10 @@ class CpuSpeedTest
     }
   }
 
-  virtual void BeginPassHook(unsigned int /*pass*/) { min_psnr_ = kMaxPSNR; }
+  void BeginPassHook(unsigned int /*pass*/) override { min_psnr_ = kMaxPSNR; }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
       encoder->Control(VP9E_SET_TUNE_CONTENT, tune_content_);
@@ -56,7 +56,7 @@ class CpuSpeedTest
     }
   }
 
-  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override {
     if (pkt->data.psnr.psnr[0] < min_psnr_) min_psnr_ = pkt->data.psnr.psnr[0];
   }
 
diff --git a/test/cq_test.cc b/test/cq_test.cc
index 292adb0d04..b74915a336 100644
--- a/test/cq_test.cc
+++ b/test/cq_test.cc
@@ -50,21 +50,21 @@ class CQTest : public ::libvpx_test::EncoderTest,
     init_flags_ = VPX_CODEC_USE_PSNR;
   }
 
-  virtual ~CQTest() {}
+  ~CQTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(libvpx_test::kTwoPassGood);
   }
 
-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
     file_size_ = 0;
     psnr_ = 0.0;
     n_frames_ = 0;
   }
 
-  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
-                                  libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                          libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       if (cfg_.rc_end_usage == VPX_CQ) {
         encoder->Control(VP8E_SET_CQ_LEVEL, cq_level_);
@@ -73,12 +73,12 @@ class CQTest : public ::libvpx_test::EncoderTest,
     }
   }
 
-  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override {
     psnr_ += pow(10.0, pkt->data.psnr.psnr[0] / 10.0);
     n_frames_++;
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     file_size_ += pkt->data.frame.sz;
   }
 
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index d586db7208..8c4213ee16 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -27,6 +27,7 @@
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
 #include "vpx_ports/msvc.h"  // for round()
+#include "vpx_ports/vpx_timer.h"
 
 using libvpx_test::ACMRandom;
 
@@ -309,7 +310,7 @@ void idct16x16_10_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
 
 class Trans16x16TestBase {
  public:
-  virtual ~Trans16x16TestBase() {}
+  virtual ~Trans16x16TestBase() = default;
 
  protected:
   virtual void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) = 0;
@@ -548,12 +549,50 @@ class Trans16x16TestBase {
     }
   }
 
+  void RunSpeedTest() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 10000;
+    int c_sum_time = 0;
+    int simd_sum_time = 0;
+
+    DECLARE_ALIGNED(32, int16_t, input_block[kNumCoeffs]);
+    DECLARE_ALIGNED(32, tran_low_t, output_ref_block[kNumCoeffs]);
+    DECLARE_ALIGNED(32, tran_low_t, output_block[kNumCoeffs]);
+
+    // Initialize a test block with input range [-mask_, mask_].
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+    }
+
+    vpx_usec_timer timer_c;
+    vpx_usec_timer_start(&timer_c);
+    for (int i = 0; i < count_test_block; ++i) {
+      vpx_fdct16x16_c(input_block, output_ref_block, pitch_);
+    }
+    vpx_usec_timer_mark(&timer_c);
+    c_sum_time += static_cast<int>(vpx_usec_timer_elapsed(&timer_c));
+
+    vpx_usec_timer timer_mod;
+    vpx_usec_timer_start(&timer_mod);
+    for (int i = 0; i < count_test_block; ++i) {
+      RunFwdTxfm(input_block, output_block, pitch_);
+    }
+
+    vpx_usec_timer_mark(&timer_mod);
+    simd_sum_time += static_cast<int>(vpx_usec_timer_elapsed(&timer_mod));
+
+    printf(
+        "c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time,
+        simd_sum_time,
+        (static_cast<float>(c_sum_time) / static_cast<float>(simd_sum_time)));
+  }
+
   void CompareInvReference(IdctFunc ref_txfm, int thresh) {
     ACMRandom rnd(ACMRandom::DeterministicSeed());
     const int count_test_block = 10000;
     const int eob = 10;
     const int16_t *scan = vp9_default_scan_orders[TX_16X16].scan;
-    DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
+    DECLARE_ALIGNED(32, tran_low_t, coeff[kNumCoeffs]);
     DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
     DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]);
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -604,6 +643,80 @@ class Trans16x16TestBase {
     }
   }
 
+  void RunInvTrans16x16SpeedTest(IdctFunc ref_txfm, int thresh) {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 10000;
+    const int eob = 10;
+    const int16_t *scan = vp9_default_scan_orders[TX_16X16].scan;
+    int64_t c_sum_time = 0;
+    int64_t simd_sum_time = 0;
+    DECLARE_ALIGNED(32, tran_low_t, coeff[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      if (j < eob) {
+        // Random values less than the threshold, either positive or negative
+        coeff[scan[j]] = rnd(thresh);
+      } else {
+        coeff[scan[j]] = 0;
+      }
+      if (bit_depth_ == VPX_BITS_8) {
+        dst[j] = 0;
+        ref[j] = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        dst16[j] = 0;
+        ref16[j] = 0;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
+    }
+
+    if (bit_depth_ == VPX_BITS_8) {
+      vpx_usec_timer timer_c;
+      vpx_usec_timer_start(&timer_c);
+      for (int i = 0; i < count_test_block; ++i) {
+        ref_txfm(coeff, ref, pitch_);
+      }
+      vpx_usec_timer_mark(&timer_c);
+      c_sum_time += vpx_usec_timer_elapsed(&timer_c);
+
+      vpx_usec_timer timer_mod;
+      vpx_usec_timer_start(&timer_mod);
+      for (int i = 0; i < count_test_block; ++i) {
+        RunInvTxfm(coeff, dst, pitch_);
+      }
+      vpx_usec_timer_mark(&timer_mod);
+      simd_sum_time += vpx_usec_timer_elapsed(&timer_mod);
+    } else {
+#if CONFIG_VP9_HIGHBITDEPTH
+      vpx_usec_timer timer_c;
+      vpx_usec_timer_start(&timer_c);
+      for (int i = 0; i < count_test_block; ++i) {
+        ref_txfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_);
+      }
+      vpx_usec_timer_mark(&timer_c);
+      c_sum_time += vpx_usec_timer_elapsed(&timer_c);
+
+      vpx_usec_timer timer_mod;
+      vpx_usec_timer_start(&timer_mod);
+      for (int i = 0; i < count_test_block; ++i) {
+        RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_);
+      }
+      vpx_usec_timer_mark(&timer_mod);
+      simd_sum_time += vpx_usec_timer_elapsed(&timer_mod);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    }
+    printf(
+        "c_time = %" PRId64 " \t simd_time = %" PRId64 " \t Gain = %4.2f \n",
+        c_sum_time, simd_sum_time,
+        (static_cast<float>(c_sum_time) / static_cast<float>(simd_sum_time)));
+  }
+
   int pitch_;
   int tx_type_;
   vpx_bit_depth_t bit_depth_;
@@ -615,9 +728,9 @@ class Trans16x16TestBase {
 class Trans16x16DCT : public Trans16x16TestBase,
                       public ::testing::TestWithParam<Dct16x16Param> {
  public:
-  virtual ~Trans16x16DCT() {}
+  ~Trans16x16DCT() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     fwd_txfm_ = GET_PARAM(0);
     inv_txfm_ = GET_PARAM(1);
     tx_type_ = GET_PARAM(2);
@@ -636,13 +749,13 @@ class Trans16x16DCT : public Trans16x16TestBase,
     inv_txfm_ref = idct16x16_ref;
 #endif
   }
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
-  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) override {
     fwd_txfm_(in, out, stride);
   }
-  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override {
     inv_txfm_(out, dst, stride);
   }
 
@@ -664,12 +777,14 @@ TEST_P(Trans16x16DCT, QuantCheck) {
 
 TEST_P(Trans16x16DCT, InvAccuracyCheck) { RunInvAccuracyCheck(); }
 
+TEST_P(Trans16x16DCT, DISABLED_Speed) { RunSpeedTest(); }
+
 class Trans16x16HT : public Trans16x16TestBase,
                      public ::testing::TestWithParam<Ht16x16Param> {
  public:
-  virtual ~Trans16x16HT() {}
+  ~Trans16x16HT() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     fwd_txfm_ = GET_PARAM(0);
     inv_txfm_ = GET_PARAM(1);
     tx_type_ = GET_PARAM(2);
@@ -688,13 +803,13 @@ class Trans16x16HT : public Trans16x16TestBase,
     inv_txfm_ref = iht16x16_ref;
 #endif
   }
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
-  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) override {
     fwd_txfm_(in, out, stride, tx_type_);
   }
-  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override {
     inv_txfm_(out, dst, stride, tx_type_);
   }
 
@@ -717,9 +832,9 @@ TEST_P(Trans16x16HT, QuantCheck) {
 class InvTrans16x16DCT : public Trans16x16TestBase,
                          public ::testing::TestWithParam<Idct16x16Param> {
  public:
-  virtual ~InvTrans16x16DCT() {}
+  ~InvTrans16x16DCT() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     ref_txfm_ = GET_PARAM(0);
     inv_txfm_ = GET_PARAM(1);
     thresh_ = GET_PARAM(2);
@@ -727,11 +842,12 @@ class InvTrans16x16DCT : public Trans16x16TestBase,
     pitch_ = 16;
     mask_ = (1 << bit_depth_) - 1;
   }
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
-  void RunFwdTxfm(int16_t * /*in*/, tran_low_t * /*out*/, int /*stride*/) {}
-  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+  void RunFwdTxfm(int16_t * /*in*/, tran_low_t * /*out*/,
+                  int /*stride*/) override {}
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override {
     inv_txfm_(out, dst, stride);
   }
 
@@ -739,11 +855,16 @@ class InvTrans16x16DCT : public Trans16x16TestBase,
   IdctFunc inv_txfm_;
   int thresh_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(InvTrans16x16DCT);
 
 TEST_P(InvTrans16x16DCT, CompareReference) {
   CompareInvReference(ref_txfm_, thresh_);
 }
 
+TEST_P(InvTrans16x16DCT, DISABLED_Speed) {
+  RunInvTrans16x16SpeedTest(ref_txfm_, thresh_);
+}
+
 using std::make_tuple;
 
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -784,15 +905,31 @@ INSTANTIATE_TEST_SUITE_P(
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8)));
+
+INSTANTIATE_TEST_SUITE_P(C, InvTrans16x16DCT,
+                         ::testing::Values(make_tuple(&vpx_idct16x16_256_add_c,
+                                                      &vpx_idct16x16_256_add_c,
+                                                      6225, VPX_BITS_8)));
+
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE
+#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_SUITE_P(
     NEON, Trans16x16DCT,
     ::testing::Values(make_tuple(&vpx_fdct16x16_neon,
                                  &vpx_idct16x16_256_add_neon, 0, VPX_BITS_8)));
 #endif  // HAVE_NEON && !CONFIG_EMULATE_HARDWARE
 
+#if HAVE_NEON && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_SUITE_P(
+    NEON, Trans16x16DCT,
+    ::testing::Values(
+        make_tuple(&vpx_highbd_fdct16x16_neon, &idct16x16_10, 0, VPX_BITS_10),
+        make_tuple(&vpx_highbd_fdct16x16_neon, &idct16x16_12, 0, VPX_BITS_12),
+        make_tuple(&vpx_fdct16x16_neon, &vpx_idct16x16_256_add_c, 0,
+                   VPX_BITS_8)));
+#endif  // HAVE_NEON && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
 #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_SUITE_P(
     SSE2, Trans16x16DCT,
@@ -808,8 +945,25 @@ INSTANTIATE_TEST_SUITE_P(
                                  2, VPX_BITS_8),
                       make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2,
                                  3, VPX_BITS_8)));
+
+INSTANTIATE_TEST_SUITE_P(SSE2, InvTrans16x16DCT,
+                         ::testing::Values(make_tuple(
+                             &vpx_idct16x16_256_add_c,
+                             &vpx_idct16x16_256_add_sse2, 6225, VPX_BITS_8)));
 #endif  // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
+#if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, Trans16x16DCT,
+    ::testing::Values(make_tuple(&vpx_fdct16x16_avx2,
+                                 &vpx_idct16x16_256_add_sse2, 0, VPX_BITS_8)));
+
+INSTANTIATE_TEST_SUITE_P(AVX2, InvTrans16x16DCT,
+                         ::testing::Values(make_tuple(
+                             &vpx_idct16x16_256_add_c,
+                             &vpx_idct16x16_256_add_avx2, 6225, VPX_BITS_8)));
+#endif  // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
 #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_SUITE_P(
     SSE2, Trans16x16DCT,
@@ -865,4 +1019,11 @@ INSTANTIATE_TEST_SUITE_P(
     ::testing::Values(make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_vsx,
                                  0, VPX_BITS_8)));
 #endif  // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_SUITE_P(LSX, Trans16x16DCT,
+                         ::testing::Values(make_tuple(&vpx_fdct16x16_lsx,
+                                                      &vpx_idct16x16_256_add_c,
+                                                      0, VPX_BITS_8)));
+#endif  // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 }  // namespace
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index 8398e17e81..6233b17a43 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -24,10 +24,12 @@
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_scan.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
 #include "vpx_ports/msvc.h"  // for round()
+#include "vpx_ports/vpx_timer.h"
 
 using libvpx_test::ACMRandom;
 
@@ -71,6 +73,9 @@ typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride);
 typedef std::tuple<FwdTxfmFunc, InvTxfmFunc, int, vpx_bit_depth_t>
     Trans32x32Param;
 
+typedef std::tuple<InvTxfmFunc, InvTxfmFunc, int, vpx_bit_depth_t, int, int>
+    InvTrans32x32Param;
+
 #if CONFIG_VP9_HIGHBITDEPTH
 void idct32x32_10(const tran_low_t *in, uint8_t *out, int stride) {
   vpx_highbd_idct32x32_1024_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
@@ -84,8 +89,8 @@ void idct32x32_12(const tran_low_t *in, uint8_t *out, int stride) {
 class Trans32x32Test : public AbstractBench,
                        public ::testing::TestWithParam<Trans32x32Param> {
  public:
-  virtual ~Trans32x32Test() {}
-  virtual void SetUp() {
+  ~Trans32x32Test() override = default;
+  void SetUp() override {
     fwd_txfm_ = GET_PARAM(0);
     inv_txfm_ = GET_PARAM(1);
     version_ = GET_PARAM(2);  // 0: high precision forward transform
@@ -94,7 +99,7 @@ class Trans32x32Test : public AbstractBench,
     mask_ = (1 << bit_depth_) - 1;
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
   int version_;
@@ -105,7 +110,7 @@ class Trans32x32Test : public AbstractBench,
 
   int16_t *bench_in_;
   tran_low_t *bench_out_;
-  virtual void Run();
+  void Run() override;
 };
 
 void Trans32x32Test::Run() { fwd_txfm_(bench_in_, bench_out_, 32); }
@@ -314,6 +319,174 @@ TEST_P(Trans32x32Test, InverseAccuracy) {
   }
 }
 
+class InvTrans32x32Test : public ::testing::TestWithParam<InvTrans32x32Param> {
+ public:
+  ~InvTrans32x32Test() override = default;
+  void SetUp() override {
+    ref_txfm_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    version_ = GET_PARAM(2);  // 0: high precision forward transform
+                              // 1: low precision version for rd loop
+    bit_depth_ = GET_PARAM(3);
+    eob_ = GET_PARAM(4);
+    thresh_ = GET_PARAM(4);
+    mask_ = (1 << bit_depth_) - 1;
+    pitch_ = 32;
+  }
+
+  void TearDown() override { libvpx_test::ClearSystemState(); }
+
+ protected:
+  void RunRefTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+    ref_txfm_(out, dst, stride);
+  }
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+    inv_txfm_(out, dst, stride);
+  }
+  int version_;
+  vpx_bit_depth_t bit_depth_;
+  int mask_;
+  int eob_;
+  int thresh_;
+
+  InvTxfmFunc ref_txfm_;
+  InvTxfmFunc inv_txfm_;
+  int pitch_;
+
+  void RunInvTrans32x32SpeedTest() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 10000;
+    int64_t c_sum_time = 0;
+    int64_t simd_sum_time = 0;
+    const int16_t *scan = vp9_default_scan_orders[TX_32X32].scan;
+    DECLARE_ALIGNED(32, tran_low_t, coeff[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      if (j < eob_) {
+        // Random values less than the threshold, either positive or negative
+        coeff[scan[j]] = rnd(thresh_);
+      } else {
+        coeff[scan[j]] = 0;
+      }
+      if (bit_depth_ == VPX_BITS_8) {
+        dst[j] = 0;
+        ref[j] = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        dst16[j] = 0;
+        ref16[j] = 0;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
+    }
+
+    if (bit_depth_ == VPX_BITS_8) {
+      vpx_usec_timer timer_c;
+      vpx_usec_timer_start(&timer_c);
+      for (int i = 0; i < count_test_block; ++i) {
+        RunRefTxfm(coeff, ref, pitch_);
+      }
+      vpx_usec_timer_mark(&timer_c);
+      c_sum_time += vpx_usec_timer_elapsed(&timer_c);
+
+      vpx_usec_timer timer_mod;
+      vpx_usec_timer_start(&timer_mod);
+      for (int i = 0; i < count_test_block; ++i) {
+        RunInvTxfm(coeff, dst, pitch_);
+      }
+      vpx_usec_timer_mark(&timer_mod);
+      simd_sum_time += vpx_usec_timer_elapsed(&timer_mod);
+    } else {
+#if CONFIG_VP9_HIGHBITDEPTH
+      vpx_usec_timer timer_c;
+      vpx_usec_timer_start(&timer_c);
+      for (int i = 0; i < count_test_block; ++i) {
+        RunRefTxfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_);
+      }
+      vpx_usec_timer_mark(&timer_c);
+      c_sum_time += vpx_usec_timer_elapsed(&timer_c);
+
+      vpx_usec_timer timer_mod;
+      vpx_usec_timer_start(&timer_mod);
+      for (int i = 0; i < count_test_block; ++i) {
+        RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_);
+      }
+      vpx_usec_timer_mark(&timer_mod);
+      simd_sum_time += vpx_usec_timer_elapsed(&timer_mod);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    }
+    printf(
+        "c_time = %" PRId64 " \t simd_time = %" PRId64 " \t Gain = %4.2f \n",
+        c_sum_time, simd_sum_time,
+        (static_cast<float>(c_sum_time) / static_cast<float>(simd_sum_time)));
+  }
+
+  void CompareInvReference32x32() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 10000;
+    const int eob = 31;
+    const int16_t *scan = vp9_default_scan_orders[TX_32X32].scan;
+    DECLARE_ALIGNED(32, tran_low_t, coeff[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    for (int i = 0; i < count_test_block; ++i) {
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        if (j < eob) {
+          coeff[scan[j]] = rnd.Rand8Extremes();
+        } else {
+          coeff[scan[j]] = 0;
+        }
+        if (bit_depth_ == VPX_BITS_8) {
+          dst[j] = 0;
+          ref[j] = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          dst16[j] = 0;
+          ref16[j] = 0;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        }
+      }
+      if (bit_depth_ == VPX_BITS_8) {
+        RunRefTxfm(coeff, ref, pitch_);
+        RunInvTxfm(coeff, dst, pitch_);
+      } else {
+#if CONFIG_VP9_HIGHBITDEPTH
+        RunRefTxfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_);
+        ASM_REGISTER_STATE_CHECK(
+            RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
+
+      for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t diff =
+            bit_depth_ == VPX_BITS_8 ? dst[j] - ref[j] : dst16[j] - ref16[j];
+#else
+        const uint32_t diff = dst[j] - ref[j];
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t error = diff * diff;
+        EXPECT_EQ(0u, error) << "Error: 32x32 IDCT Comparison has error "
+                             << error << " at index " << j;
+      }
+    }
+  }
+};
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(InvTrans32x32Test);
+
+TEST_P(InvTrans32x32Test, DISABLED_Speed) { RunInvTrans32x32SpeedTest(); }
+TEST_P(InvTrans32x32Test, CompareReference) { CompareInvReference32x32(); }
+
 using std::make_tuple;
 
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -334,6 +507,14 @@ INSTANTIATE_TEST_SUITE_P(
                                  VPX_BITS_8),
                       make_tuple(&vpx_fdct32x32_rd_c, &vpx_idct32x32_1024_add_c,
                                  1, VPX_BITS_8)));
+
+INSTANTIATE_TEST_SUITE_P(
+    C, InvTrans32x32Test,
+    ::testing::Values(
+        (make_tuple(&vpx_idct32x32_1024_add_c, &vpx_idct32x32_1024_add_c, 0,
+                    VPX_BITS_8, 32, 6225)),
+        make_tuple(&vpx_idct32x32_135_add_c, &vpx_idct32x32_135_add_c, 0,
+                   VPX_BITS_8, 16, 6255)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #if HAVE_NEON && !CONFIG_EMULATE_HARDWARE
@@ -352,6 +533,14 @@ INSTANTIATE_TEST_SUITE_P(
                                  &vpx_idct32x32_1024_add_sse2, 0, VPX_BITS_8),
                       make_tuple(&vpx_fdct32x32_rd_sse2,
                                  &vpx_idct32x32_1024_add_sse2, 1, VPX_BITS_8)));
+
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, InvTrans32x32Test,
+    ::testing::Values(
+        (make_tuple(&vpx_idct32x32_1024_add_c, &vpx_idct32x32_1024_add_sse2, 0,
+                    VPX_BITS_8, 32, 6225)),
+        make_tuple(&vpx_idct32x32_135_add_c, &vpx_idct32x32_135_add_sse2, 0,
+                   VPX_BITS_8, 16, 6225)));
 #endif  // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
 #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
@@ -377,6 +566,14 @@ INSTANTIATE_TEST_SUITE_P(
                                  &vpx_idct32x32_1024_add_sse2, 0, VPX_BITS_8),
                       make_tuple(&vpx_fdct32x32_rd_avx2,
                                  &vpx_idct32x32_1024_add_sse2, 1, VPX_BITS_8)));
+
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, InvTrans32x32Test,
+    ::testing::Values(
+        (make_tuple(&vpx_idct32x32_1024_add_c, &vpx_idct32x32_1024_add_avx2, 0,
+                    VPX_BITS_8, 32, 6225)),
+        make_tuple(&vpx_idct32x32_135_add_c, &vpx_idct32x32_135_add_avx2, 0,
+                   VPX_BITS_8, 16, 6225)));
 #endif  // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
 #if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
@@ -396,4 +593,13 @@ INSTANTIATE_TEST_SUITE_P(
                       make_tuple(&vpx_fdct32x32_rd_vsx,
                                  &vpx_idct32x32_1024_add_vsx, 1, VPX_BITS_8)));
 #endif  // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_SUITE_P(
+    LSX, Trans32x32Test,
+    ::testing::Values(make_tuple(&vpx_fdct32x32_lsx,
+                                 &vpx_idct32x32_1024_add_lsx, 0, VPX_BITS_8),
+                      make_tuple(&vpx_fdct32x32_rd_lsx,
+                                 &vpx_idct32x32_1024_add_lsx, 1, VPX_BITS_8)));
+#endif  // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 }  // namespace
diff --git a/test/dct_partial_test.cc b/test/dct_partial_test.cc
index 2fd176f25a..ec6f543f71 100644
--- a/test/dct_partial_test.cc
+++ b/test/dct_partial_test.cc
@@ -39,7 +39,7 @@ typedef tuple<PartialFdctFunc, int /* size */, vpx_bit_depth_t>
 
 tran_low_t partial_fdct_ref(const Buffer<int16_t> &in, int size) {
   int64_t sum = 0;
-  if (in.TopLeftPixel() != NULL) {
+  if (in.TopLeftPixel() != nullptr) {
     for (int y = 0; y < size; ++y) {
       for (int x = 0; x < size; ++x) {
         sum += in.TopLeftPixel()[y * in.stride() + x];
@@ -67,7 +67,7 @@ class PartialFdctTest : public ::testing::TestWithParam<PartialFdctParam> {
     bit_depth_ = GET_PARAM(2);
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
   void RunTest() {
@@ -81,7 +81,7 @@ class PartialFdctTest : public ::testing::TestWithParam<PartialFdctParam> {
     Buffer<tran_low_t> output_block = Buffer<tran_low_t>(size_, size_, 0, 16);
     ASSERT_TRUE(output_block.Init());
 
-    if (output_block.TopLeftPixel() != NULL) {
+    if (output_block.TopLeftPixel() != nullptr) {
       for (int i = 0; i < 100; ++i) {
         if (i == 0) {
           input_block.Set(maxvalue);
@@ -145,11 +145,17 @@ INSTANTIATE_TEST_SUITE_P(
 #if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_SUITE_P(
     NEON, PartialFdctTest,
-    ::testing::Values(make_tuple(&vpx_fdct32x32_1_neon, 32, VPX_BITS_8),
-                      make_tuple(&vpx_fdct16x16_1_neon, 16, VPX_BITS_8),
+    ::testing::Values(make_tuple(&vpx_highbd_fdct32x32_1_neon, 32, VPX_BITS_12),
+                      make_tuple(&vpx_highbd_fdct32x32_1_neon, 32, VPX_BITS_10),
+                      make_tuple(&vpx_highbd_fdct32x32_1_neon, 32, VPX_BITS_8),
+                      make_tuple(&vpx_highbd_fdct16x16_1_neon, 16, VPX_BITS_12),
+                      make_tuple(&vpx_highbd_fdct16x16_1_neon, 16, VPX_BITS_10),
+                      make_tuple(&vpx_highbd_fdct16x16_1_neon, 16, VPX_BITS_8),
                       make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_12),
                       make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_10),
                       make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_8),
+                      make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_12),
+                      make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_10),
                       make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_8)));
 #else
 INSTANTIATE_TEST_SUITE_P(
diff --git a/test/dct_test.cc b/test/dct_test.cc
index a1b766e103..c3d3081c42 100644
--- a/test/dct_test.cc
+++ b/test/dct_test.cc
@@ -134,7 +134,7 @@ void fwht_ref(const Buffer<int16_t> &in, Buffer<tran_low_t> *out, int size,
 
 class TransTestBase : public ::testing::TestWithParam<DctParam> {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     rnd_.Reset(ACMRandom::DeterministicSeed());
     const int idx = GET_PARAM(0);
     const FuncInfo *func_info = &(GET_PARAM(1)[idx]);
@@ -160,17 +160,17 @@ class TransTestBase : public ::testing::TestWithParam<DctParam> {
 
     src_ = reinterpret_cast<uint8_t *>(
         vpx_memalign(16, pixel_size_ * block_size_));
-    ASSERT_TRUE(src_ != NULL);
+    ASSERT_NE(src_, nullptr);
     dst_ = reinterpret_cast<uint8_t *>(
         vpx_memalign(16, pixel_size_ * block_size_));
-    ASSERT_TRUE(dst_ != NULL);
+    ASSERT_NE(dst_, nullptr);
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     vpx_free(src_);
-    src_ = NULL;
+    src_ = nullptr;
     vpx_free(dst_);
-    dst_ = NULL;
+    dst_ = nullptr;
     libvpx_test::ClearSystemState();
   }
 
@@ -211,7 +211,7 @@ class TransTestBase : public ::testing::TestWithParam<DctParam> {
     Buffer<int16_t> test_input_block =
         Buffer<int16_t>(size_, size_, 8, size_ == 4 ? 0 : 16);
     ASSERT_TRUE(test_input_block.Init());
-    ASSERT_TRUE(test_input_block.TopLeftPixel() != NULL);
+    ASSERT_NE(test_input_block.TopLeftPixel(), nullptr);
     Buffer<tran_low_t> test_temp_block =
         Buffer<tran_low_t>(size_, size_, 0, 16);
     ASSERT_TRUE(test_temp_block.Init());
@@ -316,7 +316,7 @@ class TransTestBase : public ::testing::TestWithParam<DctParam> {
       } else if (i == 1) {
         input_extreme_block.Set(-max_pixel_value_);
       } else {
-        ASSERT_TRUE(input_extreme_block.TopLeftPixel() != NULL);
+        ASSERT_NE(input_extreme_block.TopLeftPixel(), nullptr);
         for (int h = 0; h < size_; ++h) {
           for (int w = 0; w < size_; ++w) {
             input_extreme_block
@@ -331,7 +331,7 @@ class TransTestBase : public ::testing::TestWithParam<DctParam> {
 
       // The minimum quant value is 4.
       EXPECT_TRUE(output_block.CheckValues(output_ref_block));
-      ASSERT_TRUE(output_block.TopLeftPixel() != NULL);
+      ASSERT_NE(output_block.TopLeftPixel(), nullptr);
       for (int h = 0; h < size_; ++h) {
         for (int w = 0; w < size_; ++w) {
           EXPECT_GE(
@@ -358,18 +358,10 @@ class TransTestBase : public ::testing::TestWithParam<DctParam> {
     ASSERT_TRUE(in.Init());
     Buffer<tran_low_t> coeff = Buffer<tran_low_t>(size_, size_, 0, 16);
     ASSERT_TRUE(coeff.Init());
-    Buffer<uint8_t> dst = Buffer<uint8_t>(size_, size_, 0, 16);
-    ASSERT_TRUE(dst.Init());
-    Buffer<uint8_t> src = Buffer<uint8_t>(size_, size_, 0);
-    ASSERT_TRUE(src.Init());
-    Buffer<uint16_t> dst16 = Buffer<uint16_t>(size_, size_, 0, 16);
-    ASSERT_TRUE(dst16.Init());
-    Buffer<uint16_t> src16 = Buffer<uint16_t>(size_, size_, 0);
-    ASSERT_TRUE(src16.Init());
 
     for (int i = 0; i < count_test_block; ++i) {
       InitMem();
-      ASSERT_TRUE(in.TopLeftPixel() != NULL);
+      ASSERT_NE(in.TopLeftPixel(), nullptr);
       // Initialize a test block with input range [-max_pixel_value_,
       // max_pixel_value_].
       for (int h = 0; h < size_; ++h) {
@@ -539,6 +531,18 @@ INSTANTIATE_TEST_SUITE_P(AVX2, TransDCT,
 #endif  // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH
 
 #if HAVE_NEON
+#if CONFIG_VP9_HIGHBITDEPTH
+static const FuncInfo dct_neon_func_info[] = {
+  { &fdct_wrapper<vpx_highbd_fdct4x4_neon>,
+    &highbd_idct_wrapper<vpx_highbd_idct4x4_16_add_neon>, 4, 2 },
+  { &fdct_wrapper<vpx_highbd_fdct8x8_neon>,
+    &highbd_idct_wrapper<vpx_highbd_idct8x8_64_add_neon>, 8, 2 },
+  { &fdct_wrapper<vpx_highbd_fdct16x16_neon>,
+    &highbd_idct_wrapper<vpx_highbd_idct16x16_256_add_neon>, 16, 2 },
+  /* { &fdct_wrapper<vpx_highbd_fdct32x32_neon>,
+       &highbd_idct_wrapper<vpx_highbd_idct32x32_1024_add_neon>, 32, 2 },*/
+};
+#else
 static const FuncInfo dct_neon_func_info[4] = {
   { &fdct_wrapper<vpx_fdct4x4_neon>, &idct_wrapper<vpx_idct4x4_16_add_neon>, 4,
     1 },
@@ -549,12 +553,15 @@ static const FuncInfo dct_neon_func_info[4] = {
   { &fdct_wrapper<vpx_fdct32x32_neon>,
     &idct_wrapper<vpx_idct32x32_1024_add_neon>, 32, 1 }
 };
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 INSTANTIATE_TEST_SUITE_P(
     NEON, TransDCT,
-    ::testing::Combine(::testing::Range(0, 4),
-                       ::testing::Values(dct_neon_func_info),
-                       ::testing::Values(0), ::testing::Values(VPX_BITS_8)));
+    ::testing::Combine(
+        ::testing::Range(0, static_cast<int>(sizeof(dct_neon_func_info) /
+                                             sizeof(dct_neon_func_info[0]))),
+        ::testing::Values(dct_neon_func_info), ::testing::Values(0),
+        ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12)));
 #endif  // HAVE_NEON
 
 #if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH
@@ -586,6 +593,23 @@ INSTANTIATE_TEST_SUITE_P(VSX, TransDCT,
                                                       VPX_BITS_8)));
 #endif  // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH &&
 
+#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH
+static const FuncInfo dct_lsx_func_info[4] = {
+  { &fdct_wrapper<vpx_fdct4x4_lsx>, &idct_wrapper<vpx_idct4x4_16_add_c>, 4, 1 },
+  { &fdct_wrapper<vpx_fdct8x8_lsx>, &idct_wrapper<vpx_idct8x8_64_add_c>, 8, 1 },
+  { &fdct_wrapper<vpx_fdct16x16_lsx>, &idct_wrapper<vpx_idct16x16_256_add_c>,
+    16, 1 },
+  { &fdct_wrapper<vpx_fdct32x32_lsx>, &idct_wrapper<vpx_idct32x32_1024_add_lsx>,
+    32, 1 }
+};
+
+INSTANTIATE_TEST_SUITE_P(
+    LSX, TransDCT,
+    ::testing::Combine(::testing::Range(0, 4),
+                       ::testing::Values(dct_lsx_func_info),
+                       ::testing::Values(0), ::testing::Values(VPX_BITS_8)));
+#endif  // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH
+
 #endif  // !CONFIG_EMULATE_HARDWARE
 
 /* -------------------------------------------------------------------------- */
@@ -635,14 +659,23 @@ static const FuncInfo ht_neon_func_info[] = {
 #if CONFIG_VP9_HIGHBITDEPTH
   { &vp9_highbd_fht4x4_c, &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4,
     2 },
+  { &vp9_highbd_fht4x4_neon, &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>,
+    4, 2 },
   { &vp9_highbd_fht8x8_c, &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_neon>, 8,
     2 },
+  { &vp9_highbd_fht8x8_neon, &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_neon>,
+    8, 2 },
   { &vp9_highbd_fht16x16_c,
     &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_neon>, 16, 2 },
+  { &vp9_highbd_fht16x16_neon,
+    &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_neon>, 16, 2 },
 #endif
   { &vp9_fht4x4_c, &iht_wrapper<vp9_iht4x4_16_add_neon>, 4, 1 },
+  { &vp9_fht4x4_neon, &iht_wrapper<vp9_iht4x4_16_add_neon>, 4, 1 },
   { &vp9_fht8x8_c, &iht_wrapper<vp9_iht8x8_64_add_neon>, 8, 1 },
-  { &vp9_fht16x16_c, &iht_wrapper<vp9_iht16x16_256_add_neon>, 16, 1 }
+  { &vp9_fht8x8_neon, &iht_wrapper<vp9_iht8x8_64_add_neon>, 8, 1 },
+  { &vp9_fht16x16_c, &iht_wrapper<vp9_iht16x16_256_add_neon>, 16, 1 },
+  { &vp9_fht16x16_neon, &iht_wrapper<vp9_iht16x16_256_add_neon>, 16, 1 }
 };
 
 INSTANTIATE_TEST_SUITE_P(
@@ -753,4 +786,5 @@ INSTANTIATE_TEST_SUITE_P(VSX, TransWHT,
                          ::testing::Values(make_tuple(0, &wht_vsx_func_info, 0,
                                                       VPX_BITS_8)));
 #endif  // HAVE_VSX && !CONFIG_EMULATE_HARDWARE
+
 }  // namespace
diff --git a/test/decode_api_test.cc b/test/decode_api_test.cc
index 9e82ace1b8..44e4397726 100644
--- a/test/decode_api_test.cc
+++ b/test/decode_api_test.cc
@@ -20,7 +20,7 @@ namespace {
 #define NELEMENTS(x) static_cast<int>(sizeof(x) / sizeof(x[0]))
 
 TEST(DecodeAPI, InvalidParams) {
-  static const vpx_codec_iface_t *kCodecs[] = {
+  static vpx_codec_iface_t *kCodecs[] = {
 #if CONFIG_VP8_DECODER
     &vpx_codec_vp8_dx_algo,
 #endif
@@ -120,7 +120,7 @@ void TestVp9Controls(vpx_codec_ctx_t *dec) {
 }
 
 TEST(DecodeAPI, Vp9InvalidDecode) {
-  const vpx_codec_iface_t *const codec = &vpx_codec_vp9_dx_algo;
+  vpx_codec_iface_t *const codec = &vpx_codec_vp9_dx_algo;
   const char filename[] =
       "invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf";
   libvpx_test::IVFVideoSource video(filename);
@@ -147,7 +147,7 @@ TEST(DecodeAPI, Vp9InvalidDecode) {
 
 void TestPeekInfo(const uint8_t *const data, uint32_t data_sz,
                   uint32_t peek_size) {
-  const vpx_codec_iface_t *const codec = &vpx_codec_vp9_dx_algo;
+  vpx_codec_iface_t *const codec = &vpx_codec_vp9_dx_algo;
   // Verify behavior of vpx_codec_decode. vpx_codec_decode doesn't even get
   // to decoder_peek_si_internal on frames of size < 8.
   if (data_sz >= 8) {
diff --git a/test/decode_corrupted.cc b/test/decode_corrupted.cc
index 31e1da69cc..58773d7b86 100644
--- a/test/decode_corrupted.cc
+++ b/test/decode_corrupted.cc
@@ -28,9 +28,9 @@ class DecodeCorruptedFrameTest
   DecodeCorruptedFrameTest() : EncoderTest(GET_PARAM(0)) {}
 
  protected:
-  virtual ~DecodeCorruptedFrameTest() {}
+  ~DecodeCorruptedFrameTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(::libvpx_test::kRealTime);
     cfg_.g_lag_in_frames = 0;
@@ -44,16 +44,16 @@ class DecodeCorruptedFrameTest
     dec_cfg_.threads = 1;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) encoder->Control(VP8E_SET_CPUUSED, 7);
   }
 
-  virtual void MismatchHook(const vpx_image_t * /*img1*/,
-                            const vpx_image_t * /*img2*/) {}
+  void MismatchHook(const vpx_image_t * /*img1*/,
+                    const vpx_image_t * /*img2*/) override {}
 
-  virtual const vpx_codec_cx_pkt_t *MutateEncoderOutputHook(
-      const vpx_codec_cx_pkt_t *pkt) {
+  const vpx_codec_cx_pkt_t *MutateEncoderOutputHook(
+      const vpx_codec_cx_pkt_t *pkt) override {
     // Don't edit frame packet on key frame.
     if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) return pkt;
     if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return pkt;
@@ -66,9 +66,9 @@ class DecodeCorruptedFrameTest
     return &modified_pkt_;
   }
 
-  virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
-                                  const libvpx_test::VideoSource & /*video*/,
-                                  libvpx_test::Decoder *decoder) {
+  bool HandleDecodeResult(const vpx_codec_err_t res_dec,
+                          const libvpx_test::VideoSource & /*video*/,
+                          libvpx_test::Decoder *decoder) override {
     EXPECT_NE(res_dec, VPX_CODEC_MEM_ERROR) << decoder->DecodeError();
     return VPX_CODEC_MEM_ERROR != res_dec;
   }
diff --git a/test/decode_perf_test.cc b/test/decode_perf_test.cc
index 34d53862a3..383fd2d896 100644
--- a/test/decode_perf_test.cc
+++ b/test/decode_perf_test.cc
@@ -87,7 +87,7 @@ TEST_P(DecodePerfTest, PerfTest) {
   vpx_usec_timer t;
   vpx_usec_timer_start(&t);
 
-  for (video.Begin(); video.cxdata() != NULL; video.Next()) {
+  for (video.Begin(); video.cxdata() != nullptr; video.Next()) {
     decoder.DecodeFrame(video.cxdata(), video.frame_size());
   }
 
@@ -116,11 +116,11 @@ class VP9NewEncodeDecodePerfTest
  protected:
   VP9NewEncodeDecodePerfTest()
       : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), speed_(0),
-        outfile_(0), out_frames_(0) {}
+        outfile_(nullptr), out_frames_(0) {}
 
-  virtual ~VP9NewEncodeDecodePerfTest() {}
+  ~VP9NewEncodeDecodePerfTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(encoding_mode_);
 
@@ -137,8 +137,8 @@ class VP9NewEncodeDecodePerfTest
     cfg_.rc_end_usage = VPX_VBR;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, speed_);
       encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1);
@@ -146,24 +146,24 @@ class VP9NewEncodeDecodePerfTest
     }
   }
 
-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
     const std::string data_path = getenv("LIBVPX_TEST_DATA_PATH");
     const std::string path_to_source = data_path + "/" + kNewEncodeOutputFile;
     outfile_ = fopen(path_to_source.c_str(), "wb");
-    ASSERT_TRUE(outfile_ != NULL);
+    ASSERT_NE(outfile_, nullptr);
   }
 
-  virtual void EndPassHook() {
-    if (outfile_ != NULL) {
+  void EndPassHook() override {
+    if (outfile_ != nullptr) {
       if (!fseek(outfile_, 0, SEEK_SET)) {
         ivf_write_file_header(outfile_, &cfg_, VP9_FOURCC, out_frames_);
       }
       fclose(outfile_);
-      outfile_ = NULL;
+      outfile_ = nullptr;
     }
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     ++out_frames_;
 
     // Write initial file header if first frame.
@@ -177,7 +177,7 @@ class VP9NewEncodeDecodePerfTest
               pkt->data.frame.sz);
   }
 
-  virtual bool DoDecode() const { return false; }
+  bool DoDecode() const override { return false; }
 
   void set_speed(unsigned int speed) { speed_ = speed; }
 
@@ -236,7 +236,7 @@ TEST_P(VP9NewEncodeDecodePerfTest, PerfTest) {
   vpx_usec_timer t;
   vpx_usec_timer_start(&t);
 
-  for (decode_video.Begin(); decode_video.cxdata() != NULL;
+  for (decode_video.Begin(); decode_video.cxdata() != nullptr;
        decode_video.Next()) {
     decoder.DecodeFrame(decode_video.cxdata(), decode_video.frame_size());
   }
diff --git a/test/decode_svc_test.cc b/test/decode_svc_test.cc
index be0f32545e..7098e7b270 100644
--- a/test/decode_svc_test.cc
+++ b/test/decode_svc_test.cc
@@ -25,17 +25,16 @@ class DecodeSvcTest : public ::libvpx_test::DecoderTest,
                       public ::libvpx_test::CodecTestWithParam<const char *> {
  protected:
   DecodeSvcTest() : DecoderTest(GET_PARAM(::libvpx_test::kCodecFactoryParam)) {}
-  virtual ~DecodeSvcTest() {}
+  ~DecodeSvcTest() override = default;
 
-  virtual void PreDecodeFrameHook(
-      const libvpx_test::CompressedVideoSource &video,
-      libvpx_test::Decoder *decoder) {
+  void PreDecodeFrameHook(const libvpx_test::CompressedVideoSource &video,
+                          libvpx_test::Decoder *decoder) override {
     if (video.frame_number() == 0)
       decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER, spatial_layer_);
   }
 
-  virtual void DecompressedFrameHook(const vpx_image_t &img,
-                                     const unsigned int frame_number) {
+  void DecompressedFrameHook(const vpx_image_t &img,
+                             const unsigned int frame_number) override {
     ASSERT_EQ(img.d_w, width_);
     ASSERT_EQ(img.d_h, height_);
     total_frames_ = frame_number;
@@ -56,7 +55,7 @@ TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer0) {
   const std::string filename = GET_PARAM(1);
   std::unique_ptr<libvpx_test::CompressedVideoSource> video;
   video.reset(new libvpx_test::IVFVideoSource(filename));
-  ASSERT_TRUE(video.get() != NULL);
+  ASSERT_NE(video.get(), nullptr);
   video->Init();
   total_frames_ = 0;
   spatial_layer_ = 0;
@@ -73,7 +72,7 @@ TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer1) {
   const std::string filename = GET_PARAM(1);
   std::unique_ptr<libvpx_test::CompressedVideoSource> video;
   video.reset(new libvpx_test::IVFVideoSource(filename));
-  ASSERT_TRUE(video.get() != NULL);
+  ASSERT_NE(video.get(), nullptr);
   video->Init();
   total_frames_ = 0;
   spatial_layer_ = 1;
@@ -90,7 +89,7 @@ TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer2) {
   const std::string filename = GET_PARAM(1);
   std::unique_ptr<libvpx_test::CompressedVideoSource> video;
   video.reset(new libvpx_test::IVFVideoSource(filename));
-  ASSERT_TRUE(video.get() != NULL);
+  ASSERT_NE(video.get(), nullptr);
   video->Init();
   total_frames_ = 0;
   spatial_layer_ = 2;
@@ -108,7 +107,7 @@ TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer10) {
   const std::string filename = GET_PARAM(1);
   std::unique_ptr<libvpx_test::CompressedVideoSource> video;
   video.reset(new libvpx_test::IVFVideoSource(filename));
-  ASSERT_TRUE(video.get() != NULL);
+  ASSERT_NE(video.get(), nullptr);
   video->Init();
   total_frames_ = 0;
   spatial_layer_ = 10;
diff --git a/test/decode_test_driver.cc b/test/decode_test_driver.cc
index ae23587759..773d673d37 100644
--- a/test/decode_test_driver.cc
+++ b/test/decode_test_driver.cc
@@ -26,7 +26,7 @@ vpx_codec_err_t Decoder::PeekStream(const uint8_t *cxdata, size_t size,
 }
 
 vpx_codec_err_t Decoder::DecodeFrame(const uint8_t *cxdata, size_t size) {
-  return DecodeFrame(cxdata, size, NULL);
+  return DecodeFrame(cxdata, size, nullptr);
 }
 
 vpx_codec_err_t Decoder::DecodeFrame(const uint8_t *cxdata, size_t size,
@@ -67,7 +67,7 @@ void DecoderTest::HandlePeekResult(Decoder *const decoder,
 void DecoderTest::RunLoop(CompressedVideoSource *video,
                           const vpx_codec_dec_cfg_t &dec_cfg) {
   Decoder *const decoder = codec_->CreateDecoder(dec_cfg, flags_);
-  ASSERT_TRUE(decoder != NULL);
+  ASSERT_NE(decoder, nullptr);
   bool end_of_file = false;
 
   // Decode frames.
@@ -78,7 +78,7 @@ void DecoderTest::RunLoop(CompressedVideoSource *video,
     vpx_codec_stream_info_t stream_info;
     stream_info.sz = sizeof(stream_info);
 
-    if (video->cxdata() != NULL) {
+    if (video->cxdata() != nullptr) {
       const vpx_codec_err_t res_peek = decoder->PeekStream(
           video->cxdata(), video->frame_size(), &stream_info);
       HandlePeekResult(decoder, video, res_peek);
@@ -89,13 +89,13 @@ void DecoderTest::RunLoop(CompressedVideoSource *video,
       if (!HandleDecodeResult(res_dec, *video, decoder)) break;
     } else {
       // Signal end of the file to the decoder.
-      const vpx_codec_err_t res_dec = decoder->DecodeFrame(NULL, 0);
+      const vpx_codec_err_t res_dec = decoder->DecodeFrame(nullptr, 0);
       ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
       end_of_file = true;
     }
 
     DxDataIterator dec_iter = decoder->GetDxData();
-    const vpx_image_t *img = NULL;
+    const vpx_image_t *img = nullptr;
 
     // Get decompressed data
     while (!::testing::Test::HasFailure() && (img = dec_iter.Next())) {
diff --git a/test/decode_test_driver.h b/test/decode_test_driver.h
index 04876cdd7c..f446ab4664 100644
--- a/test/decode_test_driver.h
+++ b/test/decode_test_driver.h
@@ -24,7 +24,7 @@ class CompressedVideoSource;
 class DxDataIterator {
  public:
   explicit DxDataIterator(vpx_codec_ctx_t *decoder)
-      : decoder_(decoder), iter_(NULL) {}
+      : decoder_(decoder), iter_(nullptr) {}
 
   const vpx_image_t *Next() { return vpx_codec_get_frame(decoder_, &iter_); }
 
diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc
index 87e29b61db..508083673a 100644
--- a/test/encode_api_test.cc
+++ b/test/encode_api_test.cc
@@ -8,25 +8,43 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <climits>
+#include <cstring>
+#include <initializer_list>
+#include <new>
+
 #include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/video_source.h"
 
 #include "./vpx_config.h"
 #include "vpx/vp8cx.h"
+#include "vpx/vpx_codec.h"
 #include "vpx/vpx_encoder.h"
+#include "vpx/vpx_image.h"
+#include "vpx/vpx_tpl.h"
 
 namespace {
 
-#define NELEMENTS(x) static_cast<int>(sizeof(x) / sizeof(x[0]))
-
-TEST(EncodeAPI, InvalidParams) {
-  static const vpx_codec_iface_t *kCodecs[] = {
+vpx_codec_iface_t *kCodecIfaces[] = {
 #if CONFIG_VP8_ENCODER
-    &vpx_codec_vp8_cx_algo,
+  &vpx_codec_vp8_cx_algo,
 #endif
 #if CONFIG_VP9_ENCODER
-    &vpx_codec_vp9_cx_algo,
+  &vpx_codec_vp9_cx_algo,
 #endif
-  };
+};
+
+bool IsVP9(vpx_codec_iface_t *iface) {
+  static const char kVP9Name[] = "WebM Project VP9";
+  return strncmp(kVP9Name, vpx_codec_iface_name(iface), sizeof(kVP9Name) - 1) ==
+         0;
+}
+
+TEST(EncodeAPI, InvalidParams) {
   uint8_t buf[1] = { 0 };
   vpx_image_t img;
   vpx_codec_ctx_t enc;
@@ -34,29 +52,33 @@ TEST(EncodeAPI, InvalidParams) {
 
   EXPECT_EQ(&img, vpx_img_wrap(&img, VPX_IMG_FMT_I420, 1, 1, 1, buf));
 
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_enc_init(NULL, NULL, NULL, 0));
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_enc_init(&enc, NULL, NULL, 0));
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_encode(NULL, NULL, 0, 0, 0, 0));
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_encode(NULL, &img, 0, 0, 0, 0));
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_destroy(NULL));
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
-            vpx_codec_enc_config_default(NULL, NULL, 0));
+            vpx_codec_enc_init(nullptr, nullptr, nullptr, 0));
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
+            vpx_codec_enc_init(&enc, nullptr, nullptr, 0));
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
+            vpx_codec_encode(nullptr, nullptr, 0, 0, 0, 0));
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
+            vpx_codec_encode(nullptr, &img, 0, 0, 0, 0));
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_destroy(nullptr));
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
-            vpx_codec_enc_config_default(NULL, &cfg, 0));
-  EXPECT_TRUE(vpx_codec_error(NULL) != NULL);
+            vpx_codec_enc_config_default(nullptr, nullptr, 0));
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
+            vpx_codec_enc_config_default(nullptr, &cfg, 0));
+  EXPECT_NE(vpx_codec_error(nullptr), nullptr);
 
-  for (int i = 0; i < NELEMENTS(kCodecs); ++i) {
-    SCOPED_TRACE(vpx_codec_iface_name(kCodecs[i]));
+  for (const auto *iface : kCodecIfaces) {
+    SCOPED_TRACE(vpx_codec_iface_name(iface));
     EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
-              vpx_codec_enc_init(NULL, kCodecs[i], NULL, 0));
+              vpx_codec_enc_init(nullptr, iface, nullptr, 0));
     EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
-              vpx_codec_enc_init(&enc, kCodecs[i], NULL, 0));
+              vpx_codec_enc_init(&enc, iface, nullptr, 0));
     EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
-              vpx_codec_enc_config_default(kCodecs[i], &cfg, 1));
+              vpx_codec_enc_config_default(iface, &cfg, 1));
 
-    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_config_default(kCodecs[i], &cfg, 0));
-    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_init(&enc, kCodecs[i], &cfg, 0));
-    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_encode(&enc, NULL, 0, 0, 0, 0));
+    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_config_default(iface, &cfg, 0));
+    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_init(&enc, iface, &cfg, 0));
+    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_encode(&enc, nullptr, 0, 0, 0, 0));
 
     EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&enc));
   }
@@ -104,6 +126,93 @@ TEST(EncodeAPI, ImageSizeSetting) {
 
   vpx_codec_destroy(&enc);
 }
+
+// Verifies the fix for a float-cast-overflow in vp8_change_config().
+//
+// Causes cpi->framerate to become the largest possible value (10,000,000) in
+// VP8 by setting cfg.g_timebase to 1/10000000 and passing a duration of 1 to
+// vpx_codec_encode().
+TEST(EncodeAPI, HugeFramerateVp8) {
+  vpx_codec_iface_t *const iface = vpx_codec_vp8_cx();
+  vpx_codec_enc_cfg_t cfg;
+  ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+  cfg.g_w = 271;
+  cfg.g_h = 1080;
+  cfg.g_timebase.num = 1;
+  // Largest value (VP8's TICKS_PER_SEC) such that frame duration is nonzero (1
+  // tick).
+  cfg.g_timebase.den = 10000000;
+  cfg.g_pass = VPX_RC_ONE_PASS;
+  cfg.g_lag_in_frames = 0;
+  cfg.rc_end_usage = VPX_CBR;
+
+  vpx_codec_ctx_t enc;
+  // Before we encode the first frame, cpi->framerate is set to a guess (the
+  // reciprocal of cfg.g_timebase). If this guess doesn't seem reasonable
+  // (> 180), cpi->framerate is set to 30.
+  ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK);
+
+  ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, -12), VPX_CODEC_OK);
+
+  vpx_image_t *const image =
+      vpx_img_alloc(nullptr, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h, 1);
+  ASSERT_NE(image, nullptr);
+
+  for (unsigned int i = 0; i < image->d_h; ++i) {
+    memset(image->planes[0] + i * image->stride[0], 128, image->d_w);
+  }
+  const unsigned int uv_h = (image->d_h + 1) / 2;
+  const unsigned int uv_w = (image->d_w + 1) / 2;
+  for (unsigned int i = 0; i < uv_h; ++i) {
+    memset(image->planes[1] + i * image->stride[1], 128, uv_w);
+    memset(image->planes[2] + i * image->stride[2], 128, uv_w);
+  }
+
+  // Encode a frame.
+  // Up to this point cpi->framerate is 30. Now pass a duration of only 1. This
+  // causes cpi->framerate to become 10,000,000.
+  ASSERT_EQ(vpx_codec_encode(&enc, image, 0, 1, 0, VPX_DL_REALTIME),
+            VPX_CODEC_OK);
+
+  // Change to the same config. Since cpi->framerate is now huge, when it is
+  // used to calculate raw_target_rate (bit rate of uncompressed frames), the
+  // result is likely to overflow an unsigned int.
+  ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK);
+
+  vpx_img_free(image);
+  ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+}
+
+// A test that reproduces https://crbug.com/webm/1831.
+TEST(EncodeAPI, RandomPixelsVp8) {
+  // Initialize libvpx encoder
+  vpx_codec_iface_t *const iface = vpx_codec_vp8_cx();
+  vpx_codec_enc_cfg_t cfg;
+  ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+
+  cfg.rc_target_bitrate = 2000;
+  cfg.g_w = 1280;
+  cfg.g_h = 720;
+
+  vpx_codec_ctx_t enc;
+  ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK);
+
+  // Generate random frame data and encode
+  uint8_t img[1280 * 720 * 3 / 2];
+  libvpx_test::ACMRandom rng;
+  for (size_t i = 0; i < sizeof(img); ++i) {
+    img[i] = rng.Rand8();
+  }
+  vpx_image_t img_wrapper;
+  ASSERT_EQ(
+      vpx_img_wrap(&img_wrapper, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h, 1, img),
+      &img_wrapper);
+  ASSERT_EQ(vpx_codec_encode(&enc, &img_wrapper, 0, 1, 0, VPX_DL_BEST_QUALITY),
+            VPX_CODEC_OK);
+
+  // Destroy libvpx encoder
+  vpx_codec_destroy(&enc);
+}
 #endif
 
 // Set up 2 spatial streams with 2 temporal layers per stream, and generate
@@ -111,14 +220,6 @@ TEST(EncodeAPI, ImageSizeSetting) {
 // (ts_target_bitrate[]) to 0 for both layers. This should fail independent of
 // CONFIG_MULTI_RES_ENCODING.
 TEST(EncodeAPI, MultiResEncode) {
-  static const vpx_codec_iface_t *kCodecs[] = {
-#if CONFIG_VP8_ENCODER
-    &vpx_codec_vp8_cx_algo,
-#endif
-#if CONFIG_VP9_ENCODER
-    &vpx_codec_vp9_cx_algo,
-#endif
-  };
   const int width = 1280;
   const int height = 720;
   const int width_down = width / 2;
@@ -126,8 +227,7 @@ TEST(EncodeAPI, MultiResEncode) {
   const int target_bitrate = 1000;
   const int framerate = 30;
 
-  for (int c = 0; c < NELEMENTS(kCodecs); ++c) {
-    const vpx_codec_iface_t *const iface = kCodecs[c];
+  for (const auto *iface : kCodecIfaces) {
     vpx_codec_ctx_t enc[2];
     vpx_codec_enc_cfg_t cfg[2];
     vpx_rational_t dsf[2] = { { 2, 1 }, { 2, 1 } };
@@ -180,10 +280,7 @@ TEST(EncodeAPI, MultiResEncode) {
     }
 
     // VP9 should report incapable, VP8 invalid for all configurations.
-    const char kVP9Name[] = "WebM Project VP9";
-    const bool is_vp9 = strncmp(kVP9Name, vpx_codec_iface_name(iface),
-                                sizeof(kVP9Name) - 1) == 0;
-    EXPECT_EQ(is_vp9 ? VPX_CODEC_INCAPABLE : VPX_CODEC_INVALID_PARAM,
+    EXPECT_EQ(IsVP9(iface) ? VPX_CODEC_INCAPABLE : VPX_CODEC_INVALID_PARAM,
               vpx_codec_enc_init_multi(&enc[0], iface, &cfg[0], 2, 0, &dsf[0]));
 
     for (int i = 0; i < 2; i++) {
@@ -192,4 +289,661 @@ TEST(EncodeAPI, MultiResEncode) {
   }
 }
 
+TEST(EncodeAPI, SetRoi) {
+  static struct {
+    vpx_codec_iface_t *iface;
+    int ctrl_id;
+  } kCodecs[] = {
+#if CONFIG_VP8_ENCODER
+    { &vpx_codec_vp8_cx_algo, VP8E_SET_ROI_MAP },
+#endif
+#if CONFIG_VP9_ENCODER
+    { &vpx_codec_vp9_cx_algo, VP9E_SET_ROI_MAP },
+#endif
+  };
+  constexpr int kWidth = 64;
+  constexpr int kHeight = 64;
+
+  for (const auto &codec : kCodecs) {
+    SCOPED_TRACE(vpx_codec_iface_name(codec.iface));
+    vpx_codec_ctx_t enc;
+    vpx_codec_enc_cfg_t cfg;
+
+    EXPECT_EQ(vpx_codec_enc_config_default(codec.iface, &cfg, 0), VPX_CODEC_OK);
+    cfg.g_w = kWidth;
+    cfg.g_h = kHeight;
+    EXPECT_EQ(vpx_codec_enc_init(&enc, codec.iface, &cfg, 0), VPX_CODEC_OK);
+
+    vpx_roi_map_t roi = {};
+    uint8_t roi_map[kWidth * kHeight] = {};
+    if (IsVP9(codec.iface)) {
+      roi.rows = (cfg.g_w + 7) >> 3;
+      roi.cols = (cfg.g_h + 7) >> 3;
+    } else {
+      roi.rows = (cfg.g_w + 15) >> 4;
+      roi.cols = (cfg.g_h + 15) >> 4;
+    }
+    EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), VPX_CODEC_OK);
+
+    roi.roi_map = roi_map;
+    // VP8 only. This value isn't range checked.
+    roi.static_threshold[1] = 1000;
+    roi.static_threshold[2] = UINT_MAX / 2 + 1;
+    roi.static_threshold[3] = UINT_MAX;
+
+    for (const auto delta : { -63, -1, 0, 1, 63 }) {
+      for (int i = 0; i < 8; ++i) {
+        roi.delta_q[i] = delta;
+        roi.delta_lf[i] = delta;
+        // VP9 only.
+        roi.skip[i] ^= 1;
+        roi.ref_frame[i] = (roi.ref_frame[i] + 1) % 4;
+        EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), VPX_CODEC_OK);
+      }
+    }
+
+    vpx_codec_err_t expected_error;
+    for (const auto delta : { -64, 64, INT_MIN, INT_MAX }) {
+      expected_error = VPX_CODEC_INVALID_PARAM;
+      for (int i = 0; i < 8; ++i) {
+        roi.delta_q[i] = delta;
+        // The max segment count for VP8 is 4, the remainder of the entries are
+        // ignored.
+        if (i >= 4 && !IsVP9(codec.iface)) expected_error = VPX_CODEC_OK;
+
+        EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), expected_error)
+            << "delta_q[" << i << "]: " << delta;
+        roi.delta_q[i] = 0;
+
+        roi.delta_lf[i] = delta;
+        EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), expected_error)
+            << "delta_lf[" << i << "]: " << delta;
+        roi.delta_lf[i] = 0;
+      }
+    }
+
+    // VP8 should ignore skip[] and ref_frame[] values.
+    expected_error =
+        IsVP9(codec.iface) ? VPX_CODEC_INVALID_PARAM : VPX_CODEC_OK;
+    for (const auto skip : { -2, 2, INT_MIN, INT_MAX }) {
+      for (int i = 0; i < 8; ++i) {
+        roi.skip[i] = skip;
+        EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), expected_error)
+            << "skip[" << i << "]: " << skip;
+        roi.skip[i] = 0;
+      }
+    }
+
+    // VP9 allows negative values to be used to disable segmentation.
+    for (int ref_frame = -3; ref_frame < 0; ++ref_frame) {
+      for (int i = 0; i < 8; ++i) {
+        roi.ref_frame[i] = ref_frame;
+        EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), VPX_CODEC_OK)
+            << "ref_frame[" << i << "]: " << ref_frame;
+        roi.ref_frame[i] = 0;
+      }
+    }
+
+    for (const auto ref_frame : { 4, INT_MIN, INT_MAX }) {
+      for (int i = 0; i < 8; ++i) {
+        roi.ref_frame[i] = ref_frame;
+        EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), expected_error)
+            << "ref_frame[" << i << "]: " << ref_frame;
+        roi.ref_frame[i] = 0;
+      }
+    }
+
+    EXPECT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+  }
+}
+
+void InitCodec(vpx_codec_iface_t &iface, int width, int height,
+               vpx_codec_ctx_t *enc, vpx_codec_enc_cfg_t *cfg) {
+  cfg->g_w = width;
+  cfg->g_h = height;
+  cfg->g_lag_in_frames = 0;
+  cfg->g_pass = VPX_RC_ONE_PASS;
+  ASSERT_EQ(vpx_codec_enc_init(enc, &iface, cfg, 0), VPX_CODEC_OK);
+
+  ASSERT_EQ(vpx_codec_control_(enc, VP8E_SET_CPUUSED, 2), VPX_CODEC_OK);
+}
+
+// Encodes 1 frame of size |cfg.g_w| x |cfg.g_h| setting |enc|'s configuration
+// to |cfg|.
+void EncodeWithConfig(const vpx_codec_enc_cfg_t &cfg, vpx_codec_ctx_t *enc) {
+  libvpx_test::DummyVideoSource video;
+  video.SetSize(cfg.g_w, cfg.g_h);
+  video.Begin();
+  EXPECT_EQ(vpx_codec_enc_config_set(enc, &cfg), VPX_CODEC_OK)
+      << vpx_codec_error_detail(enc);
+
+  EXPECT_EQ(vpx_codec_encode(enc, video.img(), video.pts(), video.duration(),
+                             /*flags=*/0, VPX_DL_GOOD_QUALITY),
+            VPX_CODEC_OK)
+      << vpx_codec_error_detail(enc);
+}
+
+TEST(EncodeAPI, ConfigChangeThreadCount) {
+  constexpr int kWidth = 1920;
+  constexpr int kHeight = 1080;
+
+  for (const auto *iface : kCodecIfaces) {
+    SCOPED_TRACE(vpx_codec_iface_name(iface));
+    for (int i = 0; i < (IsVP9(iface) ? 2 : 1); ++i) {
+      vpx_codec_enc_cfg_t cfg = {};
+      struct Encoder {
+        ~Encoder() { EXPECT_EQ(vpx_codec_destroy(&ctx), VPX_CODEC_OK); }
+        vpx_codec_ctx_t ctx = {};
+      } enc;
+
+      ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+      EXPECT_NO_FATAL_FAILURE(
+          InitCodec(*iface, kWidth, kHeight, &enc.ctx, &cfg));
+      if (IsVP9(iface)) {
+        EXPECT_EQ(vpx_codec_control_(&enc.ctx, VP9E_SET_TILE_COLUMNS, 6),
+                  VPX_CODEC_OK);
+        EXPECT_EQ(vpx_codec_control_(&enc.ctx, VP9E_SET_ROW_MT, i),
+                  VPX_CODEC_OK);
+      }
+
+      for (const auto threads : { 1, 4, 8, 6, 2, 1 }) {
+        cfg.g_threads = threads;
+        EXPECT_NO_FATAL_FAILURE(EncodeWithConfig(cfg, &enc.ctx))
+            << "iteration: " << i << " threads: " << threads;
+      }
+    }
+  }
+}
+
+TEST(EncodeAPI, ConfigResizeChangeThreadCount) {
+  constexpr int kInitWidth = 1024;
+  constexpr int kInitHeight = 1024;
+
+  for (const auto *iface : kCodecIfaces) {
+    SCOPED_TRACE(vpx_codec_iface_name(iface));
+    for (int i = 0; i < (IsVP9(iface) ? 2 : 1); ++i) {
+      vpx_codec_enc_cfg_t cfg = {};
+      struct Encoder {
+        ~Encoder() { EXPECT_EQ(vpx_codec_destroy(&ctx), VPX_CODEC_OK); }
+        vpx_codec_ctx_t ctx = {};
+      } enc;
+
+      ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+      // Start in threaded mode to ensure resolution and thread related
+      // allocations are updated correctly across changes in resolution and
+      // thread counts. See https://crbug.com/1486441.
+      cfg.g_threads = 4;
+      EXPECT_NO_FATAL_FAILURE(
+          InitCodec(*iface, kInitWidth, kInitHeight, &enc.ctx, &cfg));
+      if (IsVP9(iface)) {
+        EXPECT_EQ(vpx_codec_control_(&enc.ctx, VP9E_SET_TILE_COLUMNS, 6),
+                  VPX_CODEC_OK);
+        EXPECT_EQ(vpx_codec_control_(&enc.ctx, VP9E_SET_ROW_MT, i),
+                  VPX_CODEC_OK);
+      }
+
+      cfg.g_w = 1000;
+      cfg.g_h = 608;
+      EXPECT_EQ(vpx_codec_enc_config_set(&enc.ctx, &cfg), VPX_CODEC_OK)
+          << vpx_codec_error_detail(&enc.ctx);
+
+      cfg.g_w = 1000;
+      cfg.g_h = 720;
+
+      for (const auto threads : { 1, 4, 8, 6, 2, 1 }) {
+        cfg.g_threads = threads;
+        EXPECT_NO_FATAL_FAILURE(EncodeWithConfig(cfg, &enc.ctx))
+            << "iteration: " << i << " threads: " << threads;
+      }
+    }
+  }
+}
+
+#if CONFIG_VP9_ENCODER
+// Frame size needed to trigger the overflow exceeds the max buffer allowed on
+// 32-bit systems defined by VPX_MAX_ALLOCABLE_MEMORY
+#if VPX_ARCH_X86_64 || VPX_ARCH_AARCH64
+TEST(EncodeAPI, ConfigLargeTargetBitrateVp9) {
+  constexpr int kWidth = 12383;
+  constexpr int kHeight = 8192;
+  constexpr auto *iface = &vpx_codec_vp9_cx_algo;
+  SCOPED_TRACE(vpx_codec_iface_name(iface));
+  vpx_codec_enc_cfg_t cfg = {};
+  struct Encoder {
+    ~Encoder() { EXPECT_EQ(vpx_codec_destroy(&ctx), VPX_CODEC_OK); }
+    vpx_codec_ctx_t ctx = {};
+  } enc;
+
+  ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+  // The following setting will cause avg_frame_bandwidth in rate control to be
+  // larger than INT_MAX
+  cfg.rc_target_bitrate = INT_MAX;
+  // Framerate 0.1 (equivalent to timebase 10) is the smallest framerate allowed
+  // by libvpx
+  cfg.g_timebase.den = 1;
+  cfg.g_timebase.num = 10;
+  EXPECT_NO_FATAL_FAILURE(InitCodec(*iface, kWidth, kHeight, &enc.ctx, &cfg))
+      << "target bitrate: " << cfg.rc_target_bitrate << " framerate: "
+      << static_cast<double>(cfg.g_timebase.den) / cfg.g_timebase.num;
+}
+#endif  // VPX_ARCH_X86_64 || VPX_ARCH_AARCH64
+
+vpx_image_t *CreateImage(const unsigned int width, const unsigned int height) {
+  vpx_image_t *image =
+      vpx_img_alloc(nullptr, VPX_IMG_FMT_I420, width, height, 1);
+  if (!image) return image;
+
+  for (unsigned int i = 0; i < image->d_h; ++i) {
+    memset(image->planes[0] + i * image->stride[0], 128, image->d_w);
+  }
+  const unsigned int uv_h = (image->d_h + 1) / 2;
+  const unsigned int uv_w = (image->d_w + 1) / 2;
+  for (unsigned int i = 0; i < uv_h; ++i) {
+    memset(image->planes[1] + i * image->stride[1], 128, uv_w);
+    memset(image->planes[2] + i * image->stride[2], 128, uv_w);
+  }
+
+  return image;
+}
+
+// Emulates the WebCodecs VideoEncoder interface.
+class VP9Encoder {
+ public:
+  explicit VP9Encoder(int speed) : speed_(speed) {}
+  ~VP9Encoder();
+
+  void Configure(unsigned int threads, unsigned int width, unsigned int height,
+                 vpx_rc_mode end_usage, vpx_enc_deadline_t deadline);
+  void Encode(bool key_frame);
+
+ private:
+  const int speed_;
+  bool initialized_ = false;
+  vpx_codec_enc_cfg_t cfg_;
+  vpx_codec_ctx_t enc_;
+  int frame_index_ = 0;
+  vpx_enc_deadline_t deadline_ = 0;
+};
+
+VP9Encoder::~VP9Encoder() {
+  if (initialized_) {
+    EXPECT_EQ(vpx_codec_destroy(&enc_), VPX_CODEC_OK);
+  }
+}
+
+void VP9Encoder::Configure(unsigned int threads, unsigned int width,
+                           unsigned int height, vpx_rc_mode end_usage,
+                           vpx_enc_deadline_t deadline) {
+  deadline_ = deadline;
+
+  if (!initialized_) {
+    vpx_codec_iface_t *const iface = vpx_codec_vp9_cx();
+    ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg_, /*usage=*/0),
+              VPX_CODEC_OK);
+    cfg_.g_threads = threads;
+    cfg_.g_w = width;
+    cfg_.g_h = height;
+    cfg_.g_timebase.num = 1;
+    cfg_.g_timebase.den = 1000 * 1000;  // microseconds
+    cfg_.g_pass = VPX_RC_ONE_PASS;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.rc_end_usage = end_usage;
+    cfg_.rc_min_quantizer = 2;
+    cfg_.rc_max_quantizer = 58;
+    ASSERT_EQ(vpx_codec_enc_init(&enc_, iface, &cfg_, 0), VPX_CODEC_OK);
+    ASSERT_EQ(vpx_codec_control(&enc_, VP8E_SET_CPUUSED, speed_), VPX_CODEC_OK);
+    initialized_ = true;
+    return;
+  }
+
+  cfg_.g_threads = threads;
+  cfg_.g_w = width;
+  cfg_.g_h = height;
+  cfg_.rc_end_usage = end_usage;
+  ASSERT_EQ(vpx_codec_enc_config_set(&enc_, &cfg_), VPX_CODEC_OK)
+      << vpx_codec_error_detail(&enc_);
+}
+
+void VP9Encoder::Encode(bool key_frame) {
+  const vpx_codec_cx_pkt_t *pkt;
+  vpx_image_t *image = CreateImage(cfg_.g_w, cfg_.g_h);
+  ASSERT_NE(image, nullptr);
+  const vpx_enc_frame_flags_t frame_flags = key_frame ? VPX_EFLAG_FORCE_KF : 0;
+  ASSERT_EQ(
+      vpx_codec_encode(&enc_, image, frame_index_, 1, frame_flags, deadline_),
+      VPX_CODEC_OK);
+  frame_index_++;
+  vpx_codec_iter_t iter = nullptr;
+  while ((pkt = vpx_codec_get_cx_data(&enc_, &iter)) != nullptr) {
+    ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT);
+  }
+  vpx_img_free(image);
+}
+
+// This is a test case from clusterfuzz.
+TEST(EncodeAPI, PrevMiCheckNullptr) {
+  VP9Encoder encoder(0);
+  encoder.Configure(0, 1554, 644, VPX_VBR, VPX_DL_REALTIME);
+
+  // First step: encode, without forcing KF.
+  encoder.Encode(false);
+  // Second step: change config
+  encoder.Configure(0, 1131, 644, VPX_CBR, VPX_DL_GOOD_QUALITY);
+  // Third step: encode, without forcing KF
+  encoder.Encode(false);
+}
+
+// This is a test case from clusterfuzz: based on b/310477034.
+// Encode a few frames with multiple change config calls
+// with different frame sizes.
+TEST(EncodeAPI, MultipleChangeConfigResize) {
+  VP9Encoder encoder(3);
+
+  // Set initial config.
+  encoder.Configure(3, 41, 1, VPX_VBR, VPX_DL_REALTIME);
+
+  // Encode first frame.
+  encoder.Encode(true);
+
+  // Change config.
+  encoder.Configure(16, 31, 1, VPX_VBR, VPX_DL_GOOD_QUALITY);
+
+  // Change config again.
+  encoder.Configure(0, 17, 1, VPX_CBR, VPX_DL_REALTIME);
+
+  // Encode 2nd frame with new config, set delta frame.
+  encoder.Encode(false);
+
+  // Encode 3rd frame with same config, set delta frame.
+  encoder.Encode(false);
+}
+
+// This is a test case from clusterfuzz: based on b/310663186.
+// Encode set of frames while varying the deadline on the fly from
+// good to realtime to best and back to realtime.
+TEST(EncodeAPI, DynamicDeadlineChange) {
+  // Use realtime speed: 5 to 9.
+  VP9Encoder encoder(5);
+
+  // Set initial config, in particular set deadline to GOOD mode.
+  encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_GOOD_QUALITY);
+
+  // Encode 1st frame.
+  encoder.Encode(true);
+
+  // Encode 2nd frame, delta frame.
+  encoder.Encode(false);
+
+  // Change config: change deadline to REALTIME.
+  encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_REALTIME);
+
+  // Encode 3rd frame with new config, set key frame.
+  encoder.Encode(true);
+
+  // Encode 4th frame with same config, delta frame.
+  encoder.Encode(false);
+
+  // Encode 5th frame with same config, key frame.
+  encoder.Encode(true);
+
+  // Change config: change deadline to BEST.
+  encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_BEST_QUALITY);
+
+  // Encode 6th frame with new config, set delta frame.
+  encoder.Encode(false);
+
+  // Change config: change deadline to REALTIME.
+  encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_REALTIME);
+
+  // Encode 7th frame with new config, set delta frame.
+  encoder.Encode(false);
+
+  // Encode 8th frame with new config, set key frame.
+  encoder.Encode(true);
+
+  // Encode 9th frame with new config, set delta frame.
+  encoder.Encode(false);
+}
+
+TEST(EncodeAPI, Buganizer310340241) {
+  VP9Encoder encoder(-6);
+
+  // Set initial config, in particular set deadline to GOOD mode.
+  encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_GOOD_QUALITY);
+
+  // Encode 1st frame.
+  encoder.Encode(true);
+
+  // Encode 2nd frame, delta frame.
+  encoder.Encode(false);
+
+  // Change config: change deadline to REALTIME.
+  encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_REALTIME);
+
+  // Encode 3rd frame with new config, set key frame.
+  encoder.Encode(true);
+}
+
+// This is a test case from clusterfuzz: based on b/312517065.
+TEST(EncodeAPI, Buganizer312517065) {
+  VP9Encoder encoder(4);
+  encoder.Configure(0, 1060, 437, VPX_CBR, VPX_DL_REALTIME);
+  encoder.Encode(true);
+  encoder.Configure(10, 33, 437, VPX_VBR, VPX_DL_GOOD_QUALITY);
+  encoder.Encode(false);
+  encoder.Configure(6, 327, 269, VPX_VBR, VPX_DL_GOOD_QUALITY);
+  encoder.Configure(15, 1060, 437, VPX_CBR, VPX_DL_REALTIME);
+  encoder.Encode(false);
+}
+
+// This is a test case from clusterfuzz: based on b/311489136.
+// Encode a few frames with multiple change config calls
+// with different frame sizes.
+TEST(EncodeAPI, Buganizer311489136) {
+  VP9Encoder encoder(1);
+
+  // Set initial config.
+  encoder.Configure(12, 1678, 620, VPX_VBR, VPX_DL_GOOD_QUALITY);
+
+  // Encode first frame.
+  encoder.Encode(true);
+
+  // Change config.
+  encoder.Configure(3, 1678, 202, VPX_CBR, VPX_DL_GOOD_QUALITY);
+
+  // Encode 2nd frame with new config, set delta frame.
+  encoder.Encode(false);
+
+  // Change config again.
+  encoder.Configure(8, 1037, 476, VPX_CBR, VPX_DL_REALTIME);
+
+  // Encode 3rd frame with new config, set delta frame.
+  encoder.Encode(false);
+
+  // Change config again.
+  encoder.Configure(0, 580, 620, VPX_CBR, VPX_DL_GOOD_QUALITY);
+
+  // Encode 4th frame with same config, set delta frame.
+  encoder.Encode(false);
+}
+
+// This is a test case from clusterfuzz: based on b/312656387.
+// Encode a few frames with multiple change config calls
+// with different frame sizes.
+TEST(EncodeAPI, Buganizer312656387) {
+  VP9Encoder encoder(1);
+
+  // Set initial config.
+  encoder.Configure(16, 1, 1024, VPX_CBR, VPX_DL_REALTIME);
+
+  // Change config.
+  encoder.Configure(15, 1, 1024, VPX_VBR, VPX_DL_REALTIME);
+
+  // Encode first frame.
+  encoder.Encode(true);
+
+  // Change config again.
+  encoder.Configure(14, 1, 595, VPX_VBR, VPX_DL_GOOD_QUALITY);
+
+  // Encode 2nd frame with new config.
+  encoder.Encode(true);
+
+  // Change config again.
+  encoder.Configure(2, 1, 1024, VPX_VBR, VPX_DL_GOOD_QUALITY);
+
+  // Encode 3rd frame with new config, set delta frame.
+  encoder.Encode(false);
+}
+
+// This is a test case from clusterfuzz: based on b/310329177.
+// Encode a few frames with multiple change config calls
+// with different frame sizes.
+TEST(EncodeAPI, Buganizer310329177) {
+  VP9Encoder encoder(6);
+
+  // Set initial config.
+  encoder.Configure(10, 41, 1, VPX_VBR, VPX_DL_REALTIME);
+
+  // Encode first frame.
+  encoder.Encode(true);
+
+  // Change config.
+  encoder.Configure(16, 1, 1, VPX_VBR, VPX_DL_REALTIME);
+
+  // Encode 2nd frame with new config, set delta frame.
+  encoder.Encode(false);
+}
+
+// This is a test case from clusterfuzz: based on b/311394513.
+// Encode a few frames with multiple change config calls
+// with different frame sizes.
+TEST(EncodeAPI, Buganizer311394513) {
+  VP9Encoder encoder(-7);
+
+  // Set initial config.
+  encoder.Configure(0, 5, 9, VPX_VBR, VPX_DL_REALTIME);
+
+  // Encode first frame.
+  encoder.Encode(false);
+
+  // Change config.
+  encoder.Configure(5, 2, 1, VPX_VBR, VPX_DL_REALTIME);
+
+  // Encode 2nd frame with new config.
+  encoder.Encode(true);
+}
+
+TEST(EncodeAPI, Buganizer311985118) {
+  VP9Encoder encoder(0);
+
+  // Set initial config, in particular set deadline to GOOD mode.
+  encoder.Configure(12, 1678, 620, VPX_VBR, VPX_DL_GOOD_QUALITY);
+
+  // Encode 1st frame.
+  encoder.Encode(false);
+
+  // Change config: change threads and width.
+  encoder.Configure(0, 1574, 620, VPX_VBR, VPX_DL_GOOD_QUALITY);
+
+  // Change config: change threads, width and height.
+  encoder.Configure(16, 837, 432, VPX_VBR, VPX_DL_GOOD_QUALITY);
+
+  // Encode 2nd frame.
+  encoder.Encode(false);
+}
+
+// This is a test case from clusterfuzz: based on b/314857577.
+// Encode a few frames with multiple change config calls
+// with different frame sizes.
+TEST(EncodeAPI, Buganizer314857577) {
+  VP9Encoder encoder(4);
+
+  // Set initial config.
+  encoder.Configure(12, 1060, 437, VPX_VBR, VPX_DL_REALTIME);
+
+  // Encode first frame.
+  encoder.Encode(false);
+
+  // Change config.
+  encoder.Configure(16, 1060, 1, VPX_CBR, VPX_DL_REALTIME);
+
+  // Encode 2nd frame with new config.
+  encoder.Encode(false);
+
+  // Encode 3rd frame with new config.
+  encoder.Encode(true);
+
+  // Change config.
+  encoder.Configure(15, 33, 437, VPX_VBR, VPX_DL_GOOD_QUALITY);
+
+  // Encode 4th frame with new config.
+  encoder.Encode(true);
+
+  // Encode 5th frame with new config.
+  encoder.Encode(false);
+
+  // Change config.
+  encoder.Configure(5, 327, 269, VPX_VBR, VPX_DL_REALTIME);
+
+  // Change config.
+  encoder.Configure(15, 1060, 437, VPX_CBR, VPX_DL_REALTIME);
+
+  // Encode 6th frame with new config.
+  encoder.Encode(false);
+
+  // Encode 7th frame with new config.
+  encoder.Encode(false);
+
+  // Change config.
+  encoder.Configure(4, 1060, 437, VPX_VBR, VPX_DL_REALTIME);
+
+  // Encode 8th frame with new config.
+  encoder.Encode(false);
+}
+
+TEST(EncodeAPI, Buganizer312875957PredBufferStride) {
+  VP9Encoder encoder(-1);
+
+  encoder.Configure(12, 1678, 620, VPX_VBR, VPX_DL_REALTIME);
+  encoder.Encode(true);
+  encoder.Encode(false);
+  encoder.Configure(0, 456, 486, VPX_VBR, VPX_DL_REALTIME);
+  encoder.Encode(true);
+  encoder.Configure(0, 1678, 620, VPX_CBR, 1000000);
+  encoder.Encode(false);
+  encoder.Encode(false);
+}
+
+// This is a test case from clusterfuzz: based on b/311294795
+// Encode a few frames with multiple change config calls
+// with different frame sizes.
+TEST(EncodeAPI, Buganizer311294795) {
+  VP9Encoder encoder(1);
+
+  // Set initial config.
+  encoder.Configure(12, 1678, 620, VPX_VBR, VPX_DL_REALTIME);
+
+  // Encode first frame.
+  encoder.Encode(false);
+
+  // Change config.
+  encoder.Configure(16, 632, 620, VPX_VBR, VPX_DL_GOOD_QUALITY);
+
+  // Encode 2nd frame with new config
+  encoder.Encode(true);
+
+  // Change config.
+  encoder.Configure(16, 1678, 342, VPX_VBR, VPX_DL_GOOD_QUALITY);
+
+  // Encode 3rd frame with new config.
+  encoder.Encode(false);
+
+  // Change config.
+  encoder.Configure(0, 1574, 618, VPX_VBR, VPX_DL_REALTIME);
+  // Encode more frames with new config.
+  encoder.Encode(false);
+  encoder.Encode(false);
+}
+#endif  // CONFIG_VP9_ENCODER
+
 }  // namespace
diff --git a/test/encode_perf_test.cc b/test/encode_perf_test.cc
index 142a55952b..171ff8eeca 100644
--- a/test/encode_perf_test.cc
+++ b/test/encode_perf_test.cc
@@ -61,9 +61,9 @@ class VP9EncodePerfTest
       : EncoderTest(GET_PARAM(0)), min_psnr_(kMaxPsnr), nframes_(0),
         encoding_mode_(GET_PARAM(1)), speed_(0), threads_(1) {}
 
-  virtual ~VP9EncodePerfTest() {}
+  ~VP9EncodePerfTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(encoding_mode_);
 
@@ -82,8 +82,8 @@ class VP9EncodePerfTest
     cfg_.g_threads = threads_;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       const int log2_tile_columns = 3;
       encoder->Control(VP8E_SET_CPUUSED, speed_);
@@ -93,19 +93,19 @@ class VP9EncodePerfTest
     }
   }
 
-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
     min_psnr_ = kMaxPsnr;
     nframes_ = 0;
   }
 
-  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override {
     if (pkt->data.psnr.psnr[0] < min_psnr_) {
       min_psnr_ = pkt->data.psnr.psnr[0];
     }
   }
 
   // for performance reasons don't decode
-  virtual bool DoDecode() const { return false; }
+  bool DoDecode() const override { return false; }
 
   double min_psnr() const { return min_psnr_; }
 
diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc
index 8fdbdb62ae..d3feeee34d 100644
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc
@@ -52,7 +52,8 @@ void Encoder::InitEncoder(VideoSource *video) {
   }
 }
 
-void Encoder::EncodeFrame(VideoSource *video, const unsigned long frame_flags) {
+void Encoder::EncodeFrame(VideoSource *video,
+                          const vpx_enc_frame_flags_t frame_flags) {
   if (video->img()) {
     EncodeFrameInternal(*video, frame_flags);
   } else {
@@ -70,7 +71,7 @@ void Encoder::EncodeFrame(VideoSource *video, const unsigned long frame_flags) {
 }
 
 void Encoder::EncodeFrameInternal(const VideoSource &video,
-                                  const unsigned long frame_flags) {
+                                  const vpx_enc_frame_flags_t frame_flags) {
   vpx_codec_err_t res;
   const vpx_image_t *img = video.img();
 
@@ -91,7 +92,7 @@ void Encoder::EncodeFrameInternal(const VideoSource &video,
 
 void Encoder::Flush() {
   const vpx_codec_err_t res =
-      vpx_codec_encode(&encoder_, NULL, 0, 0, 0, deadline_);
+      vpx_codec_encode(&encoder_, nullptr, 0, 0, 0, deadline_);
   if (!encoder_.priv)
     ASSERT_EQ(VPX_CODEC_ERROR, res) << EncoderError();
   else
@@ -169,7 +170,7 @@ void EncoderTest::RunLoop(VideoSource *video) {
 
   ASSERT_TRUE(passes_ == 1 || passes_ == 2);
   for (unsigned int pass = 0; pass < passes_; pass++) {
-    last_pts_ = 0;
+    vpx_codec_pts_t last_pts = 0;
 
     if (passes_ == 1) {
       cfg_.g_pass = VPX_RC_ONE_PASS;
@@ -182,7 +183,7 @@ void EncoderTest::RunLoop(VideoSource *video) {
     BeginPassHook(pass);
     std::unique_ptr<Encoder> encoder(
         codec_->CreateEncoder(cfg_, deadline_, init_flags_, &stats_));
-    ASSERT_TRUE(encoder.get() != NULL);
+    ASSERT_NE(encoder.get(), nullptr);
 
     ASSERT_NO_FATAL_FAILURE(video->Begin());
     encoder->InitEncoder(video);
@@ -198,7 +199,7 @@ void EncoderTest::RunLoop(VideoSource *video) {
         codec_->CreateDecoder(dec_cfg, dec_init_flags));
     bool again;
     for (again = true; again; video->Next()) {
-      again = (video->img() != NULL);
+      again = (video->img() != nullptr);
 
       PreEncodeFrameHook(video);
       PreEncodeFrameHook(video, encoder.get());
@@ -216,7 +217,7 @@ void EncoderTest::RunLoop(VideoSource *video) {
         switch (pkt->kind) {
           case VPX_CODEC_CX_FRAME_PKT:
             has_cxdata = true;
-            if (decoder.get() != NULL && DoDecode()) {
+            if (decoder != nullptr && DoDecode()) {
               PreDecodeFrameHook(video, decoder.get());
               vpx_codec_err_t res_dec = decoder->DecodeFrame(
                   (const uint8_t *)pkt->data.frame.buf, pkt->data.frame.sz);
@@ -225,8 +226,8 @@ void EncoderTest::RunLoop(VideoSource *video) {
 
               has_dxdata = true;
             }
-            ASSERT_GE(pkt->data.frame.pts, last_pts_);
-            last_pts_ = pkt->data.frame.pts;
+            ASSERT_GE(pkt->data.frame.pts, last_pts);
+            last_pts = pkt->data.frame.pts;
             FramePktHook(pkt);
             break;
 
@@ -240,7 +241,7 @@ void EncoderTest::RunLoop(VideoSource *video) {
 
       // Flush the decoder when there are no more fragments.
       if ((init_flags_ & VPX_CODEC_USE_OUTPUT_PARTITION) && has_dxdata) {
-        const vpx_codec_err_t res_dec = decoder->DecodeFrame(NULL, 0);
+        const vpx_codec_err_t res_dec = decoder->DecodeFrame(nullptr, 0);
         if (!HandleDecodeResult(res_dec, *video, decoder.get())) break;
       }
 
diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h
index 3edba4b926..7dd80d6664 100644
--- a/test/encode_test_driver.h
+++ b/test/encode_test_driver.h
@@ -19,7 +19,7 @@
 #if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER
 #include "vpx/vp8cx.h"
 #endif
-#include "vpx/vpx_encoder.h"
+#include "vpx/vpx_tpl.h"
 
 namespace libvpx_test {
 
@@ -49,7 +49,7 @@ enum TestMode {
 class CxDataIterator {
  public:
   explicit CxDataIterator(vpx_codec_ctx_t *encoder)
-      : encoder_(encoder), iter_(NULL) {}
+      : encoder_(encoder), iter_(nullptr) {}
 
   const vpx_codec_cx_pkt_t *Next() {
     return vpx_codec_get_cx_data(encoder_, &iter_);
@@ -86,7 +86,7 @@ class TwopassStatsStore {
 // level of abstraction will be fleshed out as more tests are written.
 class Encoder {
  public:
-  Encoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline,
+  Encoder(vpx_codec_enc_cfg_t cfg, vpx_enc_deadline_t deadline,
           const unsigned long init_flags, TwopassStatsStore *stats)
       : cfg_(cfg), deadline_(deadline), init_flags_(init_flags), stats_(stats) {
     memset(&encoder_, 0, sizeof(encoder_));
@@ -103,7 +103,7 @@ class Encoder {
   }
   // This is a thin wrapper around vpx_codec_encode(), so refer to
   // vpx_encoder.h for its semantics.
-  void EncodeFrame(VideoSource *video, const unsigned long frame_flags);
+  void EncodeFrame(VideoSource *video, vpx_enc_frame_flags_t frame_flags);
 
   // Convenience wrapper for EncodeFrame()
   void EncodeFrame(VideoSource *video) { EncodeFrame(video, 0); }
@@ -148,6 +148,18 @@ class Encoder {
     ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
   }
 
+#if CONFIG_VP9_ENCODER
+  void Control(int ctrl_id, vpx_rc_funcs_t *arg) {
+    const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
+    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+  }
+
+  void Control(int ctrl_id, VpxTplGopStats *arg) {
+    const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
+    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+  }
+#endif  // CONFIG_VP9_ENCODER
+
 #if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER
   void Control(int ctrl_id, vpx_active_map_t *arg) {
     const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
@@ -165,7 +177,7 @@ class Encoder {
     cfg_ = *cfg;
   }
 
-  void set_deadline(unsigned long deadline) { deadline_ = deadline; }
+  void set_deadline(vpx_enc_deadline_t deadline) { deadline_ = deadline; }
 
  protected:
   virtual vpx_codec_iface_t *CodecInterface() const = 0;
@@ -177,14 +189,14 @@ class Encoder {
 
   // Encode an image
   void EncodeFrameInternal(const VideoSource &video,
-                           const unsigned long frame_flags);
+                           vpx_enc_frame_flags_t frame_flags);
 
   // Flush the encoder on EOS
   void Flush();
 
   vpx_codec_ctx_t encoder_;
   vpx_codec_enc_cfg_t cfg_;
-  unsigned long deadline_;
+  vpx_enc_deadline_t deadline_;
   unsigned long init_flags_;
   TwopassStatsStore *stats_;
 };
@@ -199,8 +211,7 @@ class Encoder {
 class EncoderTest {
  protected:
   explicit EncoderTest(const CodecFactory *codec)
-      : codec_(codec), abort_(false), init_flags_(0), frame_flags_(0),
-        last_pts_(0) {
+      : codec_(codec), abort_(false), init_flags_(0), frame_flags_(0) {
     // Default to 1 thread.
     cfg_.g_threads = 1;
   }
@@ -253,7 +264,7 @@ class EncoderTest {
 
   const CodecFactory *codec_;
   // Hook to determine whether to decode frame after encoding
-  virtual bool DoDecode() const { return 1; }
+  virtual bool DoDecode() const { return true; }
 
   // Hook to handle encode/decode mismatch
   virtual void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2);
@@ -280,11 +291,10 @@ class EncoderTest {
   vpx_codec_enc_cfg_t cfg_;
   vpx_codec_dec_cfg_t dec_cfg_;
   unsigned int passes_;
-  unsigned long deadline_;
+  vpx_enc_deadline_t deadline_;
   TwopassStatsStore stats_;
   unsigned long init_flags_;
-  unsigned long frame_flags_;
-  vpx_codec_pts_t last_pts_;
+  vpx_enc_frame_flags_t frame_flags_;
 };
 
 }  // namespace libvpx_test
diff --git a/test/error_resilience_test.cc b/test/error_resilience_test.cc
index 45a327ec2f..6b019b2bfb 100644
--- a/test/error_resilience_test.cc
+++ b/test/error_resilience_test.cc
@@ -30,7 +30,7 @@ class ErrorResilienceTestLarge
     Reset();
   }
 
-  virtual ~ErrorResilienceTestLarge() {}
+  ~ErrorResilienceTestLarge() override = default;
 
   void Reset() {
     error_nframes_ = 0;
@@ -38,19 +38,19 @@ class ErrorResilienceTestLarge
     pattern_switch_ = 0;
   }
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(encoding_mode_);
   }
 
-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
     psnr_ = 0.0;
     nframes_ = 0;
     mismatch_psnr_ = 0.0;
     mismatch_nframes_ = 0;
   }
 
-  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override {
     psnr_ += pkt->data.psnr.psnr[0];
     nframes_++;
   }
@@ -90,7 +90,7 @@ class ErrorResilienceTestLarge
     return frame_flags;
   }
 
-  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video) {
+  void PreEncodeFrameHook(libvpx_test::VideoSource *video) override {
     frame_flags_ &=
         ~(VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF);
     // For temporal layer case.
@@ -129,21 +129,21 @@ class ErrorResilienceTestLarge
     return 0.0;
   }
 
-  virtual bool DoDecode() const {
+  bool DoDecode() const override {
     if (error_nframes_ > 0 &&
         (cfg_.g_pass == VPX_RC_LAST_PASS || cfg_.g_pass == VPX_RC_ONE_PASS)) {
       for (unsigned int i = 0; i < error_nframes_; ++i) {
         if (error_frames_[i] == nframes_ - 1) {
           std::cout << "             Skipping decoding frame: "
                     << error_frames_[i] << "\n";
-          return 0;
+          return false;
         }
       }
     }
-    return 1;
+    return true;
   }
 
-  virtual void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) {
+  void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) override {
     double mismatch_psnr = compute_psnr(img1, img2);
     mismatch_psnr_ += mismatch_psnr;
     ++mismatch_nframes_;
@@ -381,7 +381,7 @@ class ErrorResilienceTestLargeCodecControls
     Reset();
   }
 
-  virtual ~ErrorResilienceTestLargeCodecControls() {}
+  ~ErrorResilienceTestLargeCodecControls() override = default;
 
   void Reset() {
     last_pts_ = 0;
@@ -393,7 +393,7 @@ class ErrorResilienceTestLargeCodecControls
     duration_ = 0.0;
   }
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(encoding_mode_);
   }
@@ -460,8 +460,8 @@ class ErrorResilienceTestLargeCodecControls
     return layer_id;
   }
 
-  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
-                                  libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                          libvpx_test::Encoder *encoder) override {
     if (cfg_.ts_number_layers > 1) {
       int layer_id = SetLayerId(video->frame(), cfg_.ts_number_layers);
       int frame_flags = SetFrameFlags(video->frame(), cfg_.ts_number_layers);
@@ -476,7 +476,7 @@ class ErrorResilienceTestLargeCodecControls
     }
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     // Time since last timestamp = duration.
     vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
     if (duration > 1) {
@@ -496,7 +496,7 @@ class ErrorResilienceTestLargeCodecControls
     ++tot_frame_number_;
   }
 
-  virtual void EndPassHook(void) {
+  void EndPassHook() override {
     duration_ = (last_pts_ + 1) * timebase_;
     if (cfg_.ts_number_layers > 1) {
       for (int layer = 0; layer < static_cast<int>(cfg_.ts_number_layers);
diff --git a/test/external_frame_buffer_test.cc b/test/external_frame_buffer_test.cc
index 25dd30011b..7b9a836fbc 100644
--- a/test/external_frame_buffer_test.cc
+++ b/test/external_frame_buffer_test.cc
@@ -36,7 +36,7 @@ struct ExternalFrameBuffer {
 class ExternalFrameBufferList {
  public:
   ExternalFrameBufferList()
-      : num_buffers_(0), num_used_buffers_(0), ext_fb_list_(NULL) {}
+      : num_buffers_(0), num_used_buffers_(0), ext_fb_list_(nullptr) {}
 
   virtual ~ExternalFrameBufferList() {
     for (int i = 0; i < num_buffers_; ++i) {
@@ -51,7 +51,7 @@ class ExternalFrameBufferList {
 
     num_buffers_ = num_buffers;
     ext_fb_list_ = new ExternalFrameBuffer[num_buffers_];
-    EXPECT_TRUE(ext_fb_list_ != NULL);
+    EXPECT_NE(ext_fb_list_, nullptr);
     memset(ext_fb_list_, 0, sizeof(ext_fb_list_[0]) * num_buffers_);
     return true;
   }
@@ -61,7 +61,7 @@ class ExternalFrameBufferList {
   // frame buffer is in use by libvpx. Finally sets |fb| to point to the
   // external frame buffer. Returns < 0 on an error.
   int GetFreeFrameBuffer(size_t min_size, vpx_codec_frame_buffer_t *fb) {
-    EXPECT_TRUE(fb != NULL);
+    EXPECT_NE(fb, nullptr);
     const int idx = FindFreeBufferIndex();
     if (idx == num_buffers_) return -1;
 
@@ -81,13 +81,13 @@ class ExternalFrameBufferList {
   // Test function that will not allocate any data for the frame buffer.
   // Returns < 0 on an error.
   int GetZeroFrameBuffer(size_t min_size, vpx_codec_frame_buffer_t *fb) {
-    EXPECT_TRUE(fb != NULL);
+    EXPECT_NE(fb, nullptr);
     const int idx = FindFreeBufferIndex();
     if (idx == num_buffers_) return -1;
 
     if (ext_fb_list_[idx].size < min_size) {
       delete[] ext_fb_list_[idx].data;
-      ext_fb_list_[idx].data = NULL;
+      ext_fb_list_[idx].data = nullptr;
       ext_fb_list_[idx].size = min_size;
     }
 
@@ -98,14 +98,14 @@ class ExternalFrameBufferList {
   // Marks the external frame buffer that |fb| is pointing to as free.
   // Returns < 0 on an error.
   int ReturnFrameBuffer(vpx_codec_frame_buffer_t *fb) {
-    if (fb == NULL) {
-      EXPECT_TRUE(fb != NULL);
+    if (fb == nullptr) {
+      EXPECT_NE(fb, nullptr);
       return -1;
     }
     ExternalFrameBuffer *const ext_fb =
         reinterpret_cast<ExternalFrameBuffer *>(fb->priv);
-    if (ext_fb == NULL) {
-      EXPECT_TRUE(ext_fb != NULL);
+    if (ext_fb == nullptr) {
+      EXPECT_NE(ext_fb, nullptr);
       return -1;
     }
     EXPECT_EQ(1, ext_fb->in_use);
@@ -117,7 +117,7 @@ class ExternalFrameBufferList {
   // Checks that the vpx_image_t data is contained within the external frame
   // buffer private data passed back in the vpx_image_t.
   void CheckImageFrameBuffer(const vpx_image_t *img) {
-    if (img->fb_priv != NULL) {
+    if (img->fb_priv != nullptr) {
       const struct ExternalFrameBuffer *const ext_fb =
           reinterpret_cast<ExternalFrameBuffer *>(img->fb_priv);
 
@@ -143,7 +143,7 @@ class ExternalFrameBufferList {
   // Sets |fb| to an external frame buffer. idx is the index into the frame
   // buffer list.
   void SetFrameBuffer(int idx, vpx_codec_frame_buffer_t *fb) {
-    ASSERT_TRUE(fb != NULL);
+    ASSERT_NE(fb, nullptr);
     fb->data = ext_fb_list_[idx].data;
     fb->size = ext_fb_list_[idx].size;
     ASSERT_EQ(0, ext_fb_list_[idx].in_use);
@@ -208,15 +208,14 @@ class ExternalFrameBufferMD5Test
  protected:
   ExternalFrameBufferMD5Test()
       : DecoderTest(GET_PARAM(::libvpx_test::kCodecFactoryParam)),
-        md5_file_(NULL), num_buffers_(0) {}
+        md5_file_(nullptr), num_buffers_(0) {}
 
-  virtual ~ExternalFrameBufferMD5Test() {
-    if (md5_file_ != NULL) fclose(md5_file_);
+  ~ExternalFrameBufferMD5Test() override {
+    if (md5_file_ != nullptr) fclose(md5_file_);
   }
 
-  virtual void PreDecodeFrameHook(
-      const libvpx_test::CompressedVideoSource &video,
-      libvpx_test::Decoder *decoder) {
+  void PreDecodeFrameHook(const libvpx_test::CompressedVideoSource &video,
+                          libvpx_test::Decoder *decoder) override {
     if (num_buffers_ > 0 && video.frame_number() == 0) {
       // Have libvpx use frame buffers we create.
       ASSERT_TRUE(fb_list_.CreateBufferList(num_buffers_));
@@ -228,13 +227,13 @@ class ExternalFrameBufferMD5Test
 
   void OpenMD5File(const std::string &md5_file_name_) {
     md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_);
-    ASSERT_TRUE(md5_file_ != NULL)
+    ASSERT_NE(md5_file_, nullptr)
         << "Md5 file open failed. Filename: " << md5_file_name_;
   }
 
-  virtual void DecompressedFrameHook(const vpx_image_t &img,
-                                     const unsigned int frame_number) {
-    ASSERT_TRUE(md5_file_ != NULL);
+  void DecompressedFrameHook(const vpx_image_t &img,
+                             const unsigned int frame_number) override {
+    ASSERT_NE(md5_file_, nullptr);
     char expected_md5[33];
     char junk[128];
 
@@ -286,24 +285,25 @@ const char kVP9NonRefTestFile[] = "vp90-2-22-svc_1280x720_1.webm";
 // Class for testing passing in external frame buffers to libvpx.
 class ExternalFrameBufferTest : public ::testing::Test {
  protected:
-  ExternalFrameBufferTest() : video_(NULL), decoder_(NULL), num_buffers_(0) {}
+  ExternalFrameBufferTest()
+      : video_(nullptr), decoder_(nullptr), num_buffers_(0) {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     video_ = new libvpx_test::WebMVideoSource(kVP9TestFile);
-    ASSERT_TRUE(video_ != NULL);
+    ASSERT_NE(video_, nullptr);
     video_->Init();
     video_->Begin();
 
     vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
     decoder_ = new libvpx_test::VP9Decoder(cfg, 0);
-    ASSERT_TRUE(decoder_ != NULL);
+    ASSERT_NE(decoder_, nullptr);
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     delete decoder_;
-    decoder_ = NULL;
+    decoder_ = nullptr;
     delete video_;
-    video_ = NULL;
+    video_ = nullptr;
   }
 
   // Passes the external frame buffer information to libvpx.
@@ -327,7 +327,7 @@ class ExternalFrameBufferTest : public ::testing::Test {
   }
 
   vpx_codec_err_t DecodeRemainingFrames() {
-    for (; video_->cxdata() != NULL; video_->Next()) {
+    for (; video_->cxdata() != nullptr; video_->Next()) {
       const vpx_codec_err_t res =
           decoder_->DecodeFrame(video_->cxdata(), video_->frame_size());
       if (res != VPX_CODEC_OK) return res;
@@ -338,10 +338,10 @@ class ExternalFrameBufferTest : public ::testing::Test {
 
   void CheckDecodedFrames() {
     libvpx_test::DxDataIterator dec_iter = decoder_->GetDxData();
-    const vpx_image_t *img = NULL;
+    const vpx_image_t *img = nullptr;
 
     // Get decompressed data
-    while ((img = dec_iter.Next()) != NULL) {
+    while ((img = dec_iter.Next()) != nullptr) {
       fb_list_.CheckImageFrameBuffer(img);
     }
   }
@@ -354,15 +354,15 @@ class ExternalFrameBufferTest : public ::testing::Test {
 
 class ExternalFrameBufferNonRefTest : public ExternalFrameBufferTest {
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     video_ = new libvpx_test::WebMVideoSource(kVP9NonRefTestFile);
-    ASSERT_TRUE(video_ != NULL);
+    ASSERT_NE(video_, nullptr);
     video_->Init();
     video_->Begin();
 
     vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
     decoder_ = new libvpx_test::VP9Decoder(cfg, 0);
-    ASSERT_TRUE(decoder_ != NULL);
+    ASSERT_NE(decoder_, nullptr);
   }
 
   virtual void CheckFrameBufferRelease() {
@@ -405,7 +405,7 @@ TEST_P(ExternalFrameBufferMD5Test, ExtFBMD5Match) {
     return;
 #endif
   }
-  ASSERT_TRUE(video.get() != NULL);
+  ASSERT_NE(video.get(), nullptr);
   video->Init();
 
   // Construct md5 file name.
@@ -482,13 +482,14 @@ TEST_F(ExternalFrameBufferTest, NullGetFunction) {
   const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS;
   ASSERT_EQ(
       VPX_CODEC_INVALID_PARAM,
-      SetFrameBufferFunctions(num_buffers, NULL, release_vp9_frame_buffer));
+      SetFrameBufferFunctions(num_buffers, nullptr, release_vp9_frame_buffer));
 }
 
 TEST_F(ExternalFrameBufferTest, NullReleaseFunction) {
   const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS;
-  ASSERT_EQ(VPX_CODEC_INVALID_PARAM,
-            SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer, NULL));
+  ASSERT_EQ(
+      VPX_CODEC_INVALID_PARAM,
+      SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer, nullptr));
 }
 
 TEST_F(ExternalFrameBufferTest, SetAfterDecode) {
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index fd9fdff611..3cdf909d46 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -132,9 +132,18 @@ void idct8x8_64_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
 #endif  // HAVE_SSE2
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
+// Visual Studio 2022 (cl.exe) targeting AArch64 with optimizations enabled
+// produces invalid code in RunExtremalCheck() and RunInvAccuracyCheck().
+// See:
+// https://developercommunity.visualstudio.com/t/1770-preview-1:-Misoptimization-for-AR/10369786
+// TODO(jzern): check the compiler version after a fix for the issue is
+// released.
+#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__)
+#pragma optimize("", off)
+#endif
 class FwdTrans8x8TestBase {
  public:
-  virtual ~FwdTrans8x8TestBase() {}
+  virtual ~FwdTrans8x8TestBase() = default;
 
  protected:
   virtual void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) = 0;
@@ -170,7 +179,7 @@ class FwdTrans8x8TestBase {
     for (int j = 0; j < 64; ++j) {
       const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);
       const int max_diff = kSignBiasMaxDiff255;
-      EXPECT_LT(diff, max_diff << (bit_depth_ - 8))
+      ASSERT_LT(diff, max_diff << (bit_depth_ - 8))
           << "Error: 8x8 FDCT/FHT has a sign bias > "
           << 1. * max_diff / count_test_block * 100 << "%"
           << " for input range [-255, 255] at index " << j
@@ -201,7 +210,7 @@ class FwdTrans8x8TestBase {
     for (int j = 0; j < 64; ++j) {
       const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);
       const int max_diff = kSignBiasMaxDiff15;
-      EXPECT_LT(diff, max_diff << (bit_depth_ - 8))
+      ASSERT_LT(diff, max_diff << (bit_depth_ - 8))
           << "Error: 8x8 FDCT/FHT has a sign bias > "
           << 1. * max_diff / count_test_block * 100 << "%"
           << " for input range [-15, 15] at index " << j
@@ -275,11 +284,11 @@ class FwdTrans8x8TestBase {
       }
     }
 
-    EXPECT_GE(1 << 2 * (bit_depth_ - 8), max_error)
+    ASSERT_GE(1 << 2 * (bit_depth_ - 8), max_error)
         << "Error: 8x8 FDCT/IDCT or FHT/IHT has an individual"
         << " roundtrip error > 1";
 
-    EXPECT_GE((count_test_block << 2 * (bit_depth_ - 8)) / 5, total_error)
+    ASSERT_GE((count_test_block << 2 * (bit_depth_ - 8)) / 5, total_error)
         << "Error: 8x8 FDCT/IDCT or FHT/IHT has average roundtrip "
         << "error > 1/5 per block";
   }
@@ -360,17 +369,17 @@ class FwdTrans8x8TestBase {
         total_coeff_error += abs(coeff_diff);
       }
 
-      EXPECT_GE(1 << 2 * (bit_depth_ - 8), max_error)
+      ASSERT_GE(1 << 2 * (bit_depth_ - 8), max_error)
           << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has"
-          << "an individual roundtrip error > 1";
+          << " an individual roundtrip error > 1";
 
-      EXPECT_GE((count_test_block << 2 * (bit_depth_ - 8)) / 5, total_error)
+      ASSERT_GE((count_test_block << 2 * (bit_depth_ - 8)) / 5, total_error)
           << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has average"
           << " roundtrip error > 1/5 per block";
 
-      EXPECT_EQ(0, total_coeff_error)
+      ASSERT_EQ(0, total_coeff_error)
           << "Error: Extremal 8x8 FDCT/FHT has"
-          << "overflow issues in the intermediate steps > 1";
+          << " overflow issues in the intermediate steps > 1";
     }
   }
 
@@ -426,7 +435,7 @@ class FwdTrans8x8TestBase {
         const int diff = dst[j] - src[j];
 #endif
         const uint32_t error = diff * diff;
-        EXPECT_GE(1u << 2 * (bit_depth_ - 8), error)
+        ASSERT_GE(1u << 2 * (bit_depth_ - 8), error)
             << "Error: 8x8 IDCT has error " << error << " at index " << j;
       }
     }
@@ -456,7 +465,7 @@ class FwdTrans8x8TestBase {
       for (int j = 0; j < kNumCoeffs; ++j) {
         const int32_t diff = coeff[j] - coeff_r[j];
         const uint32_t error = diff * diff;
-        EXPECT_GE(9u << 2 * (bit_depth_ - 8), error)
+        ASSERT_GE(9u << 2 * (bit_depth_ - 8), error)
             << "Error: 8x8 DCT has error " << error << " at index " << j;
       }
     }
@@ -512,7 +521,7 @@ class FwdTrans8x8TestBase {
         const int diff = dst[j] - ref[j];
 #endif
         const uint32_t error = diff * diff;
-        EXPECT_EQ(0u, error)
+        ASSERT_EQ(0u, error)
             << "Error: 8x8 IDCT has error " << error << " at index " << j;
       }
     }
@@ -523,13 +532,16 @@ class FwdTrans8x8TestBase {
   vpx_bit_depth_t bit_depth_;
   int mask_;
 };
+#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__)
+#pragma optimize("", on)
+#endif
 
 class FwdTrans8x8DCT : public FwdTrans8x8TestBase,
                        public ::testing::TestWithParam<Dct8x8Param> {
  public:
-  virtual ~FwdTrans8x8DCT() {}
+  ~FwdTrans8x8DCT() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     fwd_txfm_ = GET_PARAM(0);
     inv_txfm_ = GET_PARAM(1);
     tx_type_ = GET_PARAM(2);
@@ -539,13 +551,13 @@ class FwdTrans8x8DCT : public FwdTrans8x8TestBase,
     mask_ = (1 << bit_depth_) - 1;
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
-  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) override {
     fwd_txfm_(in, out, stride);
   }
-  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override {
     inv_txfm_(out, dst, stride);
   }
 
@@ -566,9 +578,9 @@ TEST_P(FwdTrans8x8DCT, InvAccuracyCheck) { RunInvAccuracyCheck(); }
 class FwdTrans8x8HT : public FwdTrans8x8TestBase,
                       public ::testing::TestWithParam<Ht8x8Param> {
  public:
-  virtual ~FwdTrans8x8HT() {}
+  ~FwdTrans8x8HT() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     fwd_txfm_ = GET_PARAM(0);
     inv_txfm_ = GET_PARAM(1);
     tx_type_ = GET_PARAM(2);
@@ -578,13 +590,13 @@ class FwdTrans8x8HT : public FwdTrans8x8TestBase,
     mask_ = (1 << bit_depth_) - 1;
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
-  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) override {
     fwd_txfm_(in, out, stride, tx_type_);
   }
-  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override {
     inv_txfm_(out, dst, stride, tx_type_);
   }
 
@@ -598,12 +610,13 @@ TEST_P(FwdTrans8x8HT, RoundTripErrorCheck) { RunRoundTripErrorCheck(); }
 
 TEST_P(FwdTrans8x8HT, ExtremalCheck) { RunExtremalCheck(); }
 
+#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 class InvTrans8x8DCT : public FwdTrans8x8TestBase,
                        public ::testing::TestWithParam<Idct8x8Param> {
  public:
-  virtual ~InvTrans8x8DCT() {}
+  ~InvTrans8x8DCT() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     ref_txfm_ = GET_PARAM(0);
     inv_txfm_ = GET_PARAM(1);
     thresh_ = GET_PARAM(2);
@@ -612,22 +625,25 @@ class InvTrans8x8DCT : public FwdTrans8x8TestBase,
     mask_ = (1 << bit_depth_) - 1;
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
-  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override {
     inv_txfm_(out, dst, stride);
   }
-  void RunFwdTxfm(int16_t * /*out*/, tran_low_t * /*dst*/, int /*stride*/) {}
+  void RunFwdTxfm(int16_t * /*out*/, tran_low_t * /*dst*/,
+                  int /*stride*/) override {}
 
   IdctFunc ref_txfm_;
   IdctFunc inv_txfm_;
   int thresh_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(InvTrans8x8DCT);
 
 TEST_P(InvTrans8x8DCT, CompareReference) {
   CompareInvReference(ref_txfm_, thresh_);
 }
+#endif  // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
 using std::make_tuple;
 
@@ -765,4 +781,11 @@ INSTANTIATE_TEST_SUITE_P(VSX, FwdTrans8x8DCT,
                                                       &vpx_idct8x8_64_add_vsx,
                                                       0, VPX_BITS_8)));
 #endif  // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_SUITE_P(LSX, FwdTrans8x8DCT,
+                         ::testing::Values(make_tuple(&vpx_fdct8x8_lsx,
+                                                      &vpx_idct8x8_64_add_c, 0,
+                                                      VPX_BITS_8)));
+#endif  // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 }  // namespace
diff --git a/test/frame_size_tests.cc b/test/frame_size_tests.cc
index 9397af2b2d..eea5647a78 100644
--- a/test/frame_size_tests.cc
+++ b/test/frame_size_tests.cc
@@ -18,8 +18,7 @@ namespace {
 
 class EncoderWithExpectedError : public ::libvpx_test::Encoder {
  public:
-  EncoderWithExpectedError(vpx_codec_enc_cfg_t cfg,
-                           unsigned long deadline,          // NOLINT
+  EncoderWithExpectedError(vpx_codec_enc_cfg_t cfg, vpx_enc_deadline_t deadline,
                            const unsigned long init_flags,  // NOLINT
                            ::libvpx_test::TwopassStatsStore *stats)
       : ::libvpx_test::Encoder(cfg, deadline, init_flags, stats) {}
@@ -65,11 +64,11 @@ class EncoderWithExpectedError : public ::libvpx_test::Encoder {
     ASSERT_EQ(expected_err, res) << EncoderError();
   }
 
-  virtual vpx_codec_iface_t *CodecInterface() const {
+  vpx_codec_iface_t *CodecInterface() const override {
 #if CONFIG_VP9_ENCODER
     return &vpx_codec_vp9_cx_algo;
 #else
-    return NULL;
+    return nullptr;
 #endif
   }
 };
@@ -79,22 +78,22 @@ class VP9FrameSizeTestsLarge : public ::libvpx_test::EncoderTest,
  protected:
   VP9FrameSizeTestsLarge()
       : EncoderTest(&::libvpx_test::kVP9), expected_res_(VPX_CODEC_OK) {}
-  virtual ~VP9FrameSizeTestsLarge() {}
+  ~VP9FrameSizeTestsLarge() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(::libvpx_test::kRealTime);
   }
 
-  virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
-                                  const libvpx_test::VideoSource & /*video*/,
-                                  libvpx_test::Decoder *decoder) {
+  bool HandleDecodeResult(const vpx_codec_err_t res_dec,
+                          const libvpx_test::VideoSource & /*video*/,
+                          libvpx_test::Decoder *decoder) override {
     EXPECT_EQ(expected_res_, res_dec) << decoder->DecodeError();
     return !::testing::Test::HasFailure();
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, 7);
       encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
@@ -111,7 +110,7 @@ class VP9FrameSizeTestsLarge : public ::libvpx_test::EncoderTest,
 
     ASSERT_TRUE(passes_ == 1 || passes_ == 2);
     for (unsigned int pass = 0; pass < passes_; pass++) {
-      last_pts_ = 0;
+      vpx_codec_pts_t last_pts = 0;
 
       if (passes_ == 1) {
         cfg_.g_pass = VPX_RC_ONE_PASS;
@@ -130,7 +129,7 @@ class VP9FrameSizeTestsLarge : public ::libvpx_test::EncoderTest,
       encoder->InitEncoder(video);
       ASSERT_FALSE(::testing::Test::HasFatalFailure());
       for (bool again = true; again; video->Next()) {
-        again = (video->img() != NULL);
+        again = (video->img() != nullptr);
 
         PreEncodeFrameHook(video, encoder.get());
         encoder->EncodeFrame(video, frame_flags_, expected_err);
@@ -144,8 +143,8 @@ class VP9FrameSizeTestsLarge : public ::libvpx_test::EncoderTest,
           again = true;
           switch (pkt->kind) {
             case VPX_CODEC_CX_FRAME_PKT:
-              ASSERT_GE(pkt->data.frame.pts, last_pts_);
-              last_pts_ = pkt->data.frame.pts;
+              ASSERT_GE(pkt->data.frame.pts, last_pts);
+              last_pts = pkt->data.frame.pts;
               FramePktHook(pkt);
               break;
 
diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc
index dab945a561..b22bae87cc 100644
--- a/test/hadamard_test.cc
+++ b/test/hadamard_test.cc
@@ -130,13 +130,19 @@ std::ostream &operator<<(std::ostream &os, const HadamardFuncWithSize &hfs) {
 
 class HadamardTestBase : public ::testing::TestWithParam<HadamardFuncWithSize> {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     h_func_ = GetParam().func;
     bwh_ = GetParam().block_size;
     block_size_ = bwh_ * bwh_;
     rnd_.Reset(ACMRandom::DeterministicSeed());
   }
 
+  // The Rand() function generates values in the range [-((1 << BitDepth) - 1),
+  // (1 << BitDepth) - 1]. This is because the input to the Hadamard transform
+  // is the residual pixel, which is defined as 'source pixel - predicted
+  // pixel'. Source pixel and predicted pixel take values in the range
+  // [0, (1 << BitDepth) - 1] and thus the residual pixel ranges from
+  // -((1 << BitDepth) - 1) to ((1 << BitDepth) - 1).
   virtual int16_t Rand() = 0;
 
   void ReferenceHadamard(const int16_t *a, int a_stride, tran_low_t *b,
@@ -170,6 +176,31 @@ class HadamardTestBase : public ::testing::TestWithParam<HadamardFuncWithSize> {
     EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
   }
 
+  void ExtremeValuesTest() {
+    const int kMaxBlockSize = 32 * 32;
+    DECLARE_ALIGNED(16, int16_t, input_extreme_block[kMaxBlockSize]);
+    DECLARE_ALIGNED(16, tran_low_t, b[kMaxBlockSize]);
+    memset(b, 0, sizeof(b));
+
+    tran_low_t b_ref[kMaxBlockSize];
+    memset(b_ref, 0, sizeof(b_ref));
+
+    for (int i = 0; i < 2; ++i) {
+      // Initialize a test block with input range [-mask_, mask_].
+      const int sign = (i == 0) ? 1 : -1;
+      for (int j = 0; j < kMaxBlockSize; ++j)
+        input_extreme_block[j] = sign * 255;
+
+      ReferenceHadamard(input_extreme_block, bwh_, b_ref, bwh_);
+      ASM_REGISTER_STATE_CHECK(h_func_(input_extreme_block, bwh_, b));
+
+      // The order of the output is not important. Sort before checking.
+      std::sort(b, b + block_size_);
+      std::sort(b_ref, b_ref + block_size_);
+      EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
+    }
+  }
+
   void VaryStride() {
     const int kMaxBlockSize = 32 * 32;
     DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize * 8]);
@@ -220,11 +251,18 @@ class HadamardTestBase : public ::testing::TestWithParam<HadamardFuncWithSize> {
 
 class HadamardLowbdTest : public HadamardTestBase {
  protected:
-  virtual int16_t Rand() { return rnd_.Rand9Signed(); }
+  // Use values between -255 (0xFF01) and 255 (0x00FF)
+  int16_t Rand() override {
+    int16_t src = rnd_.Rand8();
+    int16_t pred = rnd_.Rand8();
+    return src - pred;
+  }
 };
 
 TEST_P(HadamardLowbdTest, CompareReferenceRandom) { CompareReferenceRandom(); }
 
+TEST_P(HadamardLowbdTest, ExtremeValuesTest) { ExtremeValuesTest(); }
+
 TEST_P(HadamardLowbdTest, VaryStride) { VaryStride(); }
 
 TEST_P(HadamardLowbdTest, DISABLED_Speed) {
@@ -264,7 +302,8 @@ INSTANTIATE_TEST_SUITE_P(
 INSTANTIATE_TEST_SUITE_P(
     NEON, HadamardLowbdTest,
     ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_neon, 8),
-                      HadamardFuncWithSize(&vpx_hadamard_16x16_neon, 16)));
+                      HadamardFuncWithSize(&vpx_hadamard_16x16_neon, 16),
+                      HadamardFuncWithSize(&vpx_hadamard_32x32_neon, 32)));
 #endif  // HAVE_NEON
 
 // TODO(jingning): Remove highbitdepth flag when the SIMD functions are
@@ -285,10 +324,22 @@ INSTANTIATE_TEST_SUITE_P(
                       HadamardFuncWithSize(&vpx_hadamard_16x16_vsx, 16)));
 #endif  // HAVE_VSX
 
+#if HAVE_LSX
+INSTANTIATE_TEST_SUITE_P(
+    LSX, HadamardLowbdTest,
+    ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_lsx, 8),
+                      HadamardFuncWithSize(&vpx_hadamard_16x16_lsx, 16)));
+#endif  // HAVE_LSX
+
 #if CONFIG_VP9_HIGHBITDEPTH
 class HadamardHighbdTest : public HadamardTestBase {
  protected:
-  virtual int16_t Rand() { return rnd_.Rand13Signed(); }
+  // Use values between -4095 (0xF001) and 4095 (0x0FFF)
+  int16_t Rand() override {
+    int16_t src = rnd_.Rand12();
+    int16_t pred = rnd_.Rand12();
+    return src - pred;
+  }
 };
 
 TEST_P(HadamardHighbdTest, CompareReferenceRandom) { CompareReferenceRandom(); }
@@ -316,5 +367,14 @@ INSTANTIATE_TEST_SUITE_P(
                                            32)));
 #endif  // HAVE_AVX2
 
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, HadamardHighbdTest,
+    ::testing::Values(HadamardFuncWithSize(&vpx_highbd_hadamard_8x8_neon, 8),
+                      HadamardFuncWithSize(&vpx_highbd_hadamard_16x16_neon, 16),
+                      HadamardFuncWithSize(&vpx_highbd_hadamard_32x32_neon,
+                                           32)));
+#endif
+
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 }  // namespace
diff --git a/test/idct_test.cc b/test/idct_test.cc
index 50d5ff433d..279e58e2aa 100644
--- a/test/idct_test.cc
+++ b/test/idct_test.cc
@@ -27,21 +27,21 @@ using libvpx_test::Buffer;
 
 class IDCTTest : public ::testing::TestWithParam<IdctFunc> {
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     UUT = GetParam();
 
     input = new Buffer<int16_t>(4, 4, 0);
-    ASSERT_TRUE(input != NULL);
+    ASSERT_NE(input, nullptr);
     ASSERT_TRUE(input->Init());
     predict = new Buffer<uint8_t>(4, 4, 3);
-    ASSERT_TRUE(predict != NULL);
+    ASSERT_NE(predict, nullptr);
     ASSERT_TRUE(predict->Init());
     output = new Buffer<uint8_t>(4, 4, 3);
-    ASSERT_TRUE(output != NULL);
+    ASSERT_NE(output, nullptr);
     ASSERT_TRUE(output->Init());
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     delete input;
     delete predict;
     delete output;
@@ -72,7 +72,7 @@ TEST_P(IDCTTest, TestAllZeros) {
 
 TEST_P(IDCTTest, TestAllOnes) {
   input->Set(0);
-  ASSERT_TRUE(input->TopLeftPixel() != NULL);
+  ASSERT_NE(input->TopLeftPixel(), nullptr);
   // When the first element is '4' it will fill the output buffer with '1'.
   input->TopLeftPixel()[0] = 4;
   predict->Set(0);
@@ -90,7 +90,7 @@ TEST_P(IDCTTest, TestAddOne) {
   // Set the transform output to '1' and make sure it gets added to the
   // prediction buffer.
   input->Set(0);
-  ASSERT_TRUE(input->TopLeftPixel() != NULL);
+  ASSERT_NE(input->TopLeftPixel(), nullptr);
   input->TopLeftPixel()[0] = 4;
   output->Set(0);
 
diff --git a/test/init_vpx_test.cc b/test/init_vpx_test.cc
new file mode 100644
index 0000000000..f66f00b5c1
--- /dev/null
+++ b/test/init_vpx_test.cc
@@ -0,0 +1,96 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "test/init_vpx_test.h"
+
+#include "./vpx_config.h"
+
+#if !CONFIG_SHARED
+#include <string>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#if VPX_ARCH_ARM
+#include "vpx_ports/arm.h"
+#endif
+#if VPX_ARCH_X86 || VPX_ARCH_X86_64
+#include "vpx_ports/x86.h"
+#endif
+extern "C" {
+#if CONFIG_VP8
+extern void vp8_rtcd();
+#endif  // CONFIG_VP8
+#if CONFIG_VP9
+extern void vp9_rtcd();
+#endif  // CONFIG_VP9
+extern void vpx_dsp_rtcd();
+extern void vpx_scale_rtcd();
+}
+
+#if VPX_ARCH_ARM || VPX_ARCH_X86 || VPX_ARCH_X86_64
+static void append_negative_gtest_filter(const char *str) {
+  std::string filter = GTEST_FLAG_GET(filter);
+  // Negative patterns begin with one '-' followed by a ':' separated list.
+  if (filter.find('-') == std::string::npos) filter += '-';
+  filter += str;
+  GTEST_FLAG_SET(filter, filter);
+}
+#endif  // VPX_ARCH_ARM || VPX_ARCH_X86 || VPX_ARCH_X86_64
+#endif  // !CONFIG_SHARED
+
+namespace libvpx_test {
+void init_vpx_test() {
+#if !CONFIG_SHARED
+#if VPX_ARCH_AARCH64
+  const int caps = arm_cpu_caps();
+  if (!(caps & HAS_NEON_DOTPROD)) {
+    append_negative_gtest_filter(":NEON_DOTPROD.*:NEON_DOTPROD/*");
+  }
+  if (!(caps & HAS_NEON_I8MM)) {
+    append_negative_gtest_filter(":NEON_I8MM.*:NEON_I8MM/*");
+  }
+  if (!(caps & HAS_SVE)) {
+    append_negative_gtest_filter(":SVE.*:SVE/*");
+  }
+#elif VPX_ARCH_ARM
+  const int caps = arm_cpu_caps();
+  if (!(caps & HAS_NEON)) append_negative_gtest_filter(":NEON.*:NEON/*");
+#endif  // VPX_ARCH_ARM
+
+#if VPX_ARCH_X86 || VPX_ARCH_X86_64
+  const int simd_caps = x86_simd_caps();
+  if (!(simd_caps & HAS_MMX)) append_negative_gtest_filter(":MMX.*:MMX/*");
+  if (!(simd_caps & HAS_SSE)) append_negative_gtest_filter(":SSE.*:SSE/*");
+  if (!(simd_caps & HAS_SSE2)) append_negative_gtest_filter(":SSE2.*:SSE2/*");
+  if (!(simd_caps & HAS_SSE3)) append_negative_gtest_filter(":SSE3.*:SSE3/*");
+  if (!(simd_caps & HAS_SSSE3)) {
+    append_negative_gtest_filter(":SSSE3.*:SSSE3/*");
+  }
+  if (!(simd_caps & HAS_SSE4_1)) {
+    append_negative_gtest_filter(":SSE4_1.*:SSE4_1/*");
+  }
+  if (!(simd_caps & HAS_AVX)) append_negative_gtest_filter(":AVX.*:AVX/*");
+  if (!(simd_caps & HAS_AVX2)) append_negative_gtest_filter(":AVX2.*:AVX2/*");
+  if (!(simd_caps & HAS_AVX512)) {
+    append_negative_gtest_filter(":AVX512.*:AVX512/*");
+  }
+#endif  // VPX_ARCH_X86 || VPX_ARCH_X86_64
+
+  // Shared library builds don't support whitebox tests that exercise internal
+  // symbols.
+#if CONFIG_VP8
+  vp8_rtcd();
+#endif  // CONFIG_VP8
+#if CONFIG_VP9
+  vp9_rtcd();
+#endif  // CONFIG_VP9
+  vpx_dsp_rtcd();
+  vpx_scale_rtcd();
+#endif  // !CONFIG_SHARED
+}
+}  // namespace libvpx_test
diff --git a/test/init_vpx_test.h b/test/init_vpx_test.h
new file mode 100644
index 0000000000..5e0dbb0e7e
--- /dev/null
+++ b/test/init_vpx_test.h
@@ -0,0 +1,18 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef TEST_INIT_VPX_TEST_H_
+#define TEST_INIT_VPX_TEST_H_
+
+namespace libvpx_test {
+void init_vpx_test();
+}
+
+#endif  // TEST_INIT_VPX_TEST_H_
diff --git a/test/invalid_file_test.cc b/test/invalid_file_test.cc
index a5054a54d6..c37dc0d486 100644
--- a/test/invalid_file_test.cc
+++ b/test/invalid_file_test.cc
@@ -38,23 +38,22 @@ std::ostream &operator<<(std::ostream &os, const DecodeParam &dp) {
 class InvalidFileTest : public ::libvpx_test::DecoderTest,
                         public ::libvpx_test::CodecTestWithParam<DecodeParam> {
  protected:
-  InvalidFileTest() : DecoderTest(GET_PARAM(0)), res_file_(NULL) {}
+  InvalidFileTest() : DecoderTest(GET_PARAM(0)), res_file_(nullptr) {}
 
-  virtual ~InvalidFileTest() {
-    if (res_file_ != NULL) fclose(res_file_);
+  ~InvalidFileTest() override {
+    if (res_file_ != nullptr) fclose(res_file_);
   }
 
   void OpenResFile(const std::string &res_file_name_) {
     res_file_ = libvpx_test::OpenTestDataFile(res_file_name_);
-    ASSERT_TRUE(res_file_ != NULL)
+    ASSERT_NE(res_file_, nullptr)
         << "Result file open failed. Filename: " << res_file_name_;
   }
 
-  virtual bool HandleDecodeResult(
-      const vpx_codec_err_t res_dec,
-      const libvpx_test::CompressedVideoSource &video,
-      libvpx_test::Decoder *decoder) {
-    EXPECT_TRUE(res_file_ != NULL);
+  bool HandleDecodeResult(const vpx_codec_err_t res_dec,
+                          const libvpx_test::CompressedVideoSource &video,
+                          libvpx_test::Decoder *decoder) override {
+    EXPECT_NE(res_file_, nullptr);
     int expected_res_dec;
 
     // Read integer result.
@@ -102,7 +101,7 @@ class InvalidFileTest : public ::libvpx_test::DecoderTest,
       return;
 #endif
     }
-    ASSERT_TRUE(video.get() != NULL);
+    ASSERT_NE(video.get(), nullptr);
     video->Init();
 
     // Construct result file name. The file holds a list of expected integer
@@ -172,9 +171,9 @@ VP9_INSTANTIATE_TEST_SUITE(InvalidFileTest,
 class InvalidFileInvalidPeekTest : public InvalidFileTest {
  protected:
   InvalidFileInvalidPeekTest() : InvalidFileTest() {}
-  virtual void HandlePeekResult(libvpx_test::Decoder *const /*decoder*/,
-                                libvpx_test::CompressedVideoSource * /*video*/,
-                                const vpx_codec_err_t /*res_peek*/) {}
+  void HandlePeekResult(libvpx_test::Decoder *const /*decoder*/,
+                        libvpx_test::CompressedVideoSource * /*video*/,
+                        const vpx_codec_err_t /*res_peek*/) override {}
 };
 
 TEST_P(InvalidFileInvalidPeekTest, ReturnCode) { RunTest(); }
diff --git a/test/ivf_video_source.h b/test/ivf_video_source.h
index 22c05ecde9..3ccac62b51 100644
--- a/test/ivf_video_source.h
+++ b/test/ivf_video_source.h
@@ -29,25 +29,25 @@ static unsigned int MemGetLe32(const uint8_t *mem) {
 class IVFVideoSource : public CompressedVideoSource {
  public:
   explicit IVFVideoSource(const std::string &file_name)
-      : file_name_(file_name), input_file_(NULL), compressed_frame_buf_(NULL),
-        frame_sz_(0), frame_(0), end_of_file_(false) {}
+      : file_name_(file_name), input_file_(nullptr),
+        compressed_frame_buf_(nullptr), frame_sz_(0), frame_(0),
+        end_of_file_(false) {}
 
-  virtual ~IVFVideoSource() {
+  ~IVFVideoSource() override {
     delete[] compressed_frame_buf_;
 
     if (input_file_) fclose(input_file_);
   }
 
-  virtual void Init() {
+  void Init() override {
     // Allocate a buffer for read in the compressed video frame.
     compressed_frame_buf_ = new uint8_t[libvpx_test::kCodeBufferSize];
-    ASSERT_TRUE(compressed_frame_buf_ != NULL)
-        << "Allocate frame buffer failed";
+    ASSERT_NE(compressed_frame_buf_, nullptr) << "Allocate frame buffer failed";
   }
 
-  virtual void Begin() {
+  void Begin() override {
     input_file_ = OpenTestDataFile(file_name_);
-    ASSERT_TRUE(input_file_ != NULL)
+    ASSERT_NE(input_file_, nullptr)
         << "Input file open failed. Filename: " << file_name_;
 
     // Read file header
@@ -62,13 +62,13 @@ class IVFVideoSource : public CompressedVideoSource {
     FillFrame();
   }
 
-  virtual void Next() {
+  void Next() override {
     ++frame_;
     FillFrame();
   }
 
   void FillFrame() {
-    ASSERT_TRUE(input_file_ != NULL);
+    ASSERT_NE(input_file_, nullptr);
     uint8_t frame_hdr[kIvfFrameHdrSize];
     // Check frame header and read a frame from input_file.
     if (fread(frame_hdr, 1, kIvfFrameHdrSize, input_file_) !=
@@ -86,11 +86,11 @@ class IVFVideoSource : public CompressedVideoSource {
     }
   }
 
-  virtual const uint8_t *cxdata() const {
-    return end_of_file_ ? NULL : compressed_frame_buf_;
+  const uint8_t *cxdata() const override {
+    return end_of_file_ ? nullptr : compressed_frame_buf_;
   }
-  virtual size_t frame_size() const { return frame_sz_; }
-  virtual unsigned int frame_number() const { return frame_; }
+  size_t frame_size() const override { return frame_sz_; }
+  unsigned int frame_number() const override { return frame_; }
 
  protected:
   std::string file_name_;
diff --git a/test/keyframe_test.cc b/test/keyframe_test.cc
index a13dec9ce2..6a1c99cbe2 100644
--- a/test/keyframe_test.cc
+++ b/test/keyframe_test.cc
@@ -8,12 +8,18 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 #include <climits>
+#include <cstring>
 #include <vector>
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/i420_video_source.h"
 #include "test/util.h"
+#include "./vpx_config.h"
+#include "vpx/vp8cx.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_encoder.h"
+#include "vpx/vpx_image.h"
 
 namespace {
 
@@ -22,9 +28,9 @@ class KeyframeTest
       public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
  protected:
   KeyframeTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~KeyframeTest() {}
+  ~KeyframeTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
     kf_count_ = 0;
@@ -33,8 +39,8 @@ class KeyframeTest
     set_cpu_used_ = 0;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (kf_do_force_kf_) {
       frame_flags_ = (video->frame() % 3) ? 0 : VPX_EFLAG_FORCE_KF;
     }
@@ -43,7 +49,7 @@ class KeyframeTest
     }
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) {
       kf_pts_list_.push_back(pkt->data.frame.pts);
       kf_count_++;
@@ -146,4 +152,105 @@ TEST_P(KeyframeTest, TestAutoKeyframe) {
 }
 
 VP8_INSTANTIATE_TEST_SUITE(KeyframeTest, ALL_TEST_MODES);
+
+bool IsVP9(vpx_codec_iface_t *iface) {
+  static const char kVP9Name[] = "WebM Project VP9";
+  return strncmp(kVP9Name, vpx_codec_iface_name(iface), sizeof(kVP9Name) - 1) ==
+         0;
+}
+
+vpx_image_t *CreateGrayImage(vpx_img_fmt_t fmt, unsigned int w,
+                             unsigned int h) {
+  vpx_image_t *const image = vpx_img_alloc(nullptr, fmt, w, h, 1);
+  if (!image) return image;
+
+  for (unsigned int i = 0; i < image->d_h; ++i) {
+    memset(image->planes[0] + i * image->stride[0], 128, image->d_w);
+  }
+  const unsigned int uv_h = (image->d_h + 1) / 2;
+  const unsigned int uv_w = (image->d_w + 1) / 2;
+  for (unsigned int i = 0; i < uv_h; ++i) {
+    memset(image->planes[1] + i * image->stride[1], 128, uv_w);
+    memset(image->planes[2] + i * image->stride[2], 128, uv_w);
+  }
+  return image;
+}
+
+// Tests kf_max_dist in one-pass encoding with zero lag.
+void TestKeyframeMaximumInterval(vpx_codec_iface_t *iface,
+                                 vpx_enc_deadline_t deadline,
+                                 unsigned int kf_max_dist) {
+  vpx_codec_enc_cfg_t cfg;
+  ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, /*usage=*/0),
+            VPX_CODEC_OK);
+  cfg.g_w = 320;
+  cfg.g_h = 240;
+  cfg.g_pass = VPX_RC_ONE_PASS;
+  cfg.g_lag_in_frames = 0;
+  cfg.kf_mode = VPX_KF_AUTO;
+  cfg.kf_min_dist = 0;
+  cfg.kf_max_dist = kf_max_dist;
+
+  vpx_codec_ctx_t enc;
+  ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK);
+
+  const int speed = IsVP9(iface) ? 9 : -12;
+  ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, speed), VPX_CODEC_OK);
+
+  vpx_image_t *image = CreateGrayImage(VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h);
+  ASSERT_NE(image, nullptr);
+
+  // Encode frames.
+  const vpx_codec_cx_pkt_t *pkt;
+  const unsigned int num_frames = kf_max_dist == 0 ? 4 : 3 * kf_max_dist + 1;
+  for (unsigned int i = 0; i < num_frames; ++i) {
+    ASSERT_EQ(vpx_codec_encode(&enc, image, i, 1, 0, deadline), VPX_CODEC_OK);
+    vpx_codec_iter_t iter = nullptr;
+    while ((pkt = vpx_codec_get_cx_data(&enc, &iter)) != nullptr) {
+      ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT);
+      if (kf_max_dist == 0 || i % kf_max_dist == 0) {
+        ASSERT_EQ(pkt->data.frame.flags & VPX_FRAME_IS_KEY, VPX_FRAME_IS_KEY);
+      } else {
+        ASSERT_EQ(pkt->data.frame.flags & VPX_FRAME_IS_KEY, 0u);
+      }
+    }
+  }
+
+  // Flush the encoder.
+  bool got_data;
+  do {
+    ASSERT_EQ(vpx_codec_encode(&enc, nullptr, 0, 1, 0, deadline), VPX_CODEC_OK);
+    got_data = false;
+    vpx_codec_iter_t iter = nullptr;
+    while ((pkt = vpx_codec_get_cx_data(&enc, &iter)) != nullptr) {
+      ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT);
+      got_data = true;
+    }
+  } while (got_data);
+
+  vpx_img_free(image);
+  ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+}
+
+TEST(KeyframeIntervalTest, KeyframeMaximumInterval) {
+  std::vector<vpx_codec_iface_t *> ifaces;
+#if CONFIG_VP8_ENCODER
+  ifaces.push_back(vpx_codec_vp8_cx());
+#endif
+#if CONFIG_VP9_ENCODER
+  ifaces.push_back(vpx_codec_vp9_cx());
+#endif
+  for (vpx_codec_iface_t *iface : ifaces) {
+    for (vpx_enc_deadline_t deadline :
+         { VPX_DL_REALTIME, VPX_DL_GOOD_QUALITY, VPX_DL_BEST_QUALITY }) {
+      // Test 0 and 1 (both mean all intra), some powers of 2, some multiples
+      // of 10, and some prime numbers.
+      for (unsigned int kf_max_dist :
+           { 0, 1, 2, 3, 4, 7, 10, 13, 16, 20, 23, 29, 32 }) {
+        TestKeyframeMaximumInterval(iface, deadline, kf_max_dist);
+      }
+    }
+  }
+}
+
 }  // namespace
diff --git a/test/level_test.cc b/test/level_test.cc
index 038d75f44f..36cfd645c9 100644
--- a/test/level_test.cc
+++ b/test/level_test.cc
@@ -22,9 +22,9 @@ class LevelTest
       : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
         cpu_used_(GET_PARAM(2)), min_gf_internal_(24), target_level_(0),
         level_(0) {}
-  virtual ~LevelTest() {}
+  ~LevelTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(encoding_mode_);
     if (encoding_mode_ != ::libvpx_test::kRealTime) {
@@ -41,8 +41,8 @@ class LevelTest
     cfg_.rc_min_quantizer = 0;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
       encoder->Control(VP9E_SET_TARGET_LEVEL, target_level_);
@@ -120,7 +120,7 @@ TEST_P(LevelTest, TestTargetLevel255) {
 
 TEST_P(LevelTest, TestTargetLevelApi) {
   ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0, 1);
-  static const vpx_codec_iface_t *codec = &vpx_codec_vp9_cx_algo;
+  static vpx_codec_iface_t *codec = &vpx_codec_vp9_cx_algo;
   vpx_codec_ctx_t enc;
   vpx_codec_enc_cfg_t cfg;
   EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_config_default(codec, &cfg, 0));
diff --git a/test/lpf_test.cc b/test/lpf_test.cc
index 25bba74b0f..ce0ddeae18 100644
--- a/test/lpf_test.cc
+++ b/test/lpf_test.cc
@@ -129,15 +129,15 @@ uint8_t GetHevThresh(ACMRandom *rnd) {
 
 class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> {
  public:
-  virtual ~Loop8Test6Param() {}
-  virtual void SetUp() {
+  ~Loop8Test6Param() override = default;
+  void SetUp() override {
     loopfilter_op_ = GET_PARAM(0);
     ref_loopfilter_op_ = GET_PARAM(1);
     bit_depth_ = GET_PARAM(2);
     mask_ = (1 << bit_depth_) - 1;
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
   int bit_depth_;
@@ -145,18 +145,21 @@ class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> {
   loop_op_t loopfilter_op_;
   loop_op_t ref_loopfilter_op_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test6Param);
 
+#if HAVE_NEON || HAVE_SSE2 || (HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH) || \
+    (HAVE_DSPR2 || HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH)
 class Loop8Test9Param : public ::testing::TestWithParam<dualloop8_param_t> {
  public:
-  virtual ~Loop8Test9Param() {}
-  virtual void SetUp() {
+  ~Loop8Test9Param() override = default;
+  void SetUp() override {
     loopfilter_op_ = GET_PARAM(0);
     ref_loopfilter_op_ = GET_PARAM(1);
     bit_depth_ = GET_PARAM(2);
     mask_ = (1 << bit_depth_) - 1;
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
   int bit_depth_;
@@ -164,6 +167,9 @@ class Loop8Test9Param : public ::testing::TestWithParam<dualloop8_param_t> {
   dual_loop_op_t loopfilter_op_;
   dual_loop_op_t ref_loopfilter_op_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test9Param);
+#endif  // HAVE_NEON || HAVE_SSE2 || (HAVE_DSPR2 || HAVE_MSA &&
+        // (!CONFIG_VP9_HIGHBITDEPTH) || (HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH))
 
 TEST_P(Loop8Test6Param, OperationCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -275,6 +281,8 @@ TEST_P(Loop8Test6Param, ValueCheck) {
       << "First failed at test case " << first_failure;
 }
 
+#if HAVE_NEON || HAVE_SSE2 || (HAVE_LSX && (!CONFIG_VP9_HIGHBITDEPTH)) || \
+    (HAVE_DSPR2 || HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH))
 TEST_P(Loop8Test9Param, OperationCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   const int count_test_block = number_of_iterations;
@@ -402,6 +410,9 @@ TEST_P(Loop8Test9Param, ValueCheck) {
          "loopfilter output. "
       << "First failed at test case " << first_failure;
 }
+#endif  // HAVE_NEON || HAVE_SSE2 || (HAVE_DSPR2 || HAVE_MSA &&
+        // (!CONFIG_VP9_HIGHBITDEPTH)) || (HAVE_LSX &&
+        // (!CONFIG_VP9_HIGHBITDEPTH))
 
 using std::make_tuple;
 
@@ -682,4 +693,29 @@ INSTANTIATE_TEST_SUITE_P(
                                  &vpx_lpf_vertical_8_dual_c, 8)));
 #endif  // HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)
 
+#if HAVE_LSX && (!CONFIG_VP9_HIGHBITDEPTH)
+INSTANTIATE_TEST_SUITE_P(
+    LSX, Loop8Test6Param,
+    ::testing::Values(
+        make_tuple(&vpx_lpf_horizontal_4_lsx, &vpx_lpf_horizontal_4_c, 8),
+        make_tuple(&vpx_lpf_horizontal_8_lsx, &vpx_lpf_horizontal_8_c, 8),
+        make_tuple(&vpx_lpf_horizontal_16_dual_lsx,
+                   &vpx_lpf_horizontal_16_dual_c, 8),
+        make_tuple(&vpx_lpf_vertical_4_lsx, &vpx_lpf_vertical_4_c, 8),
+        make_tuple(&vpx_lpf_vertical_8_lsx, &vpx_lpf_vertical_8_c, 8),
+        make_tuple(&vpx_lpf_vertical_16_dual_lsx, &vpx_lpf_vertical_16_dual_c,
+                   8)));
+
+INSTANTIATE_TEST_SUITE_P(
+    LSX, Loop8Test9Param,
+    ::testing::Values(make_tuple(&vpx_lpf_horizontal_4_dual_lsx,
+                                 &vpx_lpf_horizontal_4_dual_c, 8),
+                      make_tuple(&vpx_lpf_horizontal_8_dual_lsx,
+                                 &vpx_lpf_horizontal_8_dual_c, 8),
+                      make_tuple(&vpx_lpf_vertical_4_dual_lsx,
+                                 &vpx_lpf_vertical_4_dual_c, 8),
+                      make_tuple(&vpx_lpf_vertical_8_dual_lsx,
+                                 &vpx_lpf_vertical_8_dual_c, 8)));
+#endif  // HAVE_LSX && (!CONFIG_VP9_HIGHBITDEPTH)
+
 }  // namespace
diff --git a/test/md5_helper.h b/test/md5_helper.h
index dc28dc6283..9095d96a8a 100644
--- a/test/md5_helper.h
+++ b/test/md5_helper.h
@@ -47,7 +47,7 @@ class MD5 {
     MD5Update(&md5_, data, static_cast<uint32_t>(size));
   }
 
-  const char *Get(void) {
+  const char *Get() {
     static const char hex[16] = {
       '0', '1', '2', '3', '4', '5', '6', '7',
       '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
diff --git a/test/minmax_test.cc b/test/minmax_test.cc
index 12327bc188..b495709063 100644
--- a/test/minmax_test.cc
+++ b/test/minmax_test.cc
@@ -15,6 +15,7 @@
 
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
 
 #include "test/acm_random.h"
 #include "test/register_state_check.h"
@@ -28,7 +29,7 @@ typedef void (*MinMaxFunc)(const uint8_t *a, int a_stride, const uint8_t *b,
 
 class MinMaxTest : public ::testing::TestWithParam<MinMaxFunc> {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     mm_func_ = GetParam();
     rnd_.Reset(ACMRandom::DeterministicSeed());
   }
@@ -115,7 +116,115 @@ TEST_P(MinMaxTest, CompareReferenceAndVaryStride) {
   }
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+
+using HBDMinMaxTest = MinMaxTest;
+
+void highbd_reference_minmax(const uint8_t *a, int a_stride, const uint8_t *b,
+                             int b_stride, int *min_ret, int *max_ret) {
+  int min = 65535;
+  int max = 0;
+  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a);
+  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b);
+  for (int i = 0; i < 8; i++) {
+    for (int j = 0; j < 8; j++) {
+      const int diff = abs(a_ptr[i * a_stride + j] - b_ptr[i * b_stride + j]);
+      if (min > diff) min = diff;
+      if (max < diff) max = diff;
+    }
+  }
+
+  *min_ret = min;
+  *max_ret = max;
+}
+
+TEST_P(HBDMinMaxTest, MinValue) {
+  uint8_t *a = CONVERT_TO_BYTEPTR(
+      reinterpret_cast<uint16_t *>(vpx_malloc(64 * sizeof(uint16_t))));
+  uint8_t *b = CONVERT_TO_BYTEPTR(
+      reinterpret_cast<uint16_t *>(vpx_malloc(64 * sizeof(uint16_t))));
+  for (int i = 0; i < 64; i++) {
+    vpx_memset16(CONVERT_TO_SHORTPTR(a), 0, 64);
+    vpx_memset16(CONVERT_TO_SHORTPTR(b), 65535, 64);
+    CONVERT_TO_SHORTPTR(b)[i] = i;  // Set a minimum difference of i.
+
+    int min, max;
+    ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max));
+    EXPECT_EQ(65535, max);
+    EXPECT_EQ(i, min);
+  }
+  vpx_free(CONVERT_TO_SHORTPTR(a));
+  vpx_free(CONVERT_TO_SHORTPTR(b));
+}
+
+TEST_P(HBDMinMaxTest, MaxValue) {
+  uint8_t *a = CONVERT_TO_BYTEPTR(
+      reinterpret_cast<uint16_t *>(vpx_malloc(64 * sizeof(uint16_t))));
+  uint8_t *b = CONVERT_TO_BYTEPTR(
+      reinterpret_cast<uint16_t *>(vpx_malloc(64 * sizeof(uint16_t))));
+  for (int i = 0; i < 64; i++) {
+    vpx_memset16(CONVERT_TO_SHORTPTR(a), 0, 64);
+    vpx_memset16(CONVERT_TO_SHORTPTR(b), 0, 64);
+    CONVERT_TO_SHORTPTR(b)[i] = i;  // Set a minimum difference of i.
+
+    int min, max;
+    ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max));
+    EXPECT_EQ(i, max);
+    EXPECT_EQ(0, min);
+  }
+  vpx_free(CONVERT_TO_SHORTPTR(a));
+  vpx_free(CONVERT_TO_SHORTPTR(b));
+}
+
+TEST_P(HBDMinMaxTest, CompareReference) {
+  uint8_t *a = CONVERT_TO_BYTEPTR(
+      reinterpret_cast<uint16_t *>(vpx_malloc(64 * sizeof(uint16_t))));
+  uint8_t *b = CONVERT_TO_BYTEPTR(
+      reinterpret_cast<uint16_t *>(vpx_malloc(64 * sizeof(uint16_t))));
+  for (int j = 0; j < 64; j++) {
+    CONVERT_TO_SHORTPTR(a)[j] = rnd_.Rand16();
+    CONVERT_TO_SHORTPTR(b)[j] = rnd_.Rand16();
+  }
+
+  int min_ref, max_ref, min, max;
+  highbd_reference_minmax(a, 8, b, 8, &min_ref, &max_ref);
+  ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max));
+  vpx_free(CONVERT_TO_SHORTPTR(a));
+  vpx_free(CONVERT_TO_SHORTPTR(b));
+  EXPECT_EQ(max_ref, max);
+  EXPECT_EQ(min_ref, min);
+}
+
+TEST_P(HBDMinMaxTest, CompareReferenceAndVaryStride) {
+  uint8_t *a = CONVERT_TO_BYTEPTR(
+      reinterpret_cast<uint16_t *>(vpx_malloc((8 * 64) * sizeof(uint16_t))));
+  uint8_t *b = CONVERT_TO_BYTEPTR(
+      reinterpret_cast<uint16_t *>(vpx_malloc((8 * 64) * sizeof(uint16_t))));
+  for (int i = 0; i < 8 * 64; i++) {
+    CONVERT_TO_SHORTPTR(a)[i] = rnd_.Rand16();
+    CONVERT_TO_SHORTPTR(b)[i] = rnd_.Rand16();
+  }
+  for (int a_stride = 8; a_stride <= 64; a_stride += 8) {
+    for (int b_stride = 8; b_stride <= 64; b_stride += 8) {
+      int min_ref, max_ref, min, max;
+      highbd_reference_minmax(a, a_stride, b, b_stride, &min_ref, &max_ref);
+      ASM_REGISTER_STATE_CHECK(mm_func_(a, a_stride, b, b_stride, &min, &max));
+      EXPECT_EQ(max_ref, max)
+          << "when a_stride = " << a_stride << " and b_stride = " << b_stride;
+      EXPECT_EQ(min_ref, min)
+          << "when a_stride = " << a_stride << " and b_stride = " << b_stride;
+    }
+  }
+  vpx_free(CONVERT_TO_SHORTPTR(a));
+  vpx_free(CONVERT_TO_SHORTPTR(b));
+}
+#endif
+
 INSTANTIATE_TEST_SUITE_P(C, MinMaxTest, ::testing::Values(&vpx_minmax_8x8_c));
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(C, HBDMinMaxTest,
+                         ::testing::Values(&vpx_highbd_minmax_8x8_c));
+#endif
 
 #if HAVE_SSE2
 INSTANTIATE_TEST_SUITE_P(SSE2, MinMaxTest,
@@ -125,6 +234,10 @@ INSTANTIATE_TEST_SUITE_P(SSE2, MinMaxTest,
 #if HAVE_NEON
 INSTANTIATE_TEST_SUITE_P(NEON, MinMaxTest,
                          ::testing::Values(&vpx_minmax_8x8_neon));
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(NEON, HBDMinMaxTest,
+                         ::testing::Values(&vpx_highbd_minmax_8x8_neon));
+#endif
 #endif
 
 #if HAVE_MSA
diff --git a/test/non_greedy_mv_test.cc b/test/non_greedy_mv_test.cc
index c78331b285..927029de45 100644
--- a/test/non_greedy_mv_test.cc
+++ b/test/non_greedy_mv_test.cc
@@ -129,11 +129,11 @@ TEST(non_greedy_mv, smooth_mf) {
   const char *ground_truth_file =
       "non_greedy_mv_test_files/ground_truth_16x16.txt";
   BLOCK_SIZE bsize = BLOCK_32X32;
-  MV *search_mf = NULL;
-  MV *smooth_mf = NULL;
-  MV *estimation = NULL;
-  MV *ground_truth = NULL;
-  int(*local_var)[MF_LOCAL_STRUCTURE_SIZE] = NULL;
+  MV *search_mf = nullptr;
+  MV *smooth_mf = nullptr;
+  MV *estimation = nullptr;
+  MV *ground_truth = nullptr;
+  int(*local_var)[MF_LOCAL_STRUCTURE_SIZE] = nullptr;
   int rows = 0, cols = 0;
 
   int alpha = 100, max_iter = 100;
@@ -169,8 +169,8 @@ TEST(non_greedy_mv, local_var) {
   const char *gt_local_var_file = "non_greedy_mv_test_files/localVar_16x16.txt";
   const char *search_mf_file = "non_greedy_mv_test_files/exhaust_16x16.txt";
   BLOCK_SIZE bsize = BLOCK_16X16;
-  int(*gt_local_var)[MF_LOCAL_STRUCTURE_SIZE] = NULL;
-  int(*est_local_var)[MF_LOCAL_STRUCTURE_SIZE] = NULL;
+  int(*gt_local_var)[MF_LOCAL_STRUCTURE_SIZE] = nullptr;
+  int(*est_local_var)[MF_LOCAL_STRUCTURE_SIZE] = nullptr;
   YV12_BUFFER_CONFIG ref_frame, cur_frame;
   int rows, cols;
   MV *search_mf;
diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc
index 61e7c1098b..01e63eb691 100644
--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -59,8 +59,8 @@ const int kCountTestBlock = 1000;
 
 class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> {
  public:
-  virtual ~PartialIDctTest() {}
-  virtual void SetUp() {
+  ~PartialIDctTest() override = default;
+  void SetUp() override {
     rnd_.Reset(ACMRandom::DeterministicSeed());
     fwd_txfm_ = GET_PARAM(0);
     full_inv_txfm_ = GET_PARAM(1);
@@ -76,7 +76,7 @@ class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> {
       case TX_8X8: size_ = 8; break;
       case TX_16X16: size_ = 16; break;
       case TX_32X32: size_ = 32; break;
-      default: FAIL() << "Wrong Size!"; break;
+      default: FAIL() << "Wrong Size!";
     }
 
     // Randomize stride_ to a value less than or equal to 1024
@@ -100,13 +100,13 @@ class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> {
         vpx_memalign(16, pixel_size_ * output_block_size_));
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     vpx_free(input_block_);
-    input_block_ = NULL;
+    input_block_ = nullptr;
     vpx_free(output_block_);
-    output_block_ = NULL;
+    output_block_ = nullptr;
     vpx_free(output_block_ref_);
-    output_block_ref_ = NULL;
+    output_block_ref_ = nullptr;
     libvpx_test::ClearSystemState();
   }
 
@@ -954,6 +954,20 @@ INSTANTIATE_TEST_SUITE_P(MSA, PartialIDctTest,
                          ::testing::ValuesIn(msa_partial_idct_tests));
 #endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH
 
+#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH
+const PartialInvTxfmParam lsx_partial_idct_tests[] = {
+  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
+             &wrapper<vpx_idct32x32_1024_add_lsx>, TX_32X32, 1024, 8, 1),
+  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_34_add_c>,
+             &wrapper<vpx_idct32x32_34_add_lsx>, TX_32X32, 34, 8, 1),
+  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1_add_c>,
+             &wrapper<vpx_idct32x32_1_add_lsx>, TX_32X32, 1, 8, 1),
+};
+
+INSTANTIATE_TEST_SUITE_P(LSX, PartialIDctTest,
+                         ::testing::ValuesIn(lsx_partial_idct_tests));
+#endif  // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH
+
 #endif  // !CONFIG_EMULATE_HARDWARE
 
 }  // namespace
diff --git a/test/pp_filter_test.cc b/test/pp_filter_test.cc
index 952a954295..d2db8a7c7d 100644
--- a/test/pp_filter_test.cc
+++ b/test/pp_filter_test.cc
@@ -7,7 +7,11 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+
 #include <limits.h>
+
+#include <memory>
+
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "test/acm_random.h"
@@ -47,10 +51,10 @@ class VpxPostProcDownAndAcrossMbRowTest
  public:
   VpxPostProcDownAndAcrossMbRowTest()
       : mb_post_proc_down_and_across_(GetParam()) {}
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
-  virtual void Run();
+  void Run() override;
 
   const VpxPostProcDownAndAcrossMbRowFunc mb_post_proc_down_and_across_;
   // Size of the underlying data block that will be filtered.
@@ -115,7 +119,7 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) {
   }
 
   vpx_free(flimits_);
-};
+}
 
 TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) {
   // Size of the underlying data block that will be filtered.
@@ -214,7 +218,7 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, DISABLED_Speed) {
   PrintMedian("16x16");
 
   vpx_free(flimits_);
-};
+}
 
 class VpxMbPostProcAcrossIpTest
     : public AbstractBench,
@@ -223,10 +227,10 @@ class VpxMbPostProcAcrossIpTest
   VpxMbPostProcAcrossIpTest()
       : rows_(16), cols_(16), mb_post_proc_across_ip_(GetParam()),
         src_(Buffer<uint8_t>(rows_, cols_, 8, 8, 17, 8)) {}
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
-  virtual void Run();
+  void Run() override;
 
   void SetCols(unsigned char *s, int rows, int cols, int src_width) {
     for (int r = 0; r < rows; r++) {
@@ -352,10 +356,10 @@ class VpxMbPostProcDownTest
       : rows_(16), cols_(16), mb_post_proc_down_(GetParam()),
         src_c_(Buffer<uint8_t>(rows_, cols_, 8, 8, 8, 17)) {}
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
-  virtual void Run();
+  void Run() override;
 
   void SetRows(unsigned char *src_c, int rows, int cols, int src_width) {
     for (int r = 0; r < rows; r++) {
@@ -458,14 +462,13 @@ TEST_P(VpxMbPostProcDownTest, CheckLowFilterOutput) {
 
   SetRows(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride());
 
-  unsigned char *expected_output = new unsigned char[rows_ * cols_];
-  ASSERT_TRUE(expected_output != NULL);
-  SetRows(expected_output, rows_, cols_, cols_);
+  std::unique_ptr<unsigned char[]> expected_output(
+      new unsigned char[rows_ * cols_]);
+  ASSERT_NE(expected_output, nullptr);
+  SetRows(expected_output.get(), rows_, cols_, cols_);
 
   RunFilterLevel(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride(), q2mbl(0),
-                 expected_output);
-
-  delete[] expected_output;
+                 expected_output.get());
 }
 
 TEST_P(VpxMbPostProcDownTest, CheckCvsAssembly) {
diff --git a/test/predict_test.cc b/test/predict_test.cc
index d884932bae..474eab2cb5 100644
--- a/test/predict_test.cc
+++ b/test/predict_test.cc
@@ -41,11 +41,11 @@ class PredictTestBase : public AbstractBench,
  public:
   PredictTestBase()
       : width_(GET_PARAM(0)), height_(GET_PARAM(1)), predict_(GET_PARAM(2)),
-        src_(NULL), padded_dst_(NULL), dst_(NULL), dst_c_(NULL) {}
+        src_(nullptr), padded_dst_(nullptr), dst_(nullptr), dst_c_(nullptr) {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     src_ = new uint8_t[kSrcSize];
-    ASSERT_TRUE(src_ != NULL);
+    ASSERT_NE(src_, nullptr);
 
     // padded_dst_ provides a buffer of kBorderSize around the destination
     // memory to facilitate detecting out of bounds writes.
@@ -53,25 +53,25 @@ class PredictTestBase : public AbstractBench,
     padded_dst_size_ = dst_stride_ * (kBorderSize + height_ + kBorderSize);
     padded_dst_ =
         reinterpret_cast<uint8_t *>(vpx_memalign(16, padded_dst_size_));
-    ASSERT_TRUE(padded_dst_ != NULL);
+    ASSERT_NE(padded_dst_, nullptr);
     dst_ = padded_dst_ + (kBorderSize * dst_stride_) + kBorderSize;
 
     dst_c_ = new uint8_t[16 * 16];
-    ASSERT_TRUE(dst_c_ != NULL);
+    ASSERT_NE(dst_c_, nullptr);
 
     memset(src_, 0, kSrcSize);
     memset(padded_dst_, 128, padded_dst_size_);
     memset(dst_c_, 0, 16 * 16);
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     delete[] src_;
-    src_ = NULL;
+    src_ = nullptr;
     vpx_free(padded_dst_);
-    padded_dst_ = NULL;
-    dst_ = NULL;
+    padded_dst_ = nullptr;
+    dst_ = nullptr;
     delete[] dst_c_;
-    dst_c_ = NULL;
+    dst_c_ = nullptr;
     libvpx_test::ClearSystemState();
   }
 
@@ -209,7 +209,7 @@ class PredictTestBase : public AbstractBench,
     }
   }
 
-  void Run() {
+  void Run() override {
     for (int xoffset = 0; xoffset < 8; ++xoffset) {
       for (int yoffset = 0; yoffset < 8; ++yoffset) {
         if (xoffset == 0 && yoffset == 0) {
@@ -350,6 +350,14 @@ INSTANTIATE_TEST_SUITE_P(
                       make_tuple(4, 4, &vp8_sixtap_predict4x4_mmi)));
 #endif
 
+#if HAVE_LSX
+INSTANTIATE_TEST_SUITE_P(
+    LSX, SixtapPredictTest,
+    ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_lsx),
+                      make_tuple(8, 8, &vp8_sixtap_predict8x8_lsx),
+                      make_tuple(4, 4, &vp8_sixtap_predict4x4_lsx)));
+#endif
+
 class BilinearPredictTest : public PredictTestBase {};
 
 TEST_P(BilinearPredictTest, TestWithRandomData) {
diff --git a/test/quantize_test.cc b/test/quantize_test.cc
index 96d5241712..ab38f5c1b0 100644
--- a/test/quantize_test.cc
+++ b/test/quantize_test.cc
@@ -46,9 +46,9 @@ class QuantizeTestBase {
  public:
   virtual ~QuantizeTestBase() {
     vp8_remove_compressor(&vp8_comp_);
-    vp8_comp_ = NULL;
+    vp8_comp_ = nullptr;
     vpx_free(macroblockd_dst_);
-    macroblockd_dst_ = NULL;
+    macroblockd_dst_ = nullptr;
     libvpx_test::ClearSystemState();
   }
 
@@ -121,13 +121,13 @@ class QuantizeTest : public QuantizeTestBase,
                      public ::testing::TestWithParam<VP8QuantizeParam>,
                      public AbstractBench {
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     SetupCompressor();
     asm_quant_ = GET_PARAM(0);
     c_quant_ = GET_PARAM(1);
   }
 
-  virtual void Run() {
+  void Run() override {
     asm_quant_(&vp8_comp_->mb.block[0], &macroblockd_dst_->block[0]);
   }
 
@@ -146,6 +146,7 @@ class QuantizeTest : public QuantizeTestBase,
   VP8Quantize asm_quant_;
   VP8Quantize c_quant_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(QuantizeTest);
 
 TEST_P(QuantizeTest, TestZeroInput) {
   FillCoeffConstant(0);
@@ -223,4 +224,11 @@ INSTANTIATE_TEST_SUITE_P(
         make_tuple(&vp8_fast_quantize_b_mmi, &vp8_fast_quantize_b_c),
         make_tuple(&vp8_regular_quantize_b_mmi, &vp8_regular_quantize_b_c)));
 #endif  // HAVE_MMI
+
+#if HAVE_LSX
+INSTANTIATE_TEST_SUITE_P(
+    LSX, QuantizeTest,
+    ::testing::Values(make_tuple(&vp8_regular_quantize_b_lsx,
+                                 &vp8_regular_quantize_b_c)));
+#endif  // HAVE_LSX
 }  // namespace
diff --git a/test/ratectrl_rtc_test.cc b/test/ratectrl_rtc_test.cc
deleted file mode 100644
index e044885e7c..0000000000
--- a/test/ratectrl_rtc_test.cc
+++ /dev/null
@@ -1,229 +0,0 @@
-/*
- *  Copyright (c) 2020 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-#include "vp9/ratectrl_rtc.h"
-
-#include <fstream>  // NOLINT
-#include <string>
-
-#include "./vpx_config.h"
-#include "third_party/googletest/src/include/gtest/gtest.h"
-#include "test/codec_factory.h"
-#include "test/encode_test_driver.h"
-#include "test/util.h"
-#include "test/video_source.h"
-#include "vpx/vpx_codec.h"
-#include "vpx_ports/bitops.h"
-
-namespace {
-
-const size_t kNumFrame = 850;
-
-struct FrameInfo {
-  friend std::istream &operator>>(std::istream &is, FrameInfo &info) {
-    is >> info.frame_id >> info.spatial_id >> info.temporal_id >> info.base_q >>
-        info.target_bandwidth >> info.buffer_level >> info.filter_level_ >>
-        info.bytes_used;
-    return is;
-  }
-  int frame_id;
-  int spatial_id;
-  int temporal_id;
-  // Base QP
-  int base_q;
-  size_t target_bandwidth;
-  size_t buffer_level;
-  // Loopfilter level
-  int filter_level_;
-  // Frame size for current frame, used for pose encode update
-  size_t bytes_used;
-};
-
-// This test runs the rate control interface and compare against ground truth
-// generated by encoders.
-// Settings for the encoder:
-// For 1 layer:
-//
-// examples/vpx_temporal_svc_encoder gipsrec_motion1.1280_720.yuv out vp9
-//    1280 720 1 30 7 0 0 1 0 1000
-//
-// For SVC (3 temporal layers, 3 spatial layers):
-//
-// examples/vp9_spatial_svc_encoder -f 10000 -w 1280 -h 720 -t 1/30 -sl 3
-// -k 10000 -bl 100,140,200,250,350,500,450,630,900 -b 1600 --rc-end-usage=1
-// --lag-in-frames=0 --passes=1 --speed=7 --threads=1
-// --temporal-layering-mode=3 -aq 1 -rcstat 1
-// gipsrec_motion1.1280_720.yuv -o out.webm
-//
-// - AQ_Mode 0
-// - Disable golden refresh
-// - Bitrate x 2 at frame/superframe 200
-// - Bitrate / 4 at frame/superframe 400
-//
-// The generated file includes:
-// frame number, spatial layer ID, temporal layer ID, base QP, target
-// bandwidth, buffer level, loopfilter level, encoded frame size
-// TODO(jianj): Remove golden files, and run actual encoding in this test.
-class RcInterfaceTest : public ::testing::Test {
- public:
-  explicit RcInterfaceTest() {}
-
-  virtual ~RcInterfaceTest() {}
-
- protected:
-  void RunOneLayer() {
-    SetConfigOneLayer();
-    rc_api_->Create(rc_cfg_);
-    FrameInfo frame_info;
-    libvpx::VP9FrameParamsQpRTC frame_params;
-    frame_params.frame_type = KEY_FRAME;
-    frame_params.spatial_layer_id = 0;
-    frame_params.temporal_layer_id = 0;
-    std::ifstream one_layer_file;
-    one_layer_file.open(libvpx_test::GetDataPath() +
-                        "/rc_interface_test_one_layer");
-    ASSERT_EQ(one_layer_file.rdstate() & std::ifstream::failbit, 0);
-    for (size_t i = 0; i < kNumFrame; i++) {
-      one_layer_file >> frame_info;
-      if (frame_info.frame_id > 0) frame_params.frame_type = INTER_FRAME;
-      if (frame_info.frame_id == 200) {
-        rc_cfg_.target_bandwidth = rc_cfg_.target_bandwidth * 2;
-        rc_api_->UpdateRateControl(rc_cfg_);
-      } else if (frame_info.frame_id == 400) {
-        rc_cfg_.target_bandwidth = rc_cfg_.target_bandwidth / 4;
-        rc_api_->UpdateRateControl(rc_cfg_);
-      }
-      ASSERT_EQ(frame_info.spatial_id, 0);
-      ASSERT_EQ(frame_info.temporal_id, 0);
-      rc_api_->ComputeQP(frame_params);
-      ASSERT_EQ(rc_api_->GetQP(), frame_info.base_q);
-      ASSERT_EQ(rc_api_->GetLoopfilterLevel(), frame_info.filter_level_);
-      rc_api_->PostEncodeUpdate(frame_info.bytes_used);
-    }
-  }
-
-  void RunSVC() {
-    SetConfigSVC();
-    rc_api_->Create(rc_cfg_);
-    FrameInfo frame_info;
-    libvpx::VP9FrameParamsQpRTC frame_params;
-    frame_params.frame_type = KEY_FRAME;
-    std::ifstream svc_file;
-    svc_file.open(std::string(std::getenv("LIBVPX_TEST_DATA_PATH")) +
-                  "/rc_interface_test_svc");
-    ASSERT_EQ(svc_file.rdstate() & std::ifstream::failbit, 0);
-    for (size_t i = 0; i < kNumFrame * rc_cfg_.ss_number_layers; i++) {
-      svc_file >> frame_info;
-      if (frame_info.frame_id > 0) frame_params.frame_type = INTER_FRAME;
-      if (frame_info.frame_id == 200 * rc_cfg_.ss_number_layers) {
-        for (int layer = 0;
-             layer < rc_cfg_.ss_number_layers * rc_cfg_.ts_number_layers;
-             layer++)
-          rc_cfg_.layer_target_bitrate[layer] *= 2;
-        rc_cfg_.target_bandwidth *= 2;
-        rc_api_->UpdateRateControl(rc_cfg_);
-      } else if (frame_info.frame_id == 400 * rc_cfg_.ss_number_layers) {
-        for (int layer = 0;
-             layer < rc_cfg_.ss_number_layers * rc_cfg_.ts_number_layers;
-             layer++)
-          rc_cfg_.layer_target_bitrate[layer] /= 4;
-        rc_cfg_.target_bandwidth /= 4;
-        rc_api_->UpdateRateControl(rc_cfg_);
-      }
-      frame_params.spatial_layer_id = frame_info.spatial_id;
-      frame_params.temporal_layer_id = frame_info.temporal_id;
-      rc_api_->ComputeQP(frame_params);
-      ASSERT_EQ(rc_api_->GetQP(), frame_info.base_q);
-      ASSERT_EQ(rc_api_->GetLoopfilterLevel(), frame_info.filter_level_);
-      rc_api_->PostEncodeUpdate(frame_info.bytes_used);
-    }
-  }
-
- private:
-  void SetConfigOneLayer() {
-    rc_cfg_.width = 1280;
-    rc_cfg_.height = 720;
-    rc_cfg_.max_quantizer = 52;
-    rc_cfg_.min_quantizer = 2;
-    rc_cfg_.target_bandwidth = 1000;
-    rc_cfg_.buf_initial_sz = 600;
-    rc_cfg_.buf_optimal_sz = 600;
-    rc_cfg_.buf_sz = 1000;
-    rc_cfg_.undershoot_pct = 50;
-    rc_cfg_.overshoot_pct = 50;
-    rc_cfg_.max_intra_bitrate_pct = 1000;
-    rc_cfg_.framerate = 30.0;
-    rc_cfg_.ss_number_layers = 1;
-    rc_cfg_.ts_number_layers = 1;
-    rc_cfg_.scaling_factor_num[0] = 1;
-    rc_cfg_.scaling_factor_den[0] = 1;
-    rc_cfg_.layer_target_bitrate[0] = 1000;
-    rc_cfg_.max_quantizers[0] = 52;
-    rc_cfg_.min_quantizers[0] = 2;
-  }
-
-  void SetConfigSVC() {
-    rc_cfg_.width = 1280;
-    rc_cfg_.height = 720;
-    rc_cfg_.max_quantizer = 56;
-    rc_cfg_.min_quantizer = 2;
-    rc_cfg_.target_bandwidth = 1600;
-    rc_cfg_.buf_initial_sz = 500;
-    rc_cfg_.buf_optimal_sz = 600;
-    rc_cfg_.buf_sz = 1000;
-    rc_cfg_.undershoot_pct = 50;
-    rc_cfg_.overshoot_pct = 50;
-    rc_cfg_.max_intra_bitrate_pct = 900;
-    rc_cfg_.framerate = 30.0;
-    rc_cfg_.ss_number_layers = 3;
-    rc_cfg_.ts_number_layers = 3;
-
-    rc_cfg_.scaling_factor_num[0] = 1;
-    rc_cfg_.scaling_factor_den[0] = 4;
-    rc_cfg_.scaling_factor_num[1] = 2;
-    rc_cfg_.scaling_factor_den[1] = 4;
-    rc_cfg_.scaling_factor_num[2] = 4;
-    rc_cfg_.scaling_factor_den[2] = 4;
-
-    rc_cfg_.ts_rate_decimator[0] = 4;
-    rc_cfg_.ts_rate_decimator[1] = 2;
-    rc_cfg_.ts_rate_decimator[2] = 1;
-
-    rc_cfg_.layer_target_bitrate[0] = 100;
-    rc_cfg_.layer_target_bitrate[1] = 140;
-    rc_cfg_.layer_target_bitrate[2] = 200;
-    rc_cfg_.layer_target_bitrate[3] = 250;
-    rc_cfg_.layer_target_bitrate[4] = 350;
-    rc_cfg_.layer_target_bitrate[5] = 500;
-    rc_cfg_.layer_target_bitrate[6] = 450;
-    rc_cfg_.layer_target_bitrate[7] = 630;
-    rc_cfg_.layer_target_bitrate[8] = 900;
-
-    for (int sl = 0; sl < rc_cfg_.ss_number_layers; ++sl) {
-      for (int tl = 0; tl < rc_cfg_.ts_number_layers; ++tl) {
-        const int i = sl * rc_cfg_.ts_number_layers + tl;
-        rc_cfg_.max_quantizers[i] = 56;
-        rc_cfg_.min_quantizers[i] = 2;
-      }
-    }
-  }
-
-  std::unique_ptr<libvpx::VP9RateControlRTC> rc_api_;
-  libvpx::VP9RateControlRtcConfig rc_cfg_;
-};
-
-TEST_F(RcInterfaceTest, OneLayer) { RunOneLayer(); }
-
-TEST_F(RcInterfaceTest, SVC) { RunSVC(); }
-}  // namespace
-
-int main(int argc, char **argv) {
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/test/realtime_test.cc b/test/realtime_test.cc
index 63a5347d99..a9870b3cbf 100644
--- a/test/realtime_test.cc
+++ b/test/realtime_test.cc
@@ -7,6 +7,8 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include <limits.h>
+
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/util.h"
@@ -24,7 +26,7 @@ class RealtimeTest
       public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
  protected:
   RealtimeTest() : EncoderTest(GET_PARAM(0)), frame_packets_(0) {}
-  ~RealtimeTest() override {}
+  ~RealtimeTest() override = default;
 
   void SetUp() override {
     InitializeConfig();
@@ -33,17 +35,19 @@ class RealtimeTest
   }
 
   void BeginPassHook(unsigned int /*pass*/) override {
+#if !CONFIG_REALTIME_ONLY
     // TODO(tomfinegan): We're changing the pass value here to make sure
     // we get frames when real time mode is combined with |g_pass| set to
     // VPX_RC_FIRST_PASS. This is necessary because EncoderTest::RunLoop() sets
     // the pass value based on the mode passed into EncoderTest::SetMode(),
     // which overrides the one specified in SetUp() above.
     cfg_.g_pass = VPX_RC_FIRST_PASS;
+#endif
   }
 
   void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
                           ::libvpx_test::Encoder *encoder) override {
-    if (video->frame() == 0) {
+    if (video->frame() == 0 && set_cpu_used_) {
       encoder->Control(VP8E_SET_CPUUSED, 8);
     }
   }
@@ -52,23 +56,57 @@ class RealtimeTest
     frame_packets_++;
   }
 
+  bool IsVP9() const {
+#if CONFIG_VP9_ENCODER
+    return codec_ == &libvpx_test::kVP9;
+#else
+    return false;
+#endif
+  }
+
+  void TestIntegerOverflow(unsigned int width, unsigned int height) {
+    ::libvpx_test::RandomVideoSource video;
+    video.SetSize(width, height);
+    video.set_limit(20);
+    cfg_.rc_target_bitrate = UINT_MAX;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  void TestEncode() {
+    ::libvpx_test::RandomVideoSource video;
+    video.SetSize(kVideoSourceWidth, kVideoSourceHeight);
+    video.set_limit(kFramesToEncode);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    EXPECT_EQ(kFramesToEncode, frame_packets_);
+  }
+
   int frame_packets_;
+  bool set_cpu_used_ = true;
 };
 
-TEST_P(RealtimeTest, RealtimeFirstPassProducesFrames) {
-  ::libvpx_test::RandomVideoSource video;
-  video.SetSize(kVideoSourceWidth, kVideoSourceHeight);
-  video.set_limit(kFramesToEncode);
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  EXPECT_EQ(kFramesToEncode, frame_packets_);
+TEST_P(RealtimeTest, RealtimeFirstPassProducesFrames) { TestEncode(); }
+
+TEST_P(RealtimeTest, RealtimeDefaultCpuUsed) {
+  set_cpu_used_ = false;
+  TestEncode();
 }
 
-TEST_P(RealtimeTest, IntegerOverflow) {
-  ::libvpx_test::RandomVideoSource video;
-  video.SetSize(800, 480);
-  video.set_limit(20);
-  cfg_.rc_target_bitrate = 140000000;
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+TEST_P(RealtimeTest, IntegerOverflow) { TestIntegerOverflow(2048, 2048); }
+
+TEST_P(RealtimeTest, IntegerOverflowLarge) {
+  if (IsVP9()) {
+#if VPX_ARCH_AARCH64 || VPX_ARCH_X86_64
+    TestIntegerOverflow(16384, 16384);
+#else
+    TestIntegerOverflow(4096, 4096);
+#endif
+  } else {
+    GTEST_SKIP()
+        << "TODO(https://crbug.com/webm/1748,https://crbug.com/webm/1751):"
+        << " Enable this test after bitstream errors & undefined sanitizer "
+           "warnings are fixed.";
+    // TestIntegerOverflow(16383, 16383);
+  }
 }
 
 VP8_INSTANTIATE_TEST_SUITE(RealtimeTest,
diff --git a/test/register_state_check.h b/test/register_state_check.h
index 4366466378..ede86ef52f 100644
--- a/test/register_state_check.h
+++ b/test/register_state_check.h
@@ -35,6 +35,7 @@
 #ifndef WIN32_LEAN_AND_MEAN
 #define WIN32_LEAN_AND_MEAN
 #endif
+#include <intrin.h>
 #include <windows.h>
 #include <winnt.h>
 
@@ -55,7 +56,7 @@ class RegisterStateCheck {
  private:
   static bool StoreRegisters(CONTEXT *const context) {
     const HANDLE this_thread = GetCurrentThread();
-    EXPECT_TRUE(this_thread != NULL);
+    EXPECT_NE(this_thread, nullptr);
     context->ContextFlags = CONTEXT_FLOATING_POINT;
     const bool context_saved = GetThreadContext(this_thread, context) == TRUE;
     EXPECT_TRUE(context_saved) << "GetLastError: " << GetLastError();
@@ -81,10 +82,13 @@ class RegisterStateCheck {
   CONTEXT pre_context_;
 };
 
-#define ASM_REGISTER_STATE_CHECK(statement)    \
-  do {                                         \
-    libvpx_test::RegisterStateCheck reg_check; \
-    statement;                                 \
+#define ASM_REGISTER_STATE_CHECK(statement)      \
+  do {                                           \
+    {                                            \
+      libvpx_test::RegisterStateCheck reg_check; \
+      statement;                                 \
+    }                                            \
+    _ReadWriteBarrier();                         \
   } while (false)
 
 }  // namespace libvpx_test
@@ -121,11 +125,22 @@ class RegisterStateCheck {
   int64_t pre_store_[8];
 };
 
+#if defined(__GNUC__)
+#define ASM_REGISTER_STATE_CHECK(statement)      \
+  do {                                           \
+    {                                            \
+      libvpx_test::RegisterStateCheck reg_check; \
+      statement;                                 \
+    }                                            \
+    __asm__ volatile("" ::: "memory");           \
+  } while (false)
+#else
 #define ASM_REGISTER_STATE_CHECK(statement)    \
   do {                                         \
     libvpx_test::RegisterStateCheck reg_check; \
     statement;                                 \
   } while (false)
+#endif
 
 }  // namespace libvpx_test
 
@@ -169,10 +184,13 @@ class RegisterStateCheckMMX {
   uint16_t pre_fpu_env_[14];
 };
 
-#define API_REGISTER_STATE_CHECK(statement)       \
-  do {                                            \
-    libvpx_test::RegisterStateCheckMMX reg_check; \
-    ASM_REGISTER_STATE_CHECK(statement);          \
+#define API_REGISTER_STATE_CHECK(statement)             \
+  do {                                                  \
+    {                                                   \
+      libvpx_test::RegisterStateCheckMMX reg_check_mmx; \
+      ASM_REGISTER_STATE_CHECK(statement);              \
+    }                                                   \
+    __asm__ volatile("" ::: "memory");                  \
   } while (false)
 
 }  // namespace libvpx_test
diff --git a/test/resize_test.cc b/test/resize_test.cc
index b2eadb1e8e..20ad2229b4 100644
--- a/test/resize_test.cc
+++ b/test/resize_test.cc
@@ -95,22 +95,18 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
                          unsigned int initial_h, unsigned int *w,
                          unsigned int *h, bool flag_codec,
                          bool smaller_width_larger_size_) {
+  *w = initial_w;
+  *h = initial_h;
+
   if (smaller_width_larger_size_) {
     if (frame < 30) {
-      *w = initial_w;
-      *h = initial_h;
-      return;
-    }
-    if (frame < 100) {
-      *w = initial_w * 7 / 10;
-      *h = initial_h * 16 / 10;
       return;
     }
+    *w = initial_w * 7 / 10;
+    *h = initial_h * 16 / 10;
     return;
   }
   if (frame < 10) {
-    *w = initial_w;
-    *h = initial_h;
     return;
   }
   if (frame < 20) {
@@ -124,8 +120,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
     return;
   }
   if (frame < 40) {
-    *w = initial_w;
-    *h = initial_h;
     return;
   }
   if (frame < 50) {
@@ -139,8 +133,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
     return;
   }
   if (frame < 70) {
-    *w = initial_w;
-    *h = initial_h;
     return;
   }
   if (frame < 80) {
@@ -159,8 +151,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
     return;
   }
   if (frame < 110) {
-    *w = initial_w;
-    *h = initial_h;
     return;
   }
   if (frame < 120) {
@@ -179,8 +169,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
     return;
   }
   if (frame < 150) {
-    *w = initial_w;
-    *h = initial_h;
     return;
   }
   if (frame < 160) {
@@ -199,8 +187,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
     return;
   }
   if (frame < 190) {
-    *w = initial_w;
-    *h = initial_h;
     return;
   }
   if (frame < 200) {
@@ -219,8 +205,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
     return;
   }
   if (frame < 230) {
-    *w = initial_w;
-    *h = initial_h;
     return;
   }
   if (frame < 240) {
@@ -234,8 +218,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
     return;
   }
   if (frame < 260) {
-    *w = initial_w;
-    *h = initial_h;
     return;
   }
   // Go down very low.
@@ -248,13 +230,9 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
     // Cases that only works for VP9.
     // For VP9: Swap width and height of original.
     if (frame < 320) {
-      *w = initial_h;
-      *h = initial_w;
       return;
     }
   }
-  *w = initial_w;
-  *h = initial_h;
 }
 
 class ResizingVideoSource : public ::libvpx_test::DummyVideoSource {
@@ -266,13 +244,13 @@ class ResizingVideoSource : public ::libvpx_test::DummyVideoSource {
   }
   bool flag_codec_;
   bool smaller_width_larger_size_;
-  virtual ~ResizingVideoSource() {}
+  ~ResizingVideoSource() override = default;
 
  protected:
-  virtual void Next() {
+  void Next() override {
     ++frame_;
-    unsigned int width;
-    unsigned int height;
+    unsigned int width = 0;
+    unsigned int height = 0;
     ScaleForFrameNumber(frame_, kInitialWidth, kInitialHeight, &width, &height,
                         flag_codec_, smaller_width_larger_size_);
     SetSize(width, height);
@@ -286,14 +264,14 @@ class ResizeTest
  protected:
   ResizeTest() : EncoderTest(GET_PARAM(0)) {}
 
-  virtual ~ResizeTest() {}
+  ~ResizeTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     ASSERT_NE(static_cast<int>(pkt->data.frame.width[0]), 0);
     ASSERT_NE(static_cast<int>(pkt->data.frame.height[0]), 0);
     encode_frame_width_.push_back(pkt->data.frame.width[0]);
@@ -308,8 +286,8 @@ class ResizeTest
     return encode_frame_height_[idx];
   }
 
-  virtual void DecompressedFrameHook(const vpx_image_t &img,
-                                     vpx_codec_pts_t pts) {
+  void DecompressedFrameHook(const vpx_image_t &img,
+                             vpx_codec_pts_t pts) override {
     frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));
   }
 
@@ -350,32 +328,32 @@ class ResizeInternalTest : public ResizeTest {
  protected:
 #if WRITE_COMPRESSED_STREAM
   ResizeInternalTest()
-      : ResizeTest(), frame0_psnr_(0.0), outfile_(NULL), out_frames_(0) {}
+      : ResizeTest(), frame0_psnr_(0.0), outfile_(nullptr), out_frames_(0) {}
 #else
   ResizeInternalTest() : ResizeTest(), frame0_psnr_(0.0) {}
 #endif
 
-  virtual ~ResizeInternalTest() {}
+  ~ResizeInternalTest() override = default;
 
-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
 #if WRITE_COMPRESSED_STREAM
     outfile_ = fopen("vp90-2-05-resize.ivf", "wb");
 #endif
   }
 
-  virtual void EndPassHook() {
+  void EndPassHook() override {
 #if WRITE_COMPRESSED_STREAM
     if (outfile_) {
       if (!fseek(outfile_, 0, SEEK_SET))
         write_ivf_file_header(&cfg_, out_frames_, outfile_);
       fclose(outfile_);
-      outfile_ = NULL;
+      outfile_ = nullptr;
     }
 #endif
   }
 
-  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
-                                  libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                          libvpx_test::Encoder *encoder) override {
     if (change_config_) {
       int new_q = 60;
       if (video->frame() == 0) {
@@ -400,13 +378,13 @@ class ResizeInternalTest : public ResizeTest {
     }
   }
 
-  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override {
     if (frame0_psnr_ == 0.) frame0_psnr_ = pkt->data.psnr.psnr[0];
     EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0);
   }
 
 #if WRITE_COMPRESSED_STREAM
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     ++out_frames_;
 
     // Write initial file header if first frame.
@@ -469,10 +447,10 @@ class ResizeRealtimeTest
       public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
  protected:
   ResizeRealtimeTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~ResizeRealtimeTest() {}
+  ~ResizeRealtimeTest() override = default;
 
-  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
-                                  libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                          libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP9E_SET_AQ_MODE, 3);
       encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
@@ -485,24 +463,24 @@ class ResizeRealtimeTest
     }
   }
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
     set_cpu_used_ = GET_PARAM(2);
   }
 
-  virtual void DecompressedFrameHook(const vpx_image_t &img,
-                                     vpx_codec_pts_t pts) {
+  void DecompressedFrameHook(const vpx_image_t &img,
+                             vpx_codec_pts_t pts) override {
     frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));
   }
 
-  virtual void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) {
+  void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) override {
     double mismatch_psnr = compute_psnr(img1, img2);
     mismatch_psnr_ += mismatch_psnr;
     ++mismatch_nframes_;
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     ASSERT_NE(static_cast<int>(pkt->data.frame.width[0]), 0);
     ASSERT_NE(static_cast<int>(pkt->data.frame.height[0]), 0);
     encode_frame_width_.push_back(pkt->data.frame.width[0]);
@@ -578,7 +556,7 @@ TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) {
   }
 }
 
-TEST_P(ResizeRealtimeTest, DISABLED_TestExternalResizeSmallerWidthBiggerSize) {
+TEST_P(ResizeRealtimeTest, TestExternalResizeSmallerWidthBiggerSize) {
   ResizingVideoSource video;
   video.flag_codec_ = true;
   video.smaller_width_larger_size_ = true;
@@ -672,11 +650,11 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) {
     ASSERT_EQ(info->h, GetFrameHeight(idx));
     if (info->w != last_w || info->h != last_h) {
       resize_count++;
-      if (resize_count == 1) {
+      if (resize_count <= 2) {
         // Verify that resize down occurs.
         ASSERT_LT(info->w, last_w);
         ASSERT_LT(info->h, last_h);
-      } else if (resize_count == 2) {
+      } else if (resize_count > 2) {
         // Verify that resize up occurs.
         ASSERT_GT(info->w, last_w);
         ASSERT_GT(info->h, last_h);
@@ -687,8 +665,8 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) {
   }
 
 #if CONFIG_VP9_DECODER
-  // Verify that we get 2 resize events in this test.
-  ASSERT_EQ(resize_count, 2) << "Resizing should occur twice.";
+  // Verify that we get 4 resize events in this test.
+  ASSERT_EQ(resize_count, 4) << "Resizing should occur twice.";
   EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
 #else
   printf("Warning: VP9 decoder unavailable, unable to check resize count!\n");
@@ -705,32 +683,32 @@ class ResizeCspTest : public ResizeTest {
  protected:
 #if WRITE_COMPRESSED_STREAM
   ResizeCspTest()
-      : ResizeTest(), frame0_psnr_(0.0), outfile_(NULL), out_frames_(0) {}
+      : ResizeTest(), frame0_psnr_(0.0), outfile_(nullptr), out_frames_(0) {}
 #else
   ResizeCspTest() : ResizeTest(), frame0_psnr_(0.0) {}
 #endif
 
-  virtual ~ResizeCspTest() {}
+  ~ResizeCspTest() override = default;
 
-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
 #if WRITE_COMPRESSED_STREAM
     outfile_ = fopen("vp91-2-05-cspchape.ivf", "wb");
 #endif
   }
 
-  virtual void EndPassHook() {
+  void EndPassHook() override {
 #if WRITE_COMPRESSED_STREAM
     if (outfile_) {
       if (!fseek(outfile_, 0, SEEK_SET))
         write_ivf_file_header(&cfg_, out_frames_, outfile_);
       fclose(outfile_);
-      outfile_ = NULL;
+      outfile_ = nullptr;
     }
 #endif
   }
 
-  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
-                                  libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                          libvpx_test::Encoder *encoder) override {
     if (CspForFrameNumber(video->frame()) != VPX_IMG_FMT_I420 &&
         cfg_.g_profile != 1) {
       cfg_.g_profile = 1;
@@ -743,13 +721,13 @@ class ResizeCspTest : public ResizeTest {
     }
   }
 
-  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override {
     if (frame0_psnr_ == 0.) frame0_psnr_ = pkt->data.psnr.psnr[0];
     EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0);
   }
 
 #if WRITE_COMPRESSED_STREAM
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     ++out_frames_;
 
     // Write initial file header if first frame.
@@ -775,10 +753,10 @@ class ResizingCspVideoSource : public ::libvpx_test::DummyVideoSource {
     limit_ = 30;
   }
 
-  virtual ~ResizingCspVideoSource() {}
+  ~ResizingCspVideoSource() override = default;
 
  protected:
-  virtual void Next() {
+  void Next() override {
     ++frame_;
     SetImageFormat(CspForFrameNumber(frame_));
     FillFrame();
@@ -794,8 +772,7 @@ TEST_P(ResizeCspTest, TestResizeCspWorks) {
 }
 
 VP8_INSTANTIATE_TEST_SUITE(ResizeTest, ONE_PASS_TEST_MODES);
-VP9_INSTANTIATE_TEST_SUITE(ResizeTest,
-                           ::testing::Values(::libvpx_test::kRealTime));
+VP9_INSTANTIATE_TEST_SUITE(ResizeTest, ONE_PASS_TEST_MODES);
 VP9_INSTANTIATE_TEST_SUITE(ResizeInternalTest,
                            ::testing::Values(::libvpx_test::kOnePassBest));
 VP9_INSTANTIATE_TEST_SUITE(ResizeRealtimeTest,
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 3211c1b83b..3530e66050 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -26,6 +26,10 @@
 #include "vpx_ports/msvc.h"
 #include "vpx_ports/vpx_timer.h"
 
+// const[expr] should be sufficient for DECLARE_ALIGNED but early
+// implementations of c++11 appear to have some issues with it.
+#define kDataAlignment 32
+
 template <typename Function>
 struct TestParams {
   TestParams(int w, int h, Function f, int bd = -1)
@@ -38,6 +42,10 @@ typedef unsigned int (*SadMxNFunc)(const uint8_t *src_ptr, int src_stride,
                                    const uint8_t *ref_ptr, int ref_stride);
 typedef TestParams<SadMxNFunc> SadMxNParam;
 
+typedef unsigned int (*SadSkipMxNFunc)(const uint8_t *src_ptr, int src_stride,
+                                       const uint8_t *ref_ptr, int ref_stride);
+typedef TestParams<SadSkipMxNFunc> SadSkipMxNParam;
+
 typedef unsigned int (*SadMxNAvgFunc)(const uint8_t *src_ptr, int src_stride,
                                       const uint8_t *ref_ptr, int ref_stride,
                                       const uint8_t *second_pred);
@@ -48,12 +56,15 @@ typedef void (*SadMxNx4Func)(const uint8_t *src_ptr, int src_stride,
                              unsigned int *sad_array);
 typedef TestParams<SadMxNx4Func> SadMxNx4Param;
 
+typedef void (*SadSkipMxNx4Func)(const uint8_t *src_ptr, int src_stride,
+                                 const uint8_t *const ref_ptr[], int ref_stride,
+                                 unsigned int *sad_array);
+typedef TestParams<SadSkipMxNx4Func> SadSkipMxNx4Param;
+
 typedef void (*SadMxNx8Func)(const uint8_t *src_ptr, int src_stride,
                              const uint8_t *ref_ptr, int ref_stride,
                              unsigned int *sad_array);
 
-typedef TestParams<SadMxNx8Func> SadMxNx8Param;
-
 using libvpx_test::ACMRandom;
 
 namespace {
@@ -62,7 +73,7 @@ class SADTestBase : public ::testing::TestWithParam<ParamType> {
  public:
   explicit SADTestBase(const ParamType &params) : params_(params) {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     source_data8_ = reinterpret_cast<uint8_t *>(
         vpx_memalign(kDataAlignment, kDataBlockSize));
     reference_data8_ = reinterpret_cast<uint8_t *>(
@@ -97,19 +108,19 @@ class SADTestBase : public ::testing::TestWithParam<ParamType> {
     rnd_.Reset(ACMRandom::DeterministicSeed());
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     vpx_free(source_data8_);
-    source_data8_ = NULL;
+    source_data8_ = nullptr;
     vpx_free(reference_data8_);
-    reference_data8_ = NULL;
+    reference_data8_ = nullptr;
     vpx_free(second_pred8_);
-    second_pred8_ = NULL;
+    second_pred8_ = nullptr;
     vpx_free(source_data16_);
-    source_data16_ = NULL;
+    source_data16_ = nullptr;
     vpx_free(reference_data16_);
-    reference_data16_ = NULL;
+    reference_data16_ = nullptr;
     vpx_free(second_pred16_);
-    second_pred16_ = NULL;
+    second_pred16_ = nullptr;
 
     libvpx_test::ClearSystemState();
   }
@@ -117,9 +128,6 @@ class SADTestBase : public ::testing::TestWithParam<ParamType> {
  protected:
   // Handle blocks up to 4 blocks 64x64 with stride up to 128
   // crbug.com/webm/1660
-  // const[expr] should be sufficient for DECLARE_ALIGNED but early
-  // implementations of c++11 appear to have some issues with it.
-  enum { kDataAlignment = 32 };
   static const int kDataBlockSize = 64 * 128;
   static const int kDataBufferSize = 4 * kDataBlockSize;
 
@@ -171,6 +179,34 @@ class SADTestBase : public ::testing::TestWithParam<ParamType> {
     return sad;
   }
 
+  // Sum of Absolute Differences Skip rows. Given two blocks, calculate the
+  // absolute difference between two pixels in the same relative location every
+  // other row; accumulate and double the result at the end.
+  uint32_t ReferenceSADSkip(int ref_offset) const {
+    uint32_t sad = 0;
+    const uint8_t *const reference8 = GetReferenceFromOffset(ref_offset);
+    const uint8_t *const source8 = source_data_;
+#if CONFIG_VP9_HIGHBITDEPTH
+    const uint16_t *const reference16 =
+        CONVERT_TO_SHORTPTR(GetReferenceFromOffset(ref_offset));
+    const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    for (int h = 0; h < params_.height; h += 2) {
+      for (int w = 0; w < params_.width; ++w) {
+        if (!use_high_bit_depth_) {
+          sad += abs(source8[h * source_stride_ + w] -
+                     reference8[h * reference_stride_ + w]);
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          sad += abs(source16[h * source_stride_ + w] -
+                     reference16[h * reference_stride_ + w]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        }
+      }
+    }
+    return sad * 2;
+  }
+
   // Sum of Absolute Differences Average. Given two blocks, and a prediction
   // calculate the absolute difference between one pixel and average of the
   // corresponding and predicted pixels; accumulate.
@@ -265,33 +301,35 @@ class SADTestBase : public ::testing::TestWithParam<ParamType> {
   ParamType params_;
 };
 
-class SADx8Test : public SADTestBase<SadMxNx8Param> {
+class SADx4Test : public SADTestBase<SadMxNx4Param> {
  public:
-  SADx8Test() : SADTestBase(GetParam()) {}
+  SADx4Test() : SADTestBase(GetParam()) {}
 
  protected:
   void SADs(unsigned int *results) const {
-    const uint8_t *reference = GetReferenceFromOffset(0);
+    const uint8_t *references[] = { GetReference(0), GetReference(1),
+                                    GetReference(2), GetReference(3) };
 
     ASM_REGISTER_STATE_CHECK(params_.func(
-        source_data_, source_stride_, reference, reference_stride_, results));
+        source_data_, source_stride_, references, reference_stride_, results));
   }
 
   void CheckSADs() const {
     uint32_t reference_sad;
-    DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[8]);
+    DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[4]);
 
     SADs(exp_sad);
-    for (int offset = 0; offset < 8; ++offset) {
-      reference_sad = ReferenceSAD(offset);
-      EXPECT_EQ(reference_sad, exp_sad[offset]) << "offset " << offset;
+    for (int block = 0; block < 4; ++block) {
+      reference_sad = ReferenceSAD(GetBlockRefOffset(block));
+
+      EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block;
     }
   }
 };
 
-class SADx4Test : public SADTestBase<SadMxNx4Param> {
+class SADSkipx4Test : public SADTestBase<SadMxNx4Param> {
  public:
-  SADx4Test() : SADTestBase(GetParam()) {}
+  SADSkipx4Test() : SADTestBase(GetParam()) {}
 
  protected:
   void SADs(unsigned int *results) const {
@@ -308,7 +346,7 @@ class SADx4Test : public SADTestBase<SadMxNx4Param> {
 
     SADs(exp_sad);
     for (int block = 0; block < 4; ++block) {
-      reference_sad = ReferenceSAD(GetBlockRefOffset(block));
+      reference_sad = ReferenceSADSkip(GetBlockRefOffset(block));
 
       EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block;
     }
@@ -336,13 +374,40 @@ class SADTest : public AbstractBench, public SADTestBase<SadMxNParam> {
     ASSERT_EQ(reference_sad, exp_sad);
   }
 
-  void Run() {
+  void Run() override {
+    params_.func(source_data_, source_stride_, reference_data_,
+                 reference_stride_);
+  }
+};
+
+class SADSkipTest : public AbstractBench, public SADTestBase<SadMxNParam> {
+ public:
+  SADSkipTest() : SADTestBase(GetParam()) {}
+
+ protected:
+  unsigned int SAD(int block_idx) const {
+    unsigned int ret;
+    const uint8_t *const reference = GetReference(block_idx);
+
+    ASM_REGISTER_STATE_CHECK(ret = params_.func(source_data_, source_stride_,
+                                                reference, reference_stride_));
+    return ret;
+  }
+
+  void CheckSAD() const {
+    const unsigned int reference_sad = ReferenceSADSkip(GetBlockRefOffset(0));
+    const unsigned int exp_sad = SAD(0);
+
+    ASSERT_EQ(reference_sad, exp_sad);
+  }
+
+  void Run() override {
     params_.func(source_data_, source_stride_, reference_data_,
                  reference_stride_);
   }
 };
 
-class SADavgTest : public SADTestBase<SadMxNAvgParam> {
+class SADavgTest : public AbstractBench, public SADTestBase<SadMxNAvgParam> {
  public:
   SADavgTest() : SADTestBase(GetParam()) {}
 
@@ -363,6 +428,11 @@ class SADavgTest : public SADTestBase<SadMxNAvgParam> {
 
     ASSERT_EQ(reference_sad, exp_sad);
   }
+
+  void Run() override {
+    params_.func(source_data_, source_stride_, reference_data_,
+                 reference_stride_, second_pred_);
+  }
 };
 
 TEST_P(SADTest, MaxRef) {
@@ -417,6 +487,58 @@ TEST_P(SADTest, DISABLED_Speed) {
   PrintMedian(title);
 }
 
+TEST_P(SADSkipTest, MaxRef) {
+  FillConstant(source_data_, source_stride_, 0);
+  FillConstant(reference_data_, reference_stride_, mask_);
+  CheckSAD();
+}
+
+TEST_P(SADSkipTest, MaxSrc) {
+  FillConstant(source_data_, source_stride_, mask_);
+  FillConstant(reference_data_, reference_stride_, 0);
+  CheckSAD();
+}
+
+TEST_P(SADSkipTest, ShortRef) {
+  const int tmp_stride = reference_stride_;
+  reference_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  CheckSAD();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADSkipTest, UnalignedRef) {
+  // The reference frame, but not the source frame, may be unaligned for
+  // certain types of searches.
+  const int tmp_stride = reference_stride_;
+  reference_stride_ -= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  CheckSAD();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADSkipTest, ShortSrc) {
+  const int tmp_stride = source_stride_;
+  source_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  CheckSAD();
+  source_stride_ = tmp_stride;
+}
+
+TEST_P(SADSkipTest, DISABLED_Speed) {
+  const int kCountSpeedTestBlock = 50000000 / (params_.width * params_.height);
+  FillRandom(source_data_, source_stride_);
+
+  RunNTimes(kCountSpeedTestBlock);
+
+  char title[16];
+  snprintf(title, sizeof(title), "%dx%d", params_.width, params_.height);
+  PrintMedian(title);
+}
+
 TEST_P(SADavgTest, MaxRef) {
   FillConstant(source_data_, source_stride_, 0);
   FillConstant(reference_data_, reference_stride_, mask_);
@@ -462,6 +584,19 @@ TEST_P(SADavgTest, ShortSrc) {
   source_stride_ = tmp_stride;
 }
 
+TEST_P(SADavgTest, DISABLED_Speed) {
+  const int kCountSpeedTestBlock = 50000000 / (params_.width * params_.height);
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  FillRandom(second_pred_, params_.width);
+
+  RunNTimes(kCountSpeedTestBlock);
+
+  char title[16];
+  snprintf(title, sizeof(title), "%dx%d", params_.width, params_.height);
+  PrintMedian(title);
+}
+
 TEST_P(SADx4Test, MaxRef) {
   FillConstant(source_data_, source_stride_, 0);
   FillConstant(GetReference(0), reference_stride_, mask_);
@@ -542,14 +677,12 @@ TEST_P(SADx4Test, DISABLED_Speed) {
   uint32_t reference_sad[4];
   DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[4]);
   vpx_usec_timer timer;
-
-  memset(reference_sad, 0, sizeof(reference_sad));
-  SADs(exp_sad);
+  for (int block = 0; block < 4; ++block) {
+    reference_sad[block] = ReferenceSAD(GetBlockRefOffset(block));
+  }
   vpx_usec_timer_start(&timer);
   for (int i = 0; i < kCountSpeedTestBlock; ++i) {
-    for (int block = 0; block < 4; ++block) {
-      reference_sad[block] = ReferenceSAD(GetBlockRefOffset(block));
-    }
+    SADs(exp_sad);
   }
   vpx_usec_timer_mark(&timer);
   for (int block = 0; block < 4; ++block) {
@@ -563,11 +696,103 @@ TEST_P(SADx4Test, DISABLED_Speed) {
   reference_stride_ = tmp_stride;
 }
 
-TEST_P(SADx8Test, Regular) {
-  FillRandomWH(source_data_, source_stride_, params_.width, params_.height);
-  FillRandomWH(GetReferenceFromOffset(0), reference_stride_, params_.width + 8,
-               params_.height);
+TEST_P(SADSkipx4Test, MaxRef) {
+  FillConstant(source_data_, source_stride_, 0);
+  FillConstant(GetReference(0), reference_stride_, mask_);
+  FillConstant(GetReference(1), reference_stride_, mask_);
+  FillConstant(GetReference(2), reference_stride_, mask_);
+  FillConstant(GetReference(3), reference_stride_, mask_);
+  CheckSADs();
+}
+
+TEST_P(SADSkipx4Test, MaxSrc) {
+  FillConstant(source_data_, source_stride_, mask_);
+  FillConstant(GetReference(0), reference_stride_, 0);
+  FillConstant(GetReference(1), reference_stride_, 0);
+  FillConstant(GetReference(2), reference_stride_, 0);
+  FillConstant(GetReference(3), reference_stride_, 0);
+  CheckSADs();
+}
+
+TEST_P(SADSkipx4Test, ShortRef) {
+  int tmp_stride = reference_stride_;
+  reference_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  CheckSADs();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADSkipx4Test, UnalignedRef) {
+  // The reference frame, but not the source frame, may be unaligned for
+  // certain types of searches.
+  int tmp_stride = reference_stride_;
+  reference_stride_ -= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
   CheckSADs();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADSkipx4Test, ShortSrc) {
+  int tmp_stride = source_stride_;
+  source_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  CheckSADs();
+  source_stride_ = tmp_stride;
+}
+
+TEST_P(SADSkipx4Test, SrcAlignedByWidth) {
+  uint8_t *tmp_source_data = source_data_;
+  source_data_ += params_.width;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  CheckSADs();
+  source_data_ = tmp_source_data;
+}
+
+TEST_P(SADSkipx4Test, DISABLED_Speed) {
+  int tmp_stride = reference_stride_;
+  reference_stride_ -= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  const int kCountSpeedTestBlock = 500000000 / (params_.width * params_.height);
+  uint32_t reference_sad[4];
+  DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[4]);
+  vpx_usec_timer timer;
+  for (int block = 0; block < 4; ++block) {
+    reference_sad[block] = ReferenceSADSkip(GetBlockRefOffset(block));
+  }
+  vpx_usec_timer_start(&timer);
+  for (int i = 0; i < kCountSpeedTestBlock; ++i) {
+    SADs(exp_sad);
+  }
+  vpx_usec_timer_mark(&timer);
+  for (int block = 0; block < 4; ++block) {
+    EXPECT_EQ(reference_sad[block], exp_sad[block]) << "block " << block;
+  }
+  const int elapsed_time =
+      static_cast<int>(vpx_usec_timer_elapsed(&timer) / 1000);
+  printf("sad%dx%dx4 (%2dbit) time: %5d ms\n", params_.width, params_.height,
+         bit_depth_, elapsed_time);
+
+  reference_stride_ = tmp_stride;
 }
 
 //------------------------------------------------------------------------------
@@ -630,6 +855,56 @@ const SadMxNParam c_tests[] = {
 };
 INSTANTIATE_TEST_SUITE_P(C, SADTest, ::testing::ValuesIn(c_tests));
 
+const SadSkipMxNParam skip_c_tests[] = {
+  SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_c),
+  SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_c),
+  SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_c),
+  SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_c),
+  SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_c),
+  SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_c),
+  SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_c),
+  SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_c),
+  SadSkipMxNParam(8, 16, &vpx_sad_skip_8x16_c),
+  SadSkipMxNParam(8, 8, &vpx_sad_skip_8x8_c),
+  SadSkipMxNParam(4, 8, &vpx_sad_skip_4x8_c),
+#if CONFIG_VP9_HIGHBITDEPTH
+  SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_c, 8),
+  SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_c, 8),
+  SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_c, 8),
+  SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_c, 8),
+  SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_c, 8),
+  SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_c, 8),
+  SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_c, 8),
+  SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_c, 8),
+  SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_c, 8),
+  SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_c, 8),
+  SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_c, 8),
+  SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_c, 10),
+  SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_c, 10),
+  SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_c, 10),
+  SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_c, 10),
+  SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_c, 10),
+  SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_c, 10),
+  SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_c, 10),
+  SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_c, 10),
+  SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_c, 10),
+  SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_c, 10),
+  SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_c, 10),
+  SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_c, 12),
+  SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_c, 12),
+  SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_c, 12),
+  SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_c, 12),
+  SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_c, 12),
+  SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_c, 12),
+  SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_c, 12),
+  SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_c, 12),
+  SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_c, 12),
+  SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_c, 12),
+  SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_c, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(C, SADSkipTest, ::testing::ValuesIn(skip_c_tests));
+
 const SadMxNAvgParam avg_c_tests[] = {
   SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_c),
   SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_c),
@@ -746,23 +1021,56 @@ const SadMxNx4Param x4d_c_tests[] = {
 };
 INSTANTIATE_TEST_SUITE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests));
 
-// TODO(angiebird): implement the marked-down sad functions
-const SadMxNx8Param x8_c_tests[] = {
-  // SadMxNx8Param(64, 64, &vpx_sad64x64x8_c),
-  // SadMxNx8Param(64, 32, &vpx_sad64x32x8_c),
-  // SadMxNx8Param(32, 64, &vpx_sad32x64x8_c),
-  SadMxNx8Param(32, 32, &vpx_sad32x32x8_c),
-  // SadMxNx8Param(32, 16, &vpx_sad32x16x8_c),
-  // SadMxNx8Param(16, 32, &vpx_sad16x32x8_c),
-  SadMxNx8Param(16, 16, &vpx_sad16x16x8_c),
-  SadMxNx8Param(16, 8, &vpx_sad16x8x8_c),
-  SadMxNx8Param(8, 16, &vpx_sad8x16x8_c),
-  SadMxNx8Param(8, 8, &vpx_sad8x8x8_c),
-  // SadMxNx8Param(8, 4, &vpx_sad8x4x8_c),
-  // SadMxNx8Param(4, 8, &vpx_sad4x8x8_c),
-  SadMxNx8Param(4, 4, &vpx_sad4x4x8_c),
+const SadSkipMxNx4Param skip_x4d_c_tests[] = {
+  SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_c),
+  SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_c),
+  SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_c),
+  SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_c),
+  SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_c),
+  SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_c),
+  SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_c),
+  SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_c),
+  SadSkipMxNx4Param(8, 16, &vpx_sad_skip_8x16x4d_c),
+  SadSkipMxNx4Param(8, 8, &vpx_sad_skip_8x8x4d_c),
+  SadSkipMxNx4Param(4, 8, &vpx_sad_skip_4x8x4d_c),
+#if CONFIG_VP9_HIGHBITDEPTH
+  SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_c, 8),
+  SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_c, 8),
+  SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_c, 8),
+  SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_c, 8),
+  SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_c, 8),
+  SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_c, 8),
+  SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_c, 8),
+  SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_c, 8),
+  SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_c, 8),
+  SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_c, 8),
+  SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_c, 8),
+  SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_c, 10),
+  SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_c, 10),
+  SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_c, 10),
+  SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_c, 10),
+  SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_c, 10),
+  SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_c, 10),
+  SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_c, 10),
+  SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_c, 10),
+  SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_c, 10),
+  SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_c, 10),
+  SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_c, 10),
+  SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_c, 12),
+  SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_c, 12),
+  SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_c, 12),
+  SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_c, 12),
+  SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_c, 12),
+  SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_c, 12),
+  SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_c, 12),
+  SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_c, 12),
+  SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_c, 12),
+  SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_c, 12),
+  SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_c, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 };
-INSTANTIATE_TEST_SUITE_P(C, SADx8Test, ::testing::ValuesIn(x8_c_tests));
+INSTANTIATE_TEST_SUITE_P(C, SADSkipx4Test,
+                         ::testing::ValuesIn(skip_x4d_c_tests));
 
 //------------------------------------------------------------------------------
 // ARM functions
@@ -779,9 +1087,137 @@ const SadMxNParam neon_tests[] = {
   SadMxNParam(8, 4, &vpx_sad8x4_neon),
   SadMxNParam(4, 8, &vpx_sad4x8_neon),
   SadMxNParam(4, 4, &vpx_sad4x4_neon),
+#if CONFIG_VP9_HIGHBITDEPTH
+  SadMxNParam(4, 4, &vpx_highbd_sad4x4_neon, 8),
+  SadMxNParam(4, 8, &vpx_highbd_sad4x8_neon, 8),
+  SadMxNParam(8, 4, &vpx_highbd_sad8x4_neon, 8),
+  SadMxNParam(8, 8, &vpx_highbd_sad8x8_neon, 8),
+  SadMxNParam(8, 16, &vpx_highbd_sad8x16_neon, 8),
+  SadMxNParam(16, 8, &vpx_highbd_sad16x8_neon, 8),
+  SadMxNParam(16, 16, &vpx_highbd_sad16x16_neon, 8),
+  SadMxNParam(16, 32, &vpx_highbd_sad16x32_neon, 8),
+  SadMxNParam(32, 32, &vpx_highbd_sad32x32_neon, 8),
+  SadMxNParam(32, 64, &vpx_highbd_sad32x64_neon, 8),
+  SadMxNParam(64, 32, &vpx_highbd_sad64x32_neon, 8),
+  SadMxNParam(64, 64, &vpx_highbd_sad64x64_neon, 8),
+  SadMxNParam(4, 4, &vpx_highbd_sad4x4_neon, 10),
+  SadMxNParam(4, 8, &vpx_highbd_sad4x8_neon, 10),
+  SadMxNParam(8, 4, &vpx_highbd_sad8x4_neon, 10),
+  SadMxNParam(8, 8, &vpx_highbd_sad8x8_neon, 10),
+  SadMxNParam(8, 16, &vpx_highbd_sad8x16_neon, 10),
+  SadMxNParam(16, 8, &vpx_highbd_sad16x8_neon, 10),
+  SadMxNParam(16, 16, &vpx_highbd_sad16x16_neon, 10),
+  SadMxNParam(16, 32, &vpx_highbd_sad16x32_neon, 10),
+  SadMxNParam(32, 32, &vpx_highbd_sad32x32_neon, 10),
+  SadMxNParam(32, 64, &vpx_highbd_sad32x64_neon, 10),
+  SadMxNParam(64, 32, &vpx_highbd_sad64x32_neon, 10),
+  SadMxNParam(64, 64, &vpx_highbd_sad64x64_neon, 10),
+  SadMxNParam(4, 4, &vpx_highbd_sad4x4_neon, 12),
+  SadMxNParam(4, 8, &vpx_highbd_sad4x8_neon, 12),
+  SadMxNParam(8, 4, &vpx_highbd_sad8x4_neon, 12),
+  SadMxNParam(8, 8, &vpx_highbd_sad8x8_neon, 12),
+  SadMxNParam(8, 16, &vpx_highbd_sad8x16_neon, 12),
+  SadMxNParam(16, 8, &vpx_highbd_sad16x8_neon, 12),
+  SadMxNParam(16, 16, &vpx_highbd_sad16x16_neon, 12),
+  SadMxNParam(16, 32, &vpx_highbd_sad16x32_neon, 12),
+  SadMxNParam(32, 32, &vpx_highbd_sad32x32_neon, 12),
+  SadMxNParam(32, 64, &vpx_highbd_sad32x64_neon, 12),
+  SadMxNParam(64, 32, &vpx_highbd_sad64x32_neon, 12),
+  SadMxNParam(64, 64, &vpx_highbd_sad64x64_neon, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 };
 INSTANTIATE_TEST_SUITE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests));
 
+#if HAVE_NEON_DOTPROD
+const SadMxNParam neon_dotprod_tests[] = {
+  SadMxNParam(64, 64, &vpx_sad64x64_neon_dotprod),
+  SadMxNParam(64, 32, &vpx_sad64x32_neon_dotprod),
+  SadMxNParam(32, 64, &vpx_sad32x64_neon_dotprod),
+  SadMxNParam(32, 32, &vpx_sad32x32_neon_dotprod),
+  SadMxNParam(32, 16, &vpx_sad32x16_neon_dotprod),
+  SadMxNParam(16, 32, &vpx_sad16x32_neon_dotprod),
+  SadMxNParam(16, 16, &vpx_sad16x16_neon_dotprod),
+  SadMxNParam(16, 8, &vpx_sad16x8_neon_dotprod),
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADTest,
+                         ::testing::ValuesIn(neon_dotprod_tests));
+#endif  // HAVE_NEON_DOTPROD
+
+const SadSkipMxNParam skip_neon_tests[] = {
+  SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_neon),
+  SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_neon),
+  SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_neon),
+  SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_neon),
+  SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_neon),
+  SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_neon),
+  SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_neon),
+  SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_neon),
+  SadSkipMxNParam(8, 16, &vpx_sad_skip_8x16_neon),
+  SadSkipMxNParam(8, 8, &vpx_sad_skip_8x8_neon),
+  SadSkipMxNParam(8, 4, &vpx_sad_skip_8x4_neon),
+  SadSkipMxNParam(4, 8, &vpx_sad_skip_4x8_neon),
+  SadSkipMxNParam(4, 4, &vpx_sad_skip_4x4_neon),
+#if CONFIG_VP9_HIGHBITDEPTH
+  SadSkipMxNParam(4, 4, &vpx_highbd_sad_skip_4x4_neon, 8),
+  SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_neon, 8),
+  SadSkipMxNParam(8, 4, &vpx_highbd_sad_skip_8x4_neon, 8),
+  SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_neon, 8),
+  SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_neon, 8),
+  SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_neon, 8),
+  SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_neon, 8),
+  SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_neon, 8),
+  SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_neon, 8),
+  SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_neon, 8),
+  SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_neon, 8),
+  SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_neon, 8),
+  SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_neon, 8),
+  SadSkipMxNParam(4, 4, &vpx_highbd_sad_skip_4x4_neon, 10),
+  SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_neon, 10),
+  SadSkipMxNParam(8, 4, &vpx_highbd_sad_skip_8x4_neon, 10),
+  SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_neon, 10),
+  SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_neon, 10),
+  SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_neon, 10),
+  SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_neon, 10),
+  SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_neon, 10),
+  SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_neon, 10),
+  SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_neon, 10),
+  SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_neon, 10),
+  SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_neon, 10),
+  SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_neon, 10),
+  SadSkipMxNParam(4, 4, &vpx_highbd_sad_skip_4x4_neon, 12),
+  SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_neon, 12),
+  SadSkipMxNParam(8, 4, &vpx_highbd_sad_skip_8x4_neon, 12),
+  SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_neon, 12),
+  SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_neon, 12),
+  SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_neon, 12),
+  SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_neon, 12),
+  SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_neon, 12),
+  SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_neon, 12),
+  SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_neon, 12),
+  SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_neon, 12),
+  SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_neon, 12),
+  SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_neon, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(NEON, SADSkipTest,
+                         ::testing::ValuesIn(skip_neon_tests));
+
+#if HAVE_NEON_DOTPROD
+const SadSkipMxNParam skip_neon_dotprod_tests[] = {
+  SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_neon_dotprod),
+  SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_neon_dotprod),
+  SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_neon_dotprod),
+  SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_neon_dotprod),
+  SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_neon_dotprod),
+  SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_neon_dotprod),
+  SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_neon_dotprod),
+  SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_neon_dotprod),
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADSkipTest,
+                         ::testing::ValuesIn(skip_neon_dotprod_tests));
+#endif  // HAVE_NEON_DOTPROD
+
 const SadMxNAvgParam avg_neon_tests[] = {
   SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_neon),
   SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_neon),
@@ -796,9 +1232,65 @@ const SadMxNAvgParam avg_neon_tests[] = {
   SadMxNAvgParam(8, 4, &vpx_sad8x4_avg_neon),
   SadMxNAvgParam(4, 8, &vpx_sad4x8_avg_neon),
   SadMxNAvgParam(4, 4, &vpx_sad4x4_avg_neon),
+#if CONFIG_VP9_HIGHBITDEPTH
+  SadMxNAvgParam(4, 4, &vpx_highbd_sad4x4_avg_neon, 8),
+  SadMxNAvgParam(4, 8, &vpx_highbd_sad4x8_avg_neon, 8),
+  SadMxNAvgParam(8, 4, &vpx_highbd_sad8x4_avg_neon, 8),
+  SadMxNAvgParam(8, 8, &vpx_highbd_sad8x8_avg_neon, 8),
+  SadMxNAvgParam(8, 16, &vpx_highbd_sad8x16_avg_neon, 8),
+  SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_neon, 8),
+  SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_neon, 8),
+  SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_neon, 8),
+  SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_neon, 8),
+  SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_neon, 8),
+  SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_neon, 8),
+  SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_neon, 8),
+  SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_neon, 8),
+  SadMxNAvgParam(4, 4, &vpx_highbd_sad4x4_avg_neon, 10),
+  SadMxNAvgParam(4, 8, &vpx_highbd_sad4x8_avg_neon, 10),
+  SadMxNAvgParam(8, 4, &vpx_highbd_sad8x4_avg_neon, 10),
+  SadMxNAvgParam(8, 8, &vpx_highbd_sad8x8_avg_neon, 10),
+  SadMxNAvgParam(8, 16, &vpx_highbd_sad8x16_avg_neon, 10),
+  SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_neon, 10),
+  SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_neon, 10),
+  SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_neon, 10),
+  SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_neon, 10),
+  SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_neon, 10),
+  SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_neon, 10),
+  SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_neon, 10),
+  SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_neon, 10),
+  SadMxNAvgParam(4, 4, &vpx_highbd_sad4x4_avg_neon, 12),
+  SadMxNAvgParam(4, 8, &vpx_highbd_sad4x8_avg_neon, 12),
+  SadMxNAvgParam(8, 4, &vpx_highbd_sad8x4_avg_neon, 12),
+  SadMxNAvgParam(8, 8, &vpx_highbd_sad8x8_avg_neon, 12),
+  SadMxNAvgParam(8, 16, &vpx_highbd_sad8x16_avg_neon, 12),
+  SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_neon, 12),
+  SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_neon, 12),
+  SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_neon, 12),
+  SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_neon, 12),
+  SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_neon, 12),
+  SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_neon, 12),
+  SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_neon, 12),
+  SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_neon, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 };
 INSTANTIATE_TEST_SUITE_P(NEON, SADavgTest, ::testing::ValuesIn(avg_neon_tests));
 
+#if HAVE_NEON_DOTPROD
+const SadMxNAvgParam avg_neon_dotprod_tests[] = {
+  SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_neon_dotprod),
+  SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_neon_dotprod),
+  SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_neon_dotprod),
+  SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_neon_dotprod),
+  SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_neon_dotprod),
+  SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_neon_dotprod),
+  SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_neon_dotprod),
+  SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_neon_dotprod),
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADavgTest,
+                         ::testing::ValuesIn(avg_neon_dotprod_tests));
+#endif  // HAVE_NEON_DOTPROD
+
 const SadMxNx4Param x4d_neon_tests[] = {
   SadMxNx4Param(64, 64, &vpx_sad64x64x4d_neon),
   SadMxNx4Param(64, 32, &vpx_sad64x32x4d_neon),
@@ -813,8 +1305,132 @@ const SadMxNx4Param x4d_neon_tests[] = {
   SadMxNx4Param(8, 4, &vpx_sad8x4x4d_neon),
   SadMxNx4Param(4, 8, &vpx_sad4x8x4d_neon),
   SadMxNx4Param(4, 4, &vpx_sad4x4x4d_neon),
+#if CONFIG_VP9_HIGHBITDEPTH
+  SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_neon, 8),
+  SadMxNx4Param(4, 8, &vpx_highbd_sad4x8x4d_neon, 8),
+  SadMxNx4Param(8, 4, &vpx_highbd_sad8x4x4d_neon, 8),
+  SadMxNx4Param(8, 8, &vpx_highbd_sad8x8x4d_neon, 8),
+  SadMxNx4Param(8, 16, &vpx_highbd_sad8x16x4d_neon, 8),
+  SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_neon, 8),
+  SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_neon, 8),
+  SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_neon, 8),
+  SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_neon, 8),
+  SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_neon, 8),
+  SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_neon, 8),
+  SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_neon, 8),
+  SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_neon, 10),
+  SadMxNx4Param(4, 8, &vpx_highbd_sad4x8x4d_neon, 10),
+  SadMxNx4Param(8, 4, &vpx_highbd_sad8x4x4d_neon, 10),
+  SadMxNx4Param(8, 8, &vpx_highbd_sad8x8x4d_neon, 10),
+  SadMxNx4Param(8, 16, &vpx_highbd_sad8x16x4d_neon, 10),
+  SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_neon, 10),
+  SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_neon, 10),
+  SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_neon, 10),
+  SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_neon, 10),
+  SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_neon, 10),
+  SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_neon, 10),
+  SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_neon, 10),
+  SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_neon, 12),
+  SadMxNx4Param(4, 8, &vpx_highbd_sad4x8x4d_neon, 12),
+  SadMxNx4Param(8, 4, &vpx_highbd_sad8x4x4d_neon, 12),
+  SadMxNx4Param(8, 8, &vpx_highbd_sad8x8x4d_neon, 12),
+  SadMxNx4Param(8, 16, &vpx_highbd_sad8x16x4d_neon, 12),
+  SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_neon, 12),
+  SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_neon, 12),
+  SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_neon, 12),
+  SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_neon, 12),
+  SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_neon, 12),
+  SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_neon, 12),
+  SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_neon, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 };
 INSTANTIATE_TEST_SUITE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests));
+
+#if HAVE_NEON_DOTPROD
+const SadMxNx4Param x4d_neon_dotprod_tests[] = {
+  SadMxNx4Param(64, 64, &vpx_sad64x64x4d_neon_dotprod),
+  SadMxNx4Param(64, 32, &vpx_sad64x32x4d_neon_dotprod),
+  SadMxNx4Param(32, 64, &vpx_sad32x64x4d_neon_dotprod),
+  SadMxNx4Param(32, 32, &vpx_sad32x32x4d_neon_dotprod),
+  SadMxNx4Param(32, 16, &vpx_sad32x16x4d_neon_dotprod),
+  SadMxNx4Param(16, 32, &vpx_sad16x32x4d_neon_dotprod),
+  SadMxNx4Param(16, 16, &vpx_sad16x16x4d_neon_dotprod),
+  SadMxNx4Param(16, 8, &vpx_sad16x8x4d_neon_dotprod),
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADx4Test,
+                         ::testing::ValuesIn(x4d_neon_dotprod_tests));
+#endif  // HAVE_NEON_DOTPROD
+
+const SadSkipMxNx4Param skip_x4d_neon_tests[] = {
+  SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_neon),
+  SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_neon),
+  SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_neon),
+  SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_neon),
+  SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_neon),
+  SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_neon),
+  SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_neon),
+  SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_neon),
+  SadSkipMxNx4Param(8, 16, &vpx_sad_skip_8x16x4d_neon),
+  SadSkipMxNx4Param(8, 8, &vpx_sad_skip_8x8x4d_neon),
+  SadSkipMxNx4Param(8, 4, &vpx_sad_skip_8x4x4d_neon),
+  SadSkipMxNx4Param(4, 8, &vpx_sad_skip_4x8x4d_neon),
+  SadSkipMxNx4Param(4, 4, &vpx_sad_skip_4x4x4d_neon),
+#if CONFIG_VP9_HIGHBITDEPTH
+  SadSkipMxNx4Param(4, 4, &vpx_highbd_sad_skip_4x4x4d_neon, 8),
+  SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_neon, 8),
+  SadSkipMxNx4Param(8, 4, &vpx_highbd_sad_skip_8x4x4d_neon, 8),
+  SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_neon, 8),
+  SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_neon, 8),
+  SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_neon, 8),
+  SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_neon, 8),
+  SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_neon, 8),
+  SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_neon, 8),
+  SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_neon, 8),
+  SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_neon, 8),
+  SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_neon, 8),
+  SadSkipMxNx4Param(4, 4, &vpx_highbd_sad_skip_4x4x4d_neon, 10),
+  SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_neon, 10),
+  SadSkipMxNx4Param(8, 4, &vpx_highbd_sad_skip_8x4x4d_neon, 10),
+  SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_neon, 10),
+  SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_neon, 10),
+  SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_neon, 10),
+  SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_neon, 10),
+  SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_neon, 10),
+  SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_neon, 10),
+  SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_neon, 10),
+  SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_neon, 10),
+  SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_neon, 10),
+  SadSkipMxNx4Param(4, 4, &vpx_highbd_sad_skip_4x4x4d_neon, 12),
+  SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_neon, 12),
+  SadSkipMxNx4Param(8, 4, &vpx_highbd_sad_skip_8x4x4d_neon, 12),
+  SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_neon, 12),
+  SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_neon, 12),
+  SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_neon, 12),
+  SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_neon, 12),
+  SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_neon, 12),
+  SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_neon, 12),
+  SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_neon, 12),
+  SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_neon, 12),
+  SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_neon, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(NEON, SADSkipx4Test,
+                         ::testing::ValuesIn(skip_x4d_neon_tests));
+
+#if HAVE_NEONE_DOTPROD
+const SadSkipMxNx4Param skip_x4d_neon_dotprod_tests[] = {
+  SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_neon_dotprod),
+  SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_neon_dotprod),
+  SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_neon_dotprod),
+  SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_neon_dotprod),
+  SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_neon_dotprod),
+  SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_neon_dotprod),
+  SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_neon_dotprod),
+  SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_neon_dotprod),
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADSkipx4Test,
+                         ::testing::ValuesIn(skip_x4d_neon_dotprod_tests));
+#endif  // HAVE_NEON_DOTPROD
 #endif  // HAVE_NEON
 
 //------------------------------------------------------------------------------
@@ -872,6 +1488,54 @@ const SadMxNParam sse2_tests[] = {
 };
 INSTANTIATE_TEST_SUITE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests));
 
+const SadSkipMxNParam skip_sse2_tests[] = {
+  SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_sse2),
+  SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_sse2),
+  SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_sse2),
+  SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_sse2),
+  SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_sse2),
+  SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_sse2),
+  SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_sse2),
+  SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_sse2),
+  SadSkipMxNParam(8, 16, &vpx_sad_skip_8x16_sse2),
+  SadSkipMxNParam(8, 8, &vpx_sad_skip_8x8_sse2),
+  SadSkipMxNParam(4, 8, &vpx_sad_skip_4x8_sse2),
+#if CONFIG_VP9_HIGHBITDEPTH
+  SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_sse2, 8),
+  SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_sse2, 8),
+  SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_sse2, 8),
+  SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_sse2, 8),
+  SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_sse2, 8),
+  SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_sse2, 8),
+  SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_sse2, 8),
+  SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_sse2, 8),
+  SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_sse2, 8),
+  SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_sse2, 8),
+  SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_sse2, 10),
+  SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_sse2, 10),
+  SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_sse2, 10),
+  SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_sse2, 10),
+  SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_sse2, 10),
+  SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_sse2, 10),
+  SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_sse2, 10),
+  SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_sse2, 10),
+  SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_sse2, 10),
+  SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_sse2, 10),
+  SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_sse2, 12),
+  SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_sse2, 12),
+  SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_sse2, 12),
+  SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_sse2, 12),
+  SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_sse2, 12),
+  SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_sse2, 12),
+  SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_sse2, 12),
+  SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_sse2, 12),
+  SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_sse2, 12),
+  SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_sse2, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, SADSkipTest,
+                         ::testing::ValuesIn(skip_sse2_tests));
+
 const SadMxNAvgParam avg_sse2_tests[] = {
   SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_sse2),
   SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_sse2),
@@ -981,6 +1645,57 @@ const SadMxNx4Param x4d_sse2_tests[] = {
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 };
 INSTANTIATE_TEST_SUITE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests));
+
+const SadSkipMxNx4Param skip_x4d_sse2_tests[] = {
+  SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_sse2),
+  SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_sse2),
+  SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_sse2),
+  SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_sse2),
+  SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_sse2),
+  SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_sse2),
+  SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_sse2),
+  SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_sse2),
+  SadSkipMxNx4Param(8, 16, &vpx_sad_skip_8x16x4d_sse2),
+  SadSkipMxNx4Param(8, 8, &vpx_sad_skip_8x8x4d_sse2),
+  SadSkipMxNx4Param(4, 8, &vpx_sad_skip_4x8x4d_sse2),
+#if CONFIG_VP9_HIGHBITDEPTH
+  SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_sse2, 8),
+  SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_sse2, 8),
+  SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_sse2, 8),
+  SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_sse2, 8),
+  SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_sse2, 8),
+  SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_sse2, 8),
+  SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_sse2, 8),
+  SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_sse2, 8),
+  SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_sse2, 8),
+  SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_sse2, 8),
+  SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_sse2, 8),
+  SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_sse2, 10),
+  SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_sse2, 10),
+  SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_sse2, 10),
+  SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_sse2, 10),
+  SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_sse2, 10),
+  SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_sse2, 10),
+  SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_sse2, 10),
+  SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_sse2, 10),
+  SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_sse2, 10),
+  SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_sse2, 10),
+  SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_sse2, 10),
+  SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_sse2, 12),
+  SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_sse2, 12),
+  SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_sse2, 12),
+  SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_sse2, 12),
+  SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_sse2, 12),
+  SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_sse2, 12),
+  SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_sse2, 12),
+  SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_sse2, 12),
+  SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_sse2, 12),
+  SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_sse2, 12),
+  SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_sse2, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, SADSkipx4Test,
+                         ::testing::ValuesIn(skip_x4d_sse2_tests));
 #endif  // HAVE_SSE2
 
 #if HAVE_SSE3
@@ -991,18 +1706,6 @@ INSTANTIATE_TEST_SUITE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests));
 // Only functions are x3, which do not have tests.
 #endif  // HAVE_SSSE3
 
-#if HAVE_SSE4_1
-const SadMxNx8Param x8_sse4_1_tests[] = {
-  SadMxNx8Param(16, 16, &vpx_sad16x16x8_sse4_1),
-  SadMxNx8Param(16, 8, &vpx_sad16x8x8_sse4_1),
-  SadMxNx8Param(8, 16, &vpx_sad8x16x8_sse4_1),
-  SadMxNx8Param(8, 8, &vpx_sad8x8x8_sse4_1),
-  SadMxNx8Param(4, 4, &vpx_sad4x4x8_sse4_1),
-};
-INSTANTIATE_TEST_SUITE_P(SSE4_1, SADx8Test,
-                         ::testing::ValuesIn(x8_sse4_1_tests));
-#endif  // HAVE_SSE4_1
-
 #if HAVE_AVX2
 const SadMxNParam avx2_tests[] = {
   SadMxNParam(64, 64, &vpx_sad64x64_avx2),
@@ -1010,29 +1713,178 @@ const SadMxNParam avx2_tests[] = {
   SadMxNParam(32, 64, &vpx_sad32x64_avx2),
   SadMxNParam(32, 32, &vpx_sad32x32_avx2),
   SadMxNParam(32, 16, &vpx_sad32x16_avx2),
+#if CONFIG_VP9_HIGHBITDEPTH
+  SadMxNParam(64, 64, &vpx_highbd_sad64x64_avx2, 8),
+  SadMxNParam(64, 32, &vpx_highbd_sad64x32_avx2, 8),
+  SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 8),
+  SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 8),
+  SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 8),
+  SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 8),
+  SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 8),
+  SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 8),
+
+  SadMxNParam(64, 64, &vpx_highbd_sad64x64_avx2, 10),
+  SadMxNParam(64, 32, &vpx_highbd_sad64x32_avx2, 10),
+  SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 10),
+  SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 10),
+  SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 10),
+  SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 10),
+  SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 10),
+  SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 10),
+
+  SadMxNParam(64, 64, &vpx_highbd_sad64x64_avx2, 12),
+  SadMxNParam(64, 32, &vpx_highbd_sad64x32_avx2, 12),
+  SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 12),
+  SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 12),
+  SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 12),
+  SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 12),
+  SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 12),
+  SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 };
 INSTANTIATE_TEST_SUITE_P(AVX2, SADTest, ::testing::ValuesIn(avx2_tests));
 
+const SadSkipMxNParam skip_avx2_tests[] = {
+  SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_avx2),
+  SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_avx2),
+  SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_avx2),
+  SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_avx2),
+  SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_avx2),
+#if CONFIG_VP9_HIGHBITDEPTH
+  SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_avx2, 8),
+  SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_avx2, 8),
+  SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_avx2, 8),
+  SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_avx2, 8),
+  SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_avx2, 8),
+  SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_avx2, 8),
+  SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_avx2, 8),
+  SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_avx2, 8),
+
+  SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_avx2, 10),
+  SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_avx2, 10),
+  SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_avx2, 10),
+  SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_avx2, 10),
+  SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_avx2, 10),
+  SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_avx2, 10),
+  SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_avx2, 10),
+  SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_avx2, 10),
+
+  SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_avx2, 12),
+  SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_avx2, 12),
+  SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_avx2, 12),
+  SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_avx2, 12),
+  SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_avx2, 12),
+  SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_avx2, 12),
+  SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_avx2, 12),
+  SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_avx2, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, SADSkipTest,
+                         ::testing::ValuesIn(skip_avx2_tests));
+
 const SadMxNAvgParam avg_avx2_tests[] = {
   SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_avx2),
   SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_avx2),
   SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_avx2),
   SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_avx2),
   SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_avx2),
+#if CONFIG_VP9_HIGHBITDEPTH
+  SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_avx2, 8),
+  SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_avx2, 8),
+  SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_avx2, 8),
+  SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_avx2, 8),
+  SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_avx2, 8),
+  SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_avx2, 8),
+  SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_avx2, 8),
+  SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_avx2, 8),
+  SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_avx2, 10),
+  SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_avx2, 10),
+  SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_avx2, 10),
+  SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_avx2, 10),
+  SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_avx2, 10),
+  SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_avx2, 10),
+  SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_avx2, 10),
+  SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_avx2, 10),
+  SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_avx2, 12),
+  SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_avx2, 12),
+  SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_avx2, 12),
+  SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_avx2, 12),
+  SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_avx2, 12),
+  SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_avx2, 12),
+  SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_avx2, 12),
+  SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_avx2, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 };
 INSTANTIATE_TEST_SUITE_P(AVX2, SADavgTest, ::testing::ValuesIn(avg_avx2_tests));
 
 const SadMxNx4Param x4d_avx2_tests[] = {
   SadMxNx4Param(64, 64, &vpx_sad64x64x4d_avx2),
   SadMxNx4Param(32, 32, &vpx_sad32x32x4d_avx2),
+#if CONFIG_VP9_HIGHBITDEPTH
+  SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_avx2, 8),
+  SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_avx2, 8),
+  SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_avx2, 8),
+  SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_avx2, 8),
+  SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_avx2, 8),
+  SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 8),
+  SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 8),
+  SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 8),
+  SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_avx2, 10),
+  SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_avx2, 10),
+  SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_avx2, 10),
+  SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_avx2, 10),
+  SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_avx2, 10),
+  SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 10),
+  SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 10),
+  SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 10),
+  SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_avx2, 12),
+  SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_avx2, 12),
+  SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_avx2, 12),
+  SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_avx2, 12),
+  SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_avx2, 12),
+  SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 12),
+  SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 12),
+  SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 };
 INSTANTIATE_TEST_SUITE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests));
 
-const SadMxNx8Param x8_avx2_tests[] = {
-  // SadMxNx8Param(64, 64, &vpx_sad64x64x8_c),
-  SadMxNx8Param(32, 32, &vpx_sad32x32x8_avx2),
+const SadSkipMxNx4Param skip_x4d_avx2_tests[] = {
+  SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_avx2),
+  SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_avx2),
+  SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_avx2),
+  SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_avx2),
+  SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_avx2),
+#if CONFIG_VP9_HIGHBITDEPTH
+  SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_avx2, 8),
+  SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_avx2, 8),
+  SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_avx2, 8),
+  SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_avx2, 8),
+  SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_avx2, 8),
+  SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_avx2, 8),
+  SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_avx2, 8),
+  SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_avx2, 8),
+  SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_avx2, 10),
+  SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_avx2, 10),
+  SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_avx2, 10),
+  SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_avx2, 10),
+  SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_avx2, 10),
+  SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_avx2, 10),
+  SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_avx2, 10),
+  SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_avx2, 10),
+  SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_avx2, 12),
+  SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_avx2, 12),
+  SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_avx2, 12),
+  SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_avx2, 12),
+  SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_avx2, 12),
+  SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_avx2, 12),
+  SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_avx2, 12),
+  SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_avx2, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 };
-INSTANTIATE_TEST_SUITE_P(AVX2, SADx8Test, ::testing::ValuesIn(x8_avx2_tests));
+INSTANTIATE_TEST_SUITE_P(AVX2, SADSkipx4Test,
+                         ::testing::ValuesIn(skip_x4d_avx2_tests));
+
 #endif  // HAVE_AVX2
 
 #if HAVE_AVX512
@@ -1195,4 +2047,33 @@ const SadMxNx4Param x4d_mmi_tests[] = {
 };
 INSTANTIATE_TEST_SUITE_P(MMI, SADx4Test, ::testing::ValuesIn(x4d_mmi_tests));
 #endif  // HAVE_MMI
+
+//------------------------------------------------------------------------------
+// loongarch functions
+#if HAVE_LSX
+const SadMxNParam lsx_tests[] = {
+  SadMxNParam(64, 64, &vpx_sad64x64_lsx),
+  SadMxNParam(32, 32, &vpx_sad32x32_lsx),
+  SadMxNParam(16, 16, &vpx_sad16x16_lsx),
+  SadMxNParam(8, 8, &vpx_sad8x8_lsx),
+};
+INSTANTIATE_TEST_SUITE_P(LSX, SADTest, ::testing::ValuesIn(lsx_tests));
+
+const SadMxNAvgParam avg_lsx_tests[] = {
+  SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_lsx),
+  SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_lsx),
+};
+INSTANTIATE_TEST_SUITE_P(LSX, SADavgTest, ::testing::ValuesIn(avg_lsx_tests));
+
+const SadMxNx4Param x4d_lsx_tests[] = {
+  SadMxNx4Param(64, 64, &vpx_sad64x64x4d_lsx),
+  SadMxNx4Param(64, 32, &vpx_sad64x32x4d_lsx),
+  SadMxNx4Param(32, 64, &vpx_sad32x64x4d_lsx),
+  SadMxNx4Param(32, 32, &vpx_sad32x32x4d_lsx),
+  SadMxNx4Param(16, 16, &vpx_sad16x16x4d_lsx),
+  SadMxNx4Param(8, 8, &vpx_sad8x8x4d_lsx),
+};
+INSTANTIATE_TEST_SUITE_P(LSX, SADx4Test, ::testing::ValuesIn(x4d_lsx_tests));
+#endif  // HAVE_LSX
+
 }  // namespace
diff --git a/test/set_roi.cc b/test/set_roi.cc
index f639547523..693410e391 100644
--- a/test/set_roi.cc
+++ b/test/set_roi.cc
@@ -40,7 +40,7 @@ TEST(VP8RoiMapTest, ParameterCheck) {
 
   // Initialize elements of cpi with valid defaults.
   VP8_COMP cpi;
-  cpi.mb.e_mbd.mb_segement_abs_delta = SEGMENT_DELTADATA;
+  cpi.mb.e_mbd.mb_segment_abs_delta = SEGMENT_DELTADATA;
   cpi.cyclic_refresh_mode_enabled = 0;
   cpi.mb.e_mbd.segmentation_enabled = 0;
   cpi.mb.e_mbd.update_mb_segmentation_map = 0;
@@ -161,6 +161,6 @@ TEST(VP8RoiMapTest, ParameterCheck) {
   // Free allocated memory
   if (cpi.segmentation_map) vpx_free(cpi.segmentation_map);
   if (roi_map) vpx_free(roi_map);
-};
+}
 
 }  // namespace
diff --git a/test/simple_encode_test.cc b/test/simple_encode_test.cc
index f37a6d0764..01fc258566 100644
--- a/test/simple_encode_test.cc
+++ b/test/simple_encode_test.cc
@@ -10,22 +10,15 @@
 
 #include <math.h>
 #include <memory>
+#include <string>
 #include <vector>
 #include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/video_source.h"
 #include "vp9/simple_encode.h"
 
 namespace vp9 {
 namespace {
 
-// TODO(angirbid): Find a better way to construct encode info
-const int w = 352;
-const int h = 288;
-const int frame_rate_num = 30;
-const int frame_rate_den = 1;
-const int target_bitrate = 1000;
-const int num_frames = 17;
-const char infile_path[] = "bus_352x288_420_f20_b8.yuv";
-
 double GetBitrateInKbps(size_t bit_size, int num_frames, int frame_rate_num,
                         int frame_rate_den) {
   return static_cast<double>(bit_size) / num_frames * frame_rate_num /
@@ -36,14 +29,28 @@ double GetBitrateInKbps(size_t bit_size, int num_frames, int frame_rate_num,
 // For example, if size is 7, return 2.
 int GetNumUnit4x4(int size) { return (size + 3) >> 2; }
 
-TEST(SimpleEncode, ComputeFirstPassStats) {
-  SimpleEncode simple_encode(w, h, frame_rate_num, frame_rate_den,
-                             target_bitrate, num_frames, infile_path);
+class SimpleEncodeTest : public ::testing::Test {
+ protected:
+  const int width_ = 352;
+  const int height_ = 288;
+  const int frame_rate_num_ = 30;
+  const int frame_rate_den_ = 1;
+  const int target_bitrate_ = 1000;
+  const int num_frames_ = 17;
+  const int target_level_ = LEVEL_UNKNOWN;
+  const std::string in_file_path_str_ =
+      libvpx_test::GetDataPath() + "/bus_352x288_420_f20_b8.yuv";
+};
+
+TEST_F(SimpleEncodeTest, ComputeFirstPassStats) {
+  SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_,
+                             target_bitrate_, num_frames_, target_level_,
+                             in_file_path_str_.c_str());
   simple_encode.ComputeFirstPassStats();
   std::vector<std::vector<double>> frame_stats =
       simple_encode.ObserveFirstPassStats();
-  EXPECT_EQ(frame_stats.size(), static_cast<size_t>(num_frames));
-  size_t data_num = frame_stats[0].size();
+  EXPECT_EQ(frame_stats.size(), static_cast<size_t>(num_frames_));
+  const size_t data_num = frame_stats[0].size();
   // Read ObserveFirstPassStats before changing FIRSTPASS_STATS.
   EXPECT_EQ(data_num, static_cast<size_t>(25));
   for (size_t i = 0; i < frame_stats.size(); ++i) {
@@ -56,25 +63,49 @@ TEST(SimpleEncode, ComputeFirstPassStats) {
   }
 }
 
-TEST(SimpleEncode, GetCodingFrameNum) {
-  SimpleEncode simple_encode(w, h, frame_rate_num, frame_rate_den,
-                             target_bitrate, num_frames, infile_path);
+TEST_F(SimpleEncodeTest, ObserveFirstPassMotionVectors) {
+  SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_,
+                             target_bitrate_, num_frames_, target_level_,
+                             in_file_path_str_.c_str());
   simple_encode.ComputeFirstPassStats();
-  int num_coding_frames = simple_encode.GetCodingFrameNum();
+  std::vector<std::vector<MotionVectorInfo>> fps_motion_vectors =
+      simple_encode.ObserveFirstPassMotionVectors();
+  EXPECT_EQ(fps_motion_vectors.size(), static_cast<size_t>(num_frames_));
+  const size_t num_blocks = ((width_ + 15) >> 4) * ((height_ + 15) >> 4);
+  EXPECT_EQ(num_blocks, fps_motion_vectors[0].size());
+  for (size_t i = 0; i < fps_motion_vectors.size(); ++i) {
+    EXPECT_EQ(num_blocks, fps_motion_vectors[i].size());
+    for (size_t j = 0; j < num_blocks; ++j) {
+      const int mv_count = fps_motion_vectors[i][j].mv_count;
+      const int ref_count =
+          (fps_motion_vectors[i][j].ref_frame[0] != kRefFrameTypeNone) +
+          (fps_motion_vectors[i][j].ref_frame[1] != kRefFrameTypeNone);
+      EXPECT_EQ(mv_count, ref_count);
+    }
+  }
+}
+
+TEST_F(SimpleEncodeTest, GetCodingFrameNum) {
+  SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_,
+                             target_bitrate_, num_frames_, target_level_,
+                             in_file_path_str_.c_str());
+  simple_encode.ComputeFirstPassStats();
+  const int num_coding_frames = simple_encode.GetCodingFrameNum();
   EXPECT_EQ(num_coding_frames, 19);
 }
 
-TEST(SimpleEncode, EncodeFrame) {
-  SimpleEncode simple_encode(w, h, frame_rate_num, frame_rate_den,
-                             target_bitrate, num_frames, infile_path);
+TEST_F(SimpleEncodeTest, EncodeFrame) {
+  SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_,
+                             target_bitrate_, num_frames_, target_level_,
+                             in_file_path_str_.c_str());
   simple_encode.ComputeFirstPassStats();
   int num_coding_frames = simple_encode.GetCodingFrameNum();
-  EXPECT_GE(num_coding_frames, num_frames);
+  EXPECT_GE(num_coding_frames, num_frames_);
   simple_encode.StartEncode();
   size_t total_data_bit_size = 0;
   int coded_show_frame_count = 0;
   int frame_coding_index = 0;
-  while (coded_show_frame_count < num_frames) {
+  while (coded_show_frame_count < num_frames_) {
     const GroupOfPicture group_of_picture =
         simple_encode.ObserveGroupOfPicture();
     const std::vector<EncodeFrameInfo> &encode_frame_list =
@@ -99,22 +130,23 @@ TEST(SimpleEncode, EncodeFrame) {
     }
     coded_show_frame_count += group_of_picture.show_frame_count;
   }
-  const double bitrate = GetBitrateInKbps(total_data_bit_size, num_frames,
-                                          frame_rate_num, frame_rate_den);
+  const double bitrate = GetBitrateInKbps(total_data_bit_size, num_frames_,
+                                          frame_rate_num_, frame_rate_den_);
   const double off_target_threshold = 150;
-  EXPECT_LE(fabs(target_bitrate - bitrate), off_target_threshold);
+  EXPECT_LE(fabs(target_bitrate_ - bitrate), off_target_threshold);
   simple_encode.EndEncode();
 }
 
-TEST(SimpleEncode, ObserveKeyFrameMap) {
-  SimpleEncode simple_encode(w, h, frame_rate_num, frame_rate_den,
-                             target_bitrate, num_frames, infile_path);
+TEST_F(SimpleEncodeTest, ObserveKeyFrameMap) {
+  SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_,
+                             target_bitrate_, num_frames_, target_level_,
+                             in_file_path_str_.c_str());
   simple_encode.ComputeFirstPassStats();
   std::vector<int> key_frame_map = simple_encode.ObserveKeyFrameMap();
-  EXPECT_EQ(key_frame_map.size(), static_cast<size_t>(num_frames));
+  EXPECT_EQ(key_frame_map.size(), static_cast<size_t>(num_frames_));
   simple_encode.StartEncode();
   int coded_show_frame_count = 0;
-  while (coded_show_frame_count < num_frames) {
+  while (coded_show_frame_count < num_frames_) {
     const GroupOfPicture group_of_picture =
         simple_encode.ObserveGroupOfPicture();
     const std::vector<EncodeFrameInfo> &encode_frame_list =
@@ -134,11 +166,50 @@ TEST(SimpleEncode, ObserveKeyFrameMap) {
   simple_encode.EndEncode();
 }
 
-TEST(SimpleEncode, EncodeFrameWithQuantizeIndex) {
-  SimpleEncode simple_encode(w, h, frame_rate_num, frame_rate_den,
-                             target_bitrate, num_frames, infile_path);
+TEST_F(SimpleEncodeTest, EncodeFrameWithTargetFrameBits) {
+  SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_,
+                             target_bitrate_, num_frames_, target_level_,
+                             in_file_path_str_.c_str());
   simple_encode.ComputeFirstPassStats();
-  int num_coding_frames = simple_encode.GetCodingFrameNum();
+  const int num_coding_frames = simple_encode.GetCodingFrameNum();
+  simple_encode.StartEncode();
+  for (int i = 0; i < num_coding_frames; ++i) {
+    EncodeFrameInfo encode_frame_info = simple_encode.GetNextEncodeFrameInfo();
+    int target_frame_bits;
+    switch (encode_frame_info.frame_type) {
+      case kFrameTypeInter: target_frame_bits = 20000; break;
+      case kFrameTypeKey:
+      case kFrameTypeAltRef:
+      case kFrameTypeGolden: target_frame_bits = 100000; break;
+      case kFrameTypeOverlay: target_frame_bits = 2000; break;
+      default: target_frame_bits = 20000;
+    }
+
+    double percent_diff = 15;
+    if (encode_frame_info.frame_type == kFrameTypeOverlay) {
+      percent_diff = 100;
+    }
+    EncodeFrameResult encode_frame_result;
+    simple_encode.EncodeFrameWithTargetFrameBits(
+        &encode_frame_result, target_frame_bits, percent_diff);
+    const int recode_count = encode_frame_result.recode_count;
+    // TODO(angiebird): Replace 7 by RATE_CTRL_MAX_RECODE_NUM
+    EXPECT_LE(recode_count, 7);
+    EXPECT_GE(recode_count, 1);
+
+    const double diff = fabs((double)encode_frame_result.coding_data_bit_size -
+                             target_frame_bits);
+    EXPECT_LE(diff * 100 / target_frame_bits, percent_diff);
+  }
+  simple_encode.EndEncode();
+}
+
+TEST_F(SimpleEncodeTest, EncodeFrameWithQuantizeIndex) {
+  SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_,
+                             target_bitrate_, num_frames_, target_level_,
+                             in_file_path_str_.c_str());
+  simple_encode.ComputeFirstPassStats();
+  const int num_coding_frames = simple_encode.GetCodingFrameNum();
   simple_encode.StartEncode();
   for (int i = 0; i < num_coding_frames; ++i) {
     const int assigned_quantize_index = 100 + i;
@@ -150,15 +221,24 @@ TEST(SimpleEncode, EncodeFrameWithQuantizeIndex) {
   simple_encode.EndEncode();
 }
 
-TEST(SimpleEncode, EncodeConsistencyTest) {
+// This test encodes the video using EncodeFrame(), where quantize indexes
+// are selected by vp9 rate control.
+// Encode stats and the quantize_indexes are collected.
+// Then the test encodes the video again using EncodeFrameWithQuantizeIndex()
+// using the quantize indexes collected from the first run.
+// Then test whether the encode stats of the two encoding runs match.
+TEST_F(SimpleEncodeTest, EncodeConsistencyTest) {
   std::vector<int> quantize_index_list;
   std::vector<uint64_t> ref_sse_list;
   std::vector<double> ref_psnr_list;
   std::vector<size_t> ref_bit_size_list;
+  std::vector<FrameType> ref_frame_type_list;
+  std::vector<int> ref_show_idx_list;
   {
     // The first encode.
-    SimpleEncode simple_encode(w, h, frame_rate_num, frame_rate_den,
-                               target_bitrate, num_frames, infile_path);
+    SimpleEncode simple_encode(width_, height_, frame_rate_num_,
+                               frame_rate_den_, target_bitrate_, num_frames_,
+                               target_level_, in_file_path_str_.c_str());
     simple_encode.ComputeFirstPassStats();
     const int num_coding_frames = simple_encode.GetCodingFrameNum();
     simple_encode.StartEncode();
@@ -169,13 +249,16 @@ TEST(SimpleEncode, EncodeConsistencyTest) {
       ref_sse_list.push_back(encode_frame_result.sse);
       ref_psnr_list.push_back(encode_frame_result.psnr);
       ref_bit_size_list.push_back(encode_frame_result.coding_data_bit_size);
+      ref_frame_type_list.push_back(encode_frame_result.frame_type);
+      ref_show_idx_list.push_back(encode_frame_result.show_idx);
     }
     simple_encode.EndEncode();
   }
   {
     // The second encode with quantize index got from the first encode.
-    SimpleEncode simple_encode(w, h, frame_rate_num, frame_rate_den,
-                               target_bitrate, num_frames, infile_path);
+    SimpleEncode simple_encode(width_, height_, frame_rate_num_,
+                               frame_rate_den_, target_bitrate_, num_frames_,
+                               target_level_, in_file_path_str_.c_str());
     simple_encode.ComputeFirstPassStats();
     const int num_coding_frames = simple_encode.GetCodingFrameNum();
     EXPECT_EQ(static_cast<size_t>(num_coding_frames),
@@ -189,6 +272,8 @@ TEST(SimpleEncode, EncodeConsistencyTest) {
       EXPECT_EQ(encode_frame_result.sse, ref_sse_list[i]);
       EXPECT_DOUBLE_EQ(encode_frame_result.psnr, ref_psnr_list[i]);
       EXPECT_EQ(encode_frame_result.coding_data_bit_size, ref_bit_size_list[i]);
+      EXPECT_EQ(encode_frame_result.frame_type, ref_frame_type_list[i]);
+      EXPECT_EQ(encode_frame_result.show_idx, ref_show_idx_list[i]);
     }
     simple_encode.EndEncode();
   }
@@ -196,13 +281,14 @@ TEST(SimpleEncode, EncodeConsistencyTest) {
 
 // Test the information (partition info and motion vector info) stored in
 // encoder is the same between two encode runs.
-TEST(SimpleEncode, EncodeConsistencyTest2) {
-  const int num_rows_4x4 = GetNumUnit4x4(w);
-  const int num_cols_4x4 = GetNumUnit4x4(h);
+TEST_F(SimpleEncodeTest, EncodeConsistencyTest2) {
+  const int num_rows_4x4 = GetNumUnit4x4(width_);
+  const int num_cols_4x4 = GetNumUnit4x4(height_);
   const int num_units_4x4 = num_rows_4x4 * num_cols_4x4;
   // The first encode.
-  SimpleEncode simple_encode(w, h, frame_rate_num, frame_rate_den,
-                             target_bitrate, num_frames, infile_path);
+  SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_,
+                             target_bitrate_, num_frames_, target_level_,
+                             in_file_path_str_.c_str());
   simple_encode.ComputeFirstPassStats();
   const int num_coding_frames = simple_encode.GetCodingFrameNum();
   std::vector<PartitionInfo> partition_info_list(num_units_4x4 *
@@ -222,8 +308,9 @@ TEST(SimpleEncode, EncodeConsistencyTest2) {
   }
   simple_encode.EndEncode();
   // The second encode.
-  SimpleEncode simple_encode_2(w, h, frame_rate_num, frame_rate_den,
-                               target_bitrate, num_frames, infile_path);
+  SimpleEncode simple_encode_2(width_, height_, frame_rate_num_,
+                               frame_rate_den_, target_bitrate_, num_frames_,
+                               target_level_, in_file_path_str_.c_str());
   simple_encode_2.ComputeFirstPassStats();
   const int num_coding_frames_2 = simple_encode_2.GetCodingFrameNum();
   simple_encode_2.StartEncode();
@@ -264,14 +351,15 @@ TEST(SimpleEncode, EncodeConsistencyTest2) {
 }
 
 // Test the information stored in encoder is the same between two encode runs.
-TEST(SimpleEncode, EncodeConsistencyTest3) {
+TEST_F(SimpleEncodeTest, EncodeConsistencyTest3) {
   std::vector<int> quantize_index_list;
-  const int num_rows_4x4 = GetNumUnit4x4(w);
-  const int num_cols_4x4 = GetNumUnit4x4(h);
+  const int num_rows_4x4 = GetNumUnit4x4(width_);
+  const int num_cols_4x4 = GetNumUnit4x4(height_);
   const int num_units_4x4 = num_rows_4x4 * num_cols_4x4;
   // The first encode.
-  SimpleEncode simple_encode(w, h, frame_rate_num, frame_rate_den,
-                             target_bitrate, num_frames, infile_path);
+  SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_,
+                             target_bitrate_, num_frames_, target_level_,
+                             in_file_path_str_.c_str());
   simple_encode.ComputeFirstPassStats();
   const int num_coding_frames = simple_encode.GetCodingFrameNum();
   std::vector<PartitionInfo> partition_info_list(num_units_4x4 *
@@ -288,8 +376,9 @@ TEST(SimpleEncode, EncodeConsistencyTest3) {
   }
   simple_encode.EndEncode();
   // The second encode.
-  SimpleEncode simple_encode_2(w, h, frame_rate_num, frame_rate_den,
-                               target_bitrate, num_frames, infile_path);
+  SimpleEncode simple_encode_2(width_, height_, frame_rate_num_,
+                               frame_rate_den_, target_bitrate_, num_frames_,
+                               target_level_, in_file_path_str_.c_str());
   simple_encode_2.ComputeFirstPassStats();
   const int num_coding_frames_2 = simple_encode_2.GetCodingFrameNum();
   simple_encode_2.StartEncode();
@@ -319,21 +408,22 @@ TEST(SimpleEncode, EncodeConsistencyTest3) {
 // Get QPs and arf locations from the first encode.
 // Set external arfs and QPs for the second encode.
 // Expect to get matched results.
-TEST(SimpleEncode, EncodeConsistencySetExternalGroupOfPicturesMap) {
+TEST_F(SimpleEncodeTest, EncodeConsistencySetExternalGroupOfPicturesMap) {
   std::vector<int> quantize_index_list;
   std::vector<uint64_t> ref_sse_list;
   std::vector<double> ref_psnr_list;
   std::vector<size_t> ref_bit_size_list;
-  std::vector<int> gop_map(num_frames, 0);
+  std::vector<int> gop_map(num_frames_, 0);
   {
     // The first encode.
-    SimpleEncode simple_encode(w, h, frame_rate_num, frame_rate_den,
-                               target_bitrate, num_frames, infile_path);
+    SimpleEncode simple_encode(width_, height_, frame_rate_num_,
+                               frame_rate_den_, target_bitrate_, num_frames_,
+                               target_level_, in_file_path_str_.c_str());
     simple_encode.ComputeFirstPassStats();
     simple_encode.StartEncode();
 
     int coded_show_frame_count = 0;
-    while (coded_show_frame_count < num_frames) {
+    while (coded_show_frame_count < num_frames_) {
       const GroupOfPicture group_of_picture =
           simple_encode.ObserveGroupOfPicture();
       gop_map[coded_show_frame_count] |= kGopMapFlagStart;
@@ -358,10 +448,11 @@ TEST(SimpleEncode, EncodeConsistencySetExternalGroupOfPicturesMap) {
   {
     // The second encode with quantize index got from the first encode.
     // The external arfs are the same as the first encode.
-    SimpleEncode simple_encode(w, h, frame_rate_num, frame_rate_den,
-                               target_bitrate, num_frames, infile_path);
+    SimpleEncode simple_encode(width_, height_, frame_rate_num_,
+                               frame_rate_den_, target_bitrate_, num_frames_,
+                               target_level_, in_file_path_str_.c_str());
     simple_encode.ComputeFirstPassStats();
-    simple_encode.SetExternalGroupOfPicturesMap(gop_map);
+    simple_encode.SetExternalGroupOfPicturesMap(gop_map.data(), gop_map.size());
     const int num_coding_frames = simple_encode.GetCodingFrameNum();
     EXPECT_EQ(static_cast<size_t>(num_coding_frames),
               quantize_index_list.size());
@@ -379,12 +470,13 @@ TEST(SimpleEncode, EncodeConsistencySetExternalGroupOfPicturesMap) {
   }
 }
 
-TEST(SimpleEncode, SetExternalGroupOfPicturesMap) {
-  SimpleEncode simple_encode(w, h, frame_rate_num, frame_rate_den,
-                             target_bitrate, num_frames, infile_path);
+TEST_F(SimpleEncodeTest, SetExternalGroupOfPicturesMap) {
+  SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_,
+                             target_bitrate_, num_frames_, target_level_,
+                             in_file_path_str_.c_str());
   simple_encode.ComputeFirstPassStats();
 
-  std::vector<int> gop_map(num_frames, 0);
+  std::vector<int> gop_map(num_frames_, 0);
 
   // Should be the first gop group.
   gop_map[0] = 0;
@@ -398,7 +490,7 @@ TEST(SimpleEncode, SetExternalGroupOfPicturesMap) {
   // Last gop group.
   gop_map[14] |= kGopMapFlagStart | kGopMapFlagUseAltRef;
 
-  simple_encode.SetExternalGroupOfPicturesMap(gop_map);
+  simple_encode.SetExternalGroupOfPicturesMap(gop_map.data(), gop_map.size());
 
   std::vector<int> observed_gop_map =
       simple_encode.ObserveExternalGroupOfPicturesMap();
@@ -425,7 +517,7 @@ TEST(SimpleEncode, SetExternalGroupOfPicturesMap) {
 
   simple_encode.StartEncode();
   int coded_show_frame_count = 0;
-  while (coded_show_frame_count < num_frames) {
+  while (coded_show_frame_count < num_frames_) {
     const GroupOfPicture group_of_picture =
         simple_encode.ObserveGroupOfPicture();
     const std::vector<EncodeFrameInfo> &encode_frame_list =
@@ -446,11 +538,12 @@ TEST(SimpleEncode, SetExternalGroupOfPicturesMap) {
   simple_encode.EndEncode();
 }
 
-TEST(SimpleEncode, GetEncodeFrameInfo) {
+TEST_F(SimpleEncodeTest, GetEncodeFrameInfo) {
   // Makes sure that the encode_frame_info obtained from GetEncodeFrameInfo()
   // matches the counterpart in encode_frame_result obtained from EncodeFrame()
-  SimpleEncode simple_encode(w, h, frame_rate_num, frame_rate_den,
-                             target_bitrate, num_frames, infile_path);
+  SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_,
+                             target_bitrate_, num_frames_, target_level_,
+                             in_file_path_str_.c_str());
   simple_encode.ComputeFirstPassStats();
   const int num_coding_frames = simple_encode.GetCodingFrameNum();
   simple_encode.StartEncode();
@@ -464,11 +557,12 @@ TEST(SimpleEncode, GetEncodeFrameInfo) {
   simple_encode.EndEncode();
 }
 
-TEST(SimpleEncode, GetFramePixelCount) {
-  SimpleEncode simple_encode(w, h, frame_rate_num, frame_rate_den,
-                             target_bitrate, num_frames, infile_path);
+TEST_F(SimpleEncodeTest, GetFramePixelCount) {
+  SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_,
+                             target_bitrate_, num_frames_, target_level_,
+                             in_file_path_str_.c_str());
   EXPECT_EQ(simple_encode.GetFramePixelCount(),
-            static_cast<uint64_t>(w * h * 3 / 2));
+            static_cast<uint64_t>(width_ * height_ * 3 / 2));
 }
 
 }  // namespace
diff --git a/test/stress.sh b/test/stress.sh
index b11c658a96..ba79a52ac3 100755
--- a/test/stress.sh
+++ b/test/stress.sh
@@ -18,7 +18,7 @@
 YUV="${LIBVPX_TEST_DATA_PATH}/niklas_1280_720_30.yuv"
 VP8="${LIBVPX_TEST_DATA_PATH}/tos_vp8.webm"
 VP9="${LIBVPX_TEST_DATA_PATH}/vp90-2-sintel_1920x818_tile_1x4_fpm_2279kbps.webm"
-DATA_URL="http://downloads.webmproject.org/test_data/libvpx/"
+DATA_URL="https://storage.googleapis.com/downloads.webmproject.org/test_data/libvpx/"
 SHA1_FILE="$(dirname $0)/test-data.sha1"
 
 # Set sha1sum to proper sha program (sha1sum, shasum, sha1). This code is
diff --git a/test/sum_squares_test.cc b/test/sum_squares_test.cc
index a47f5b5b1b..d3c76a34d2 100644
--- a/test/sum_squares_test.cc
+++ b/test/sum_squares_test.cc
@@ -9,6 +9,7 @@
  */
 
 #include <cmath>
+#include <cstdint>
 #include <cstdlib>
 #include <string>
 #include <tuple>
@@ -21,9 +22,14 @@
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
+#include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
+#include "vpx_ports/vpx_timer.h"
 
 using libvpx_test::ACMRandom;
+using ::testing::Combine;
+using ::testing::Range;
+using ::testing::ValuesIn;
 
 namespace {
 const int kNumIterations = 10000;
@@ -33,18 +39,19 @@ typedef std::tuple<SSI16Func, SSI16Func> SumSquaresParam;
 
 class SumSquaresTest : public ::testing::TestWithParam<SumSquaresParam> {
  public:
-  virtual ~SumSquaresTest() {}
-  virtual void SetUp() {
+  ~SumSquaresTest() override = default;
+  void SetUp() override {
     ref_func_ = GET_PARAM(0);
     tst_func_ = GET_PARAM(1);
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
   SSI16Func ref_func_;
   SSI16Func tst_func_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SumSquaresTest);
 
 TEST_P(SumSquaresTest, OperationCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -125,4 +132,210 @@ INSTANTIATE_TEST_SUITE_P(
     ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c,
                                  &vpx_sum_squares_2d_i16_msa)));
 #endif  // HAVE_MSA
+
+typedef int64_t (*SSEFunc)(const uint8_t *a, int a_stride, const uint8_t *b,
+                           int b_stride, int width, int height);
+
+struct TestSSEFuncs {
+  TestSSEFuncs(SSEFunc ref = nullptr, SSEFunc tst = nullptr, int depth = 0)
+      : ref_func(ref), tst_func(tst), bit_depth(depth) {}
+  SSEFunc ref_func;  // Pointer to reference function
+  SSEFunc tst_func;  // Pointer to tested function
+  int bit_depth;
+};
+
+typedef std::tuple<TestSSEFuncs, int> SSETestParam;
+
+class SSETest : public ::testing::TestWithParam<SSETestParam> {
+ public:
+  ~SSETest() override = default;
+  void SetUp() override {
+    params_ = GET_PARAM(0);
+    width_ = GET_PARAM(1);
+    is_hbd_ =
+#if CONFIG_VP9_HIGHBITDEPTH
+        params_.ref_func == vpx_highbd_sse_c;
+#else
+        false;
+#endif
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    src_ = reinterpret_cast<uint8_t *>(vpx_memalign(32, 256 * 256 * 2));
+    ref_ = reinterpret_cast<uint8_t *>(vpx_memalign(32, 256 * 256 * 2));
+    ASSERT_NE(src_, nullptr);
+    ASSERT_NE(ref_, nullptr);
+  }
+
+  void TearDown() override {
+    vpx_free(src_);
+    vpx_free(ref_);
+  }
+  void RunTest(bool is_random, int width, int height, int run_times);
+
+  void GenRandomData(int width, int height, int stride) {
+    uint16_t *src16 = reinterpret_cast<uint16_t *>(src_);
+    uint16_t *ref16 = reinterpret_cast<uint16_t *>(ref_);
+    const int msb = 11;  // Up to 12 bit input
+    const int limit = 1 << (msb + 1);
+    for (int ii = 0; ii < height; ii++) {
+      for (int jj = 0; jj < width; jj++) {
+        if (!is_hbd_) {
+          src_[ii * stride + jj] = rnd_.Rand8();
+          ref_[ii * stride + jj] = rnd_.Rand8();
+        } else {
+          src16[ii * stride + jj] = rnd_(limit);
+          ref16[ii * stride + jj] = rnd_(limit);
+        }
+      }
+    }
+  }
+
+  void GenExtremeData(int width, int height, int stride, uint8_t *data,
+                      int16_t val) {
+    uint16_t *data16 = reinterpret_cast<uint16_t *>(data);
+    for (int ii = 0; ii < height; ii++) {
+      for (int jj = 0; jj < width; jj++) {
+        if (!is_hbd_) {
+          data[ii * stride + jj] = static_cast<uint8_t>(val);
+        } else {
+          data16[ii * stride + jj] = val;
+        }
+      }
+    }
+  }
+
+ protected:
+  bool is_hbd_;
+  int width_;
+  TestSSEFuncs params_;
+  uint8_t *src_;
+  uint8_t *ref_;
+  ACMRandom rnd_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SSETest);
+
+void SSETest::RunTest(bool is_random, int width, int height, int run_times) {
+  int failed = 0;
+  vpx_usec_timer ref_timer, test_timer;
+  for (int k = 0; k < 3; k++) {
+    int stride = 4 << rnd_(7);  // Up to 256 stride
+    while (stride < width) {    // Make sure it's valid
+      stride = 4 << rnd_(7);
+    }
+    if (is_random) {
+      GenRandomData(width, height, stride);
+    } else {
+      const int msb = is_hbd_ ? 12 : 8;  // Up to 12 bit input
+      const int limit = (1 << msb) - 1;
+      if (k == 0) {
+        GenExtremeData(width, height, stride, src_, 0);
+        GenExtremeData(width, height, stride, ref_, limit);
+      } else {
+        GenExtremeData(width, height, stride, src_, limit);
+        GenExtremeData(width, height, stride, ref_, 0);
+      }
+    }
+    int64_t res_ref, res_tst;
+    uint8_t *src = src_;
+    uint8_t *ref = ref_;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (is_hbd_) {
+      src = CONVERT_TO_BYTEPTR(src_);
+      ref = CONVERT_TO_BYTEPTR(ref_);
+    }
+#endif
+    res_ref = params_.ref_func(src, stride, ref, stride, width, height);
+    res_tst = params_.tst_func(src, stride, ref, stride, width, height);
+    if (run_times > 1) {
+      vpx_usec_timer_start(&ref_timer);
+      for (int j = 0; j < run_times; j++) {
+        params_.ref_func(src, stride, ref, stride, width, height);
+      }
+      vpx_usec_timer_mark(&ref_timer);
+      const int elapsed_time_c =
+          static_cast<int>(vpx_usec_timer_elapsed(&ref_timer));
+
+      vpx_usec_timer_start(&test_timer);
+      for (int j = 0; j < run_times; j++) {
+        params_.tst_func(src, stride, ref, stride, width, height);
+      }
+      vpx_usec_timer_mark(&test_timer);
+      const int elapsed_time_simd =
+          static_cast<int>(vpx_usec_timer_elapsed(&test_timer));
+
+      printf(
+          "c_time=%d \t simd_time=%d \t "
+          "gain=%d\n",
+          elapsed_time_c, elapsed_time_simd,
+          (elapsed_time_c / elapsed_time_simd));
+    } else {
+      if (!failed) {
+        failed = res_ref != res_tst;
+        EXPECT_EQ(res_ref, res_tst)
+            << "Error:" << (is_hbd_ ? "hbd " : " ") << k << " SSE Test ["
+            << width << "x" << height
+            << "] C output does not match optimized output.";
+      }
+    }
+  }
+}
+
+TEST_P(SSETest, OperationCheck) {
+  for (int height = 4; height <= 128; height += 4) {
+    RunTest(true, width_, height, 1);  // GenRandomData
+  }
+}
+
+TEST_P(SSETest, ExtremeValues) {
+  for (int height = 4; height <= 128; height += 4) {
+    RunTest(false, width_, height, 1);
+  }
+}
+
+TEST_P(SSETest, DISABLED_Speed) {
+  for (int height = 4; height <= 128; height += 4) {
+    RunTest(true, width_, height, 100);
+  }
+}
+
+#if HAVE_NEON
+TestSSEFuncs sse_neon[] = {
+  TestSSEFuncs(&vpx_sse_c, &vpx_sse_neon),
+#if CONFIG_VP9_HIGHBITDEPTH
+  TestSSEFuncs(&vpx_highbd_sse_c, &vpx_highbd_sse_neon)
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(NEON, SSETest,
+                         Combine(ValuesIn(sse_neon), Range(4, 129, 4)));
+#endif  // HAVE_NEON
+
+#if HAVE_NEON_DOTPROD
+TestSSEFuncs sse_neon_dotprod[] = {
+  TestSSEFuncs(&vpx_sse_c, &vpx_sse_neon_dotprod),
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SSETest,
+                         Combine(ValuesIn(sse_neon_dotprod), Range(4, 129, 4)));
+#endif  // HAVE_NEON_DOTPROD
+
+#if HAVE_SSE4_1
+TestSSEFuncs sse_sse4[] = {
+  TestSSEFuncs(&vpx_sse_c, &vpx_sse_sse4_1),
+#if CONFIG_VP9_HIGHBITDEPTH
+  TestSSEFuncs(&vpx_highbd_sse_c, &vpx_highbd_sse_sse4_1)
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(SSE4_1, SSETest,
+                         Combine(ValuesIn(sse_sse4), Range(4, 129, 4)));
+#endif  // HAVE_SSE4_1
+
+#if HAVE_AVX2
+
+TestSSEFuncs sse_avx2[] = {
+  TestSSEFuncs(&vpx_sse_c, &vpx_sse_avx2),
+#if CONFIG_VP9_HIGHBITDEPTH
+  TestSSEFuncs(&vpx_highbd_sse_c, &vpx_highbd_sse_avx2)
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, SSETest,
+                         Combine(ValuesIn(sse_avx2), Range(4, 129, 4)));
+#endif  // HAVE_AVX2
 }  // namespace
diff --git a/test/superframe_test.cc b/test/superframe_test.cc
index 7b2ce29eb8..4c3aa1625a 100644
--- a/test/superframe_test.cc
+++ b/test/superframe_test.cc
@@ -27,10 +27,10 @@ class SuperframeTest
       public ::libvpx_test::CodecTestWithParam<SuperframeTestParam> {
  protected:
   SuperframeTest()
-      : EncoderTest(GET_PARAM(0)), modified_buf_(NULL), last_sf_pts_(0) {}
-  virtual ~SuperframeTest() {}
+      : EncoderTest(GET_PARAM(0)), modified_buf_(nullptr), last_sf_pts_(0) {}
+  ~SuperframeTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     const SuperframeTestParam input = GET_PARAM(1);
     const libvpx_test::TestMode mode = std::get<kTestMode>(input);
@@ -39,17 +39,17 @@ class SuperframeTest
     sf_count_max_ = INT_MAX;
   }
 
-  virtual void TearDown() { delete[] modified_buf_; }
+  void TearDown() override { delete[] modified_buf_; }
 
-  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
-                                  libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                          libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
     }
   }
 
-  virtual const vpx_codec_cx_pkt_t *MutateEncoderOutputHook(
-      const vpx_codec_cx_pkt_t *pkt) {
+  const vpx_codec_cx_pkt_t *MutateEncoderOutputHook(
+      const vpx_codec_cx_pkt_t *pkt) override {
     if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return pkt;
 
     const uint8_t *buffer = reinterpret_cast<uint8_t *>(pkt->data.frame.buf);
diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc
index 0a7d0032c1..aff4ace843 100644
--- a/test/svc_datarate_test.cc
+++ b/test/svc_datarate_test.cc
@@ -43,7 +43,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc {
   }
 
  protected:
-  virtual ~DatarateOnePassCbrSvc() {}
+  ~DatarateOnePassCbrSvc() override = default;
 
   virtual void ResetModel() {
     last_pts_ = 0;
@@ -84,16 +84,18 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc {
       prev_frame_width[i] = 320;
       prev_frame_height[i] = 240;
     }
+    ksvc_flex_noupd_tlenh_ = false;
   }
-  virtual void BeginPassHook(unsigned int /*pass*/) {}
+  void BeginPassHook(unsigned int /*pass*/) override {}
 
   // Example pattern for spatial layers and 2 temporal layers used in the
   // bypass/flexible mode. The pattern corresponds to the pattern
   // VP9E_TEMPORAL_LAYERING_MODE_0101 (temporal_layering_mode == 2) used in
   // non-flexible mode, except that we disable inter-layer prediction.
-  void set_frame_flags_bypass_mode(
-      int tl, int num_spatial_layers, int is_key_frame,
-      vpx_svc_ref_frame_config_t *ref_frame_config) {
+  void set_frame_flags_bypass_mode(int tl, int num_spatial_layers,
+                                   int is_key_frame,
+                                   vpx_svc_ref_frame_config_t *ref_frame_config,
+                                   int noupdate_tlenh) {
     for (int sl = 0; sl < num_spatial_layers; ++sl)
       ref_frame_config->update_buffer_slot[sl] = 0;
 
@@ -151,9 +153,12 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc {
           ref_frame_config->reference_last[sl] = 1;
           ref_frame_config->reference_golden[sl] = 0;
           ref_frame_config->reference_alt_ref[sl] = 0;
-          ref_frame_config->update_buffer_slot[sl] |=
-              1 << ref_frame_config->alt_fb_idx[sl];
+          // Non reference frame on top temporal top spatial.
+          ref_frame_config->update_buffer_slot[sl] = 0;
         }
+        // Force no update on all spatial layers for temporal enhancement layer
+        // frames.
+        if (noupdate_tlenh) ref_frame_config->update_buffer_slot[sl] = 0;
       }
     }
   }
@@ -174,8 +179,8 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc {
       }
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     PreEncodeFrameHookSetup(video, encoder);
 
     if (video->frame() == 0) {
@@ -244,6 +249,22 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc {
       }
     }
 
+    if (ksvc_flex_noupd_tlenh_) {
+      vpx_svc_layer_id_t layer_id;
+      layer_id.spatial_layer_id = 0;
+      layer_id.temporal_layer_id = (video->frame() % 2 != 0);
+      temporal_layer_id_ = layer_id.temporal_layer_id;
+      for (int i = 0; i < number_spatial_layers_; i++) {
+        layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id_;
+        ref_frame_config_.duration[i] = 1;
+      }
+      encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id);
+      set_frame_flags_bypass_mode(layer_id.temporal_layer_id,
+                                  number_spatial_layers_, 0, &ref_frame_config_,
+                                  1);
+      encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_);
+    }
+
     if (update_pattern_ && video->frame() >= 100) {
       vpx_svc_layer_id_t layer_id;
       if (video->frame() == 100) {
@@ -254,12 +275,15 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc {
       layer_id.spatial_layer_id = 0;
       layer_id.temporal_layer_id = (video->frame() % 2 != 0);
       temporal_layer_id_ = layer_id.temporal_layer_id;
-      for (int i = 0; i < number_spatial_layers_; i++)
+      for (int i = 0; i < number_spatial_layers_; i++) {
         layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id_;
+        ref_frame_config_.duration[i] = 1;
+      }
       encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id);
       set_frame_flags_bypass_mode(layer_id.temporal_layer_id,
-                                  number_spatial_layers_, 0, &ref_frame_config);
-      encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config);
+                                  number_spatial_layers_, 0, &ref_frame_config_,
+                                  0);
+      encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_);
     }
 
     if (change_bitrate_ && video->frame() == 200) {
@@ -444,7 +468,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc {
     return VPX_CODEC_OK;
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     uint32_t sizes[8] = { 0 };
     uint32_t sizes_parsed[8] = { 0 };
     int count = 0;
@@ -524,13 +548,16 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc {
       }
 
       if (!single_layer_resize_) {
-        ASSERT_EQ(pkt->data.frame.width[sl],
-                  top_sl_width_ * svc_params_.scaling_factor_num[sl] /
-                      svc_params_.scaling_factor_den[sl]);
-
-        ASSERT_EQ(pkt->data.frame.height[sl],
-                  top_sl_height_ * svc_params_.scaling_factor_num[sl] /
-                      svc_params_.scaling_factor_den[sl]);
+        unsigned int scaled_width = top_sl_width_ *
+                                    svc_params_.scaling_factor_num[sl] /
+                                    svc_params_.scaling_factor_den[sl];
+        if (scaled_width % 2 != 0) scaled_width += 1;
+        ASSERT_EQ(pkt->data.frame.width[sl], scaled_width);
+        unsigned int scaled_height = top_sl_height_ *
+                                     svc_params_.scaling_factor_num[sl] /
+                                     svc_params_.scaling_factor_den[sl];
+        if (scaled_height % 2 != 0) scaled_height += 1;
+        ASSERT_EQ(pkt->data.frame.height[sl], scaled_height);
       } else if (superframe_count_ > 0) {
         if (pkt->data.frame.width[sl] < prev_frame_width[sl] &&
             pkt->data.frame.height[sl] < prev_frame_height[sl])
@@ -544,7 +571,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc {
     }
   }
 
-  virtual void EndPassHook(void) {
+  void EndPassHook() override {
     if (change_bitrate_) last_pts_ = last_pts_ - last_pts_ref_;
     duration_ = (last_pts_ + 1) * timebase_;
     for (int sl = 0; sl < number_spatial_layers_; ++sl) {
@@ -556,10 +583,15 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc {
     }
   }
 
-  virtual void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) {
-    double mismatch_psnr = compute_psnr(img1, img2);
-    mismatch_psnr_ += mismatch_psnr;
-    ++mismatch_nframes_;
+  void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) override {
+    // TODO(marpan): Look into why an assert is triggered in compute_psnr
+    // for mismatch frames for the special test case: ksvc_flex_noupd_tlenh.
+    // Has to do with dropped frames in bypass/flexible svc mode.
+    if (!ksvc_flex_noupd_tlenh_) {
+      double mismatch_psnr = compute_psnr(img1, img2);
+      mismatch_psnr_ += mismatch_psnr;
+      ++mismatch_nframes_;
+    }
   }
 
   unsigned int GetMismatchFrames() { return mismatch_nframes_; }
@@ -579,7 +611,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc {
   bool single_layer_resize_;
   unsigned int top_sl_width_;
   unsigned int top_sl_height_;
-  vpx_svc_ref_frame_config_t ref_frame_config;
+  vpx_svc_ref_frame_config_t ref_frame_config_;
   int update_pattern_;
   bool change_bitrate_;
   vpx_codec_pts_t last_pts_ref_;
@@ -604,9 +636,10 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc {
   int num_resize_down_;
   unsigned int prev_frame_width[VPX_MAX_LAYERS];
   unsigned int prev_frame_height[VPX_MAX_LAYERS];
+  bool ksvc_flex_noupd_tlenh_;
 
  private:
-  virtual void SetConfig(const int num_temporal_layer) {
+  void SetConfig(const int num_temporal_layer) override {
     cfg_.rc_end_usage = VPX_CBR;
     cfg_.g_lag_in_frames = 0;
     cfg_.g_error_resilient = 1;
@@ -637,10 +670,10 @@ class DatarateOnePassCbrSvcSingleBR
   DatarateOnePassCbrSvcSingleBR() : DatarateOnePassCbrSvc(GET_PARAM(0)) {
     memset(&svc_params_, 0, sizeof(svc_params_));
   }
-  virtual ~DatarateOnePassCbrSvcSingleBR() {}
+  ~DatarateOnePassCbrSvcSingleBR() override = default;
 
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(::libvpx_test::kRealTime);
     speed_setting_ = GET_PARAM(1);
@@ -648,6 +681,152 @@ class DatarateOnePassCbrSvcSingleBR
   }
 };
 
+// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3
+// temporal layers, for 4:4:4 Profile 1.
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TL444Profile1) {
+  SetSvcConfig(3, 3);
+  ::libvpx_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140);
+  cfg_.g_profile = 1;
+  cfg_.g_bit_depth = VPX_BITS_8;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.kf_max_dist = 9999;
+
+  top_sl_width_ = 352;
+  top_sl_height_ = 288;
+  cfg_.rc_target_bitrate = 500;
+  ResetModel();
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78,
+                          1.15);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 3
+// temporal layers, for 4:2:2 Profile 1.
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc2SL3TL422Profile1) {
+  SetSvcConfig(2, 3);
+  ::libvpx_test::Y4mVideoSource video("park_joy_90p_8_422.y4m", 0, 20);
+  cfg_.g_profile = 1;
+  cfg_.g_bit_depth = VPX_BITS_8;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.kf_max_dist = 9999;
+
+  top_sl_width_ = 160;
+  top_sl_height_ = 90;
+  cfg_.rc_target_bitrate = 500;
+  ResetModel();
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Use large under/over shoot thresholds as this is a very short clip,
+  // so not good for testing rate-targeting.
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.5,
+                          1.7);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3
+// temporal layers, for Profle 2 10bit.
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TL10bitProfile2) {
+  SetSvcConfig(3, 3);
+  ::libvpx_test::Y4mVideoSource video("park_joy_90p_10_420_20f.y4m", 0, 20);
+  cfg_.g_profile = 2;
+  cfg_.g_bit_depth = VPX_BITS_10;
+  cfg_.g_input_bit_depth = VPX_BITS_10;
+  if (cfg_.g_bit_depth > 8) init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.kf_max_dist = 9999;
+
+  top_sl_width_ = 160;
+  top_sl_height_ = 90;
+  cfg_.rc_target_bitrate = 500;
+  ResetModel();
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // TODO(marpan/jianj): Comment out the rate-target checking for now
+  // as superframe parsing to get frame size needs to be fixed for
+  // high bitdepth.
+  /*
+  // Use large under/over shoot thresholds as this is a very short clip,
+  // so not good for testing rate-targeting.
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.5,
+                          1.7);
+  */
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3
+// temporal layers, for Profle 2 12bit.
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TL12bitProfile2) {
+  SetSvcConfig(3, 3);
+  ::libvpx_test::Y4mVideoSource video("park_joy_90p_12_420_20f.y4m", 0, 20);
+  cfg_.g_profile = 2;
+  cfg_.g_bit_depth = VPX_BITS_12;
+  cfg_.g_input_bit_depth = VPX_BITS_12;
+  if (cfg_.g_bit_depth > 8) init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.kf_max_dist = 9999;
+
+  top_sl_width_ = 160;
+  top_sl_height_ = 90;
+  cfg_.rc_target_bitrate = 500;
+  ResetModel();
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // TODO(marpan/jianj): Comment out the rate-target checking for now
+  // as superframe parsing to get frame size needs to be fixed for
+  // high bitdepth.
+  /*
+  // Use large under/over shoot thresholds as this is a very short clip,
+  // so not good for testing rate-targeting.
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.5,
+                          1.7);
+  */
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+#endif
+
 // Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 1
 // temporal layer, with screen content mode on and same speed setting for all
 // layers.
@@ -722,14 +901,14 @@ TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL2TLDynamicPatternChange) {
   cfg_.g_threads = 1;
   cfg_.rc_dropframe_thresh = 30;
   cfg_.kf_max_dist = 9999;
-  // Change SVC pattern on the fly.
-  update_pattern_ = 1;
   ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
                                        0, 400);
   top_sl_width_ = 640;
   top_sl_height_ = 480;
   cfg_.rc_target_bitrate = 800;
   ResetModel();
+  // Change SVC pattern on the fly.
+  update_pattern_ = 1;
   AssignLayerBitrates();
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
   CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78,
@@ -981,10 +1160,10 @@ class DatarateOnePassCbrSvcMultiBR
   DatarateOnePassCbrSvcMultiBR() : DatarateOnePassCbrSvc(GET_PARAM(0)) {
     memset(&svc_params_, 0, sizeof(svc_params_));
   }
-  virtual ~DatarateOnePassCbrSvcMultiBR() {}
+  ~DatarateOnePassCbrSvcMultiBR() override = default;
 
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(::libvpx_test::kRealTime);
     speed_setting_ = GET_PARAM(1);
@@ -1024,6 +1203,37 @@ TEST_P(DatarateOnePassCbrSvcMultiBR, OnePassCbrSvc2SL3TL) {
 #endif
 }
 
+// Check basic rate targeting for 1 pass VBR SVC: 2 spatial layers and
+// 3 temporal layers. Run VGA clip with 1 thread.
+TEST_P(DatarateOnePassCbrSvcMultiBR, OnePassVbrSvc2SL3TL) {
+  SetSvcConfig(2, 3);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 2;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.g_threads = 1;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.kf_max_dist = 9999;
+  cfg_.rc_end_usage = VPX_VBR;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  top_sl_width_ = 640;
+  top_sl_height_ = 480;
+  const int bitrates[3] = { 200, 400, 600 };
+  cfg_.rc_target_bitrate = bitrates[GET_PARAM(2)];
+  ResetModel();
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.70,
+                          1.3);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
 // Params: speed setting, layer framedrop control and index for bitrate array.
 class DatarateOnePassCbrSvcFrameDropMultiBR
     : public DatarateOnePassCbrSvc,
@@ -1033,10 +1243,10 @@ class DatarateOnePassCbrSvcFrameDropMultiBR
       : DatarateOnePassCbrSvc(GET_PARAM(0)) {
     memset(&svc_params_, 0, sizeof(svc_params_));
   }
-  virtual ~DatarateOnePassCbrSvcFrameDropMultiBR() {}
+  ~DatarateOnePassCbrSvcFrameDropMultiBR() override = default;
 
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(::libvpx_test::kRealTime);
     speed_setting_ = GET_PARAM(1);
@@ -1106,6 +1316,36 @@ TEST_P(DatarateOnePassCbrSvcFrameDropMultiBR, OnePassCbrSvc3SL3TL4Threads) {
 #endif
 }
 
+// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and
+// 2 temporal layers, for KSVC in flexible mode with no update of reference
+// frames for all spatial layers on TL > 0 superframes.
+// Run HD clip with 4 threads.
+TEST_P(DatarateOnePassCbrSvcFrameDropMultiBR, OnePassCbrSvc3SL2TL4ThKSVCFlex) {
+  SetSvcConfig(3, 2);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 4;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.kf_max_dist = 9999;
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  top_sl_width_ = 1280;
+  top_sl_height_ = 720;
+  layer_framedrop_ = 0;
+  const int bitrates[3] = { 200, 400, 600 };
+  cfg_.rc_target_bitrate = bitrates[GET_PARAM(3)];
+  ResetModel();
+  layer_framedrop_ = GET_PARAM(2);
+  AssignLayerBitrates();
+  ksvc_flex_noupd_tlenh_ = true;
+  cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.58,
+                          1.2);
+}
+
 // Params: speed setting, inter-layer prediction mode.
 class DatarateOnePassCbrSvcInterLayerPredSingleBR
     : public DatarateOnePassCbrSvc,
@@ -1115,10 +1355,10 @@ class DatarateOnePassCbrSvcInterLayerPredSingleBR
       : DatarateOnePassCbrSvc(GET_PARAM(0)) {
     memset(&svc_params_, 0, sizeof(svc_params_));
   }
-  virtual ~DatarateOnePassCbrSvcInterLayerPredSingleBR() {}
+  ~DatarateOnePassCbrSvcInterLayerPredSingleBR() override = default;
 
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(::libvpx_test::kRealTime);
     speed_setting_ = GET_PARAM(1);
@@ -1201,10 +1441,10 @@ class DatarateOnePassCbrSvcDenoiser
   DatarateOnePassCbrSvcDenoiser() : DatarateOnePassCbrSvc(GET_PARAM(0)) {
     memset(&svc_params_, 0, sizeof(svc_params_));
   }
-  virtual ~DatarateOnePassCbrSvcDenoiser() {}
+  ~DatarateOnePassCbrSvcDenoiser() override = default;
 
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(::libvpx_test::kRealTime);
     speed_setting_ = GET_PARAM(1);
@@ -1259,10 +1499,10 @@ class DatarateOnePassCbrSvcSmallKF
   DatarateOnePassCbrSvcSmallKF() : DatarateOnePassCbrSvc(GET_PARAM(0)) {
     memset(&svc_params_, 0, sizeof(svc_params_));
   }
-  virtual ~DatarateOnePassCbrSvcSmallKF() {}
+  ~DatarateOnePassCbrSvcSmallKF() override = default;
 
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(::libvpx_test::kRealTime);
     speed_setting_ = GET_PARAM(1);
@@ -1294,7 +1534,6 @@ TEST_P(DatarateOnePassCbrSvcSmallKF, OnePassCbrSvc3SL3TLSmallKf) {
   ResetModel();
   AssignLayerBitrates();
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  // TODO(jianj): webm:1554
   CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.70,
                           1.15);
 #if CONFIG_VP9_DECODER
@@ -1463,10 +1702,10 @@ class DatarateOnePassCbrSvcPostencodeDrop
   DatarateOnePassCbrSvcPostencodeDrop() : DatarateOnePassCbrSvc(GET_PARAM(0)) {
     memset(&svc_params_, 0, sizeof(svc_params_));
   }
-  virtual ~DatarateOnePassCbrSvcPostencodeDrop() {}
+  ~DatarateOnePassCbrSvcPostencodeDrop() override = default;
 
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(::libvpx_test::kRealTime);
     speed_setting_ = GET_PARAM(1);
diff --git a/test/svc_end_to_end_test.cc b/test/svc_end_to_end_test.cc
index edd4c887fc..b4337ae754 100644
--- a/test/svc_end_to_end_test.cc
+++ b/test/svc_end_to_end_test.cc
@@ -15,6 +15,7 @@
 #include "test/svc_test.h"
 #include "test/util.h"
 #include "test/y4m_video_source.h"
+#include "vp9/common/vp9_onyxc_int.h"
 #include "vpx/vpx_codec.h"
 #include "vpx_ports/bitops.h"
 
@@ -44,19 +45,19 @@ class ScalePartitionOnePassCbrSvc
   }
 
  protected:
-  virtual ~ScalePartitionOnePassCbrSvc() {}
+  ~ScalePartitionOnePassCbrSvc() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     speed_setting_ = 7;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     PreEncodeFrameHookSetup(video, encoder);
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     // Keep track of number of non-reference frames, needed for mismatch check.
     // Non-reference frames are top spatial and temporal layer frames,
     // for TL > 0.
@@ -66,12 +67,12 @@ class ScalePartitionOnePassCbrSvc
       num_nonref_frames_++;
   }
 
-  virtual void MismatchHook(const vpx_image_t * /*img1*/,
-                            const vpx_image_t * /*img2*/) {
+  void MismatchHook(const vpx_image_t * /*img1*/,
+                    const vpx_image_t * /*img2*/) override {
     ++mismatch_nframes_;
   }
 
-  virtual void SetConfig(const int /*num_temporal_layer*/) {}
+  void SetConfig(const int /*num_temporal_layer*/) override {}
 
   unsigned int GetMismatchFrames() const { return mismatch_nframes_; }
   unsigned int GetNonRefFrames() const { return num_nonref_frames_; }
@@ -121,25 +122,111 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc,
         frame_to_start_decode_(0), frame_to_sync_(0),
         inter_layer_pred_mode_(GET_PARAM(1)), decode_to_layer_before_sync_(-1),
         decode_to_layer_after_sync_(-1), denoiser_on_(0),
-        intra_only_test_(false), mismatch_nframes_(0), num_nonref_frames_(0) {
+        intra_only_test_(false), loopfilter_off_(0), mismatch_nframes_(0),
+        num_nonref_frames_(0) {
     SetMode(::libvpx_test::kRealTime);
     memset(&svc_layer_sync_, 0, sizeof(svc_layer_sync_));
   }
 
  protected:
-  virtual ~SyncFrameOnePassCbrSvc() {}
+  ~SyncFrameOnePassCbrSvc() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     speed_setting_ = 7;
   }
 
-  virtual bool DoDecode() const {
+  bool DoDecode() const override {
     return current_video_frame_ >= frame_to_start_decode_;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  // Example pattern for spatial layers and 2 temporal layers used in the
+  // bypass/flexible mode. The pattern corresponds to the pattern
+  // VP9E_TEMPORAL_LAYERING_MODE_0101 (temporal_layering_mode == 2) used in
+  // non-flexible mode.
+  void set_frame_flags_bypass_mode(
+      int tl, int num_spatial_layers, int is_key_frame,
+      vpx_svc_ref_frame_config_t *ref_frame_config) {
+    int sl;
+    for (sl = 0; sl < num_spatial_layers; ++sl)
+      ref_frame_config->update_buffer_slot[sl] = 0;
+
+    for (sl = 0; sl < num_spatial_layers; ++sl) {
+      // Set the buffer idx.
+      if (tl == 0) {
+        ref_frame_config->lst_fb_idx[sl] = sl;
+        if (sl) {
+          if (is_key_frame) {
+            ref_frame_config->lst_fb_idx[sl] = sl - 1;
+            ref_frame_config->gld_fb_idx[sl] = sl;
+          } else {
+            ref_frame_config->gld_fb_idx[sl] = sl - 1;
+          }
+        } else {
+          ref_frame_config->gld_fb_idx[sl] = 0;
+        }
+        ref_frame_config->alt_fb_idx[sl] = 0;
+      } else if (tl == 1) {
+        ref_frame_config->lst_fb_idx[sl] = sl;
+        ref_frame_config->gld_fb_idx[sl] =
+            (sl == 0) ? 0 : num_spatial_layers + sl - 1;
+        ref_frame_config->alt_fb_idx[sl] = num_spatial_layers + sl;
+      }
+      // Set the reference and update flags.
+      if (!tl) {
+        if (!sl) {
+          // Base spatial and base temporal (sl = 0, tl = 0)
+          ref_frame_config->reference_last[sl] = 1;
+          ref_frame_config->reference_golden[sl] = 0;
+          ref_frame_config->reference_alt_ref[sl] = 0;
+          ref_frame_config->update_buffer_slot[sl] |=
+              1 << ref_frame_config->lst_fb_idx[sl];
+        } else {
+          if (is_key_frame) {
+            ref_frame_config->reference_last[sl] = 1;
+            ref_frame_config->reference_golden[sl] = 0;
+            ref_frame_config->reference_alt_ref[sl] = 0;
+            ref_frame_config->update_buffer_slot[sl] |=
+                1 << ref_frame_config->gld_fb_idx[sl];
+          } else {
+            // Non-zero spatiall layer.
+            ref_frame_config->reference_last[sl] = 1;
+            ref_frame_config->reference_golden[sl] = 1;
+            ref_frame_config->reference_alt_ref[sl] = 1;
+            ref_frame_config->update_buffer_slot[sl] |=
+                1 << ref_frame_config->lst_fb_idx[sl];
+          }
+        }
+      } else if (tl == 1) {
+        if (!sl) {
+          // Base spatial and top temporal (tl = 1)
+          ref_frame_config->reference_last[sl] = 1;
+          ref_frame_config->reference_golden[sl] = 0;
+          ref_frame_config->reference_alt_ref[sl] = 0;
+          ref_frame_config->update_buffer_slot[sl] |=
+              1 << ref_frame_config->alt_fb_idx[sl];
+        } else {
+          // Non-zero spatial.
+          if (sl < num_spatial_layers - 1) {
+            ref_frame_config->reference_last[sl] = 1;
+            ref_frame_config->reference_golden[sl] = 1;
+            ref_frame_config->reference_alt_ref[sl] = 0;
+            ref_frame_config->update_buffer_slot[sl] |=
+                1 << ref_frame_config->alt_fb_idx[sl];
+          } else if (sl == num_spatial_layers - 1) {
+            // Top spatial and top temporal (non-reference -- doesn't
+            // update any reference buffers).
+            ref_frame_config->reference_last[sl] = 1;
+            ref_frame_config->reference_golden[sl] = 1;
+            ref_frame_config->reference_alt_ref[sl] = 0;
+          }
+        }
+      }
+    }
+  }
+
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     current_video_frame_ = video->frame();
     PreEncodeFrameHookSetup(video, encoder);
     if (video->frame() == 0) {
@@ -154,6 +241,23 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc,
         // So set it here in these tess to avoid encoder-decoder
         // mismatch check on color space setting.
         encoder->Control(VP9E_SET_COLOR_SPACE, VPX_CS_BT_601);
+
+      encoder->Control(VP9E_SET_DISABLE_LOOPFILTER, loopfilter_off_);
+    }
+    if (flexible_mode_) {
+      vpx_svc_layer_id_t layer_id;
+      layer_id.spatial_layer_id = 0;
+      layer_id.temporal_layer_id = (video->frame() % 2 != 0);
+      temporal_layer_id_ = layer_id.temporal_layer_id;
+      for (int i = 0; i < number_spatial_layers_; i++) {
+        layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id_;
+        ref_frame_config_.duration[i] = 1;
+      }
+      encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id);
+      set_frame_flags_bypass_mode(layer_id.temporal_layer_id,
+                                  number_spatial_layers_, 0,
+                                  &ref_frame_config_);
+      encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_);
     }
     if (video->frame() == frame_to_sync_) {
       encoder->Control(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &svc_layer_sync_);
@@ -161,21 +265,26 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc,
   }
 
 #if CONFIG_VP9_DECODER
-  virtual void PreDecodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Decoder *decoder) {
+  void PreDecodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Decoder *decoder) override {
     if (video->frame() < frame_to_sync_) {
       if (decode_to_layer_before_sync_ >= 0)
         decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER,
                          decode_to_layer_before_sync_);
     } else {
-      if (decode_to_layer_after_sync_ >= 0)
-        decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER,
-                         decode_to_layer_after_sync_);
+      if (decode_to_layer_after_sync_ >= 0) {
+        int decode_to_layer = decode_to_layer_after_sync_;
+        // Overlay frame is additional layer for intra-only.
+        if (video->frame() == frame_to_sync_ && intra_only_test_ &&
+            decode_to_layer_after_sync_ == 0 && number_spatial_layers_ > 1)
+          decode_to_layer += 1;
+        decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER, decode_to_layer);
+      }
     }
   }
 #endif
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     // Keep track of number of non-reference frames, needed for mismatch check.
     // Non-reference frames are top spatial and temporal layer frames,
     // for TL > 0.
@@ -198,8 +307,8 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc,
     }
   }
 
-  virtual void MismatchHook(const vpx_image_t * /*img1*/,
-                            const vpx_image_t * /*img2*/) {
+  void MismatchHook(const vpx_image_t * /*img1*/,
+                    const vpx_image_t * /*img2*/) override {
     if (current_video_frame_ >= frame_to_sync_) ++mismatch_nframes_;
   }
 
@@ -214,10 +323,15 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc,
   int decode_to_layer_after_sync_;
   int denoiser_on_;
   bool intra_only_test_;
+  int loopfilter_off_;
   vpx_svc_spatial_layer_sync_t svc_layer_sync_;
+  unsigned int mismatch_nframes_;
+  unsigned int num_nonref_frames_;
+  bool flexible_mode_;
+  vpx_svc_ref_frame_config_t ref_frame_config_;
 
  private:
-  virtual void SetConfig(const int num_temporal_layer) {
+  void SetConfig(const int num_temporal_layer) override {
     cfg_.rc_buf_initial_sz = 500;
     cfg_.rc_buf_optimal_sz = 500;
     cfg_.rc_buf_sz = 1000;
@@ -240,12 +354,9 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc,
       cfg_.temporal_layering_mode = 2;
     } else if (num_temporal_layer == 1) {
       cfg_.ts_rate_decimator[0] = 1;
-      cfg_.temporal_layering_mode = 1;
+      cfg_.temporal_layering_mode = 0;
     }
   }
-
-  unsigned int mismatch_nframes_;
-  unsigned int num_nonref_frames_;
 };
 
 // Test for sync layer for 1 pass CBR SVC: 3 spatial layers and
@@ -267,6 +378,7 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLFullSync) {
   ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
 
   cfg_.rc_target_bitrate = 600;
+  flexible_mode_ = false;
   AssignLayerBitrates();
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 #if CONFIG_VP9_DECODER
@@ -294,6 +406,7 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc2SL3TLSyncToVGA) {
   ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
                                        0, 400);
   cfg_.rc_target_bitrate = 400;
+  flexible_mode_ = false;
   AssignLayerBitrates();
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 #if CONFIG_VP9_DECODER
@@ -321,6 +434,7 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncToHD) {
 
   ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
   cfg_.rc_target_bitrate = 600;
+  flexible_mode_ = false;
   AssignLayerBitrates();
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 #if CONFIG_VP9_DECODER
@@ -348,6 +462,7 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncToVGAHD) {
 
   ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
   cfg_.rc_target_bitrate = 600;
+  flexible_mode_ = false;
   AssignLayerBitrates();
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 #if CONFIG_VP9_DECODER
@@ -377,6 +492,7 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc2SL3TLSyncFrameVGADenoise) {
   ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
                                        0, 400);
   cfg_.rc_target_bitrate = 400;
+  flexible_mode_ = false;
   AssignLayerBitrates();
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 #if CONFIG_VP9_DECODER
@@ -387,6 +503,61 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc2SL3TLSyncFrameVGADenoise) {
 }
 #endif
 
+// Encode 3 spatial, 2 temporal layer in flexible mode but don't
+// start decoding. During the sequence insert intra-only on base/qvga
+// layer at frame 20 and start decoding only QVGA layer from there.
+TEST_P(SyncFrameOnePassCbrSvc,
+       OnePassCbrSvc3SL3TLSyncFrameStartDecodeOnIntraOnlyQVGAFlex) {
+  SetSvcConfig(3, 2);
+  frame_to_start_decode_ = 20;
+  frame_to_sync_ = 20;
+  decode_to_layer_before_sync_ = 2;
+  decode_to_layer_after_sync_ = 0;
+  intra_only_test_ = true;
+
+  // Set up svc layer sync structure.
+  svc_layer_sync_.base_layer_intra_only = 1;
+  svc_layer_sync_.spatial_layer_sync[0] = 1;
+  svc_layer_sync_.spatial_layer_sync[1] = 0;
+  svc_layer_sync_.spatial_layer_sync[2] = 0;
+
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  cfg_.rc_target_bitrate = 600;
+  flexible_mode_ = true;
+  AssignLayerBitrates();
+  cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Can't check mismatch here because only base is decoded at
+  // frame sync, whereas encoder continues encoding all layers.
+}
+
+// Encode 3 spatial, 3 temporal layer but don't start decoding.
+// During the sequence insert intra-only on base/qvga layer at frame 20
+// and start decoding only QVGA layer from there.
+TEST_P(SyncFrameOnePassCbrSvc,
+       OnePassCbrSvc3SL3TLSyncFrameStartDecodeOnIntraOnlyQVGA) {
+  SetSvcConfig(3, 3);
+  frame_to_start_decode_ = 20;
+  frame_to_sync_ = 20;
+  decode_to_layer_before_sync_ = 2;
+  decode_to_layer_after_sync_ = 0;
+  intra_only_test_ = true;
+
+  // Set up svc layer sync structure.
+  svc_layer_sync_.base_layer_intra_only = 1;
+  svc_layer_sync_.spatial_layer_sync[0] = 1;
+  svc_layer_sync_.spatial_layer_sync[1] = 0;
+  svc_layer_sync_.spatial_layer_sync[2] = 0;
+
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  cfg_.rc_target_bitrate = 600;
+  flexible_mode_ = false;
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Can't check mismatch here because only base is decoded at
+  // frame sync, whereas encoder continues encoding all layers.
+}
+
 // Start decoding from beginning of sequence, during sequence insert intra-only
 // on base/qvga layer. Decode all layers.
 TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncFrameIntraOnlyQVGA) {
@@ -394,8 +565,9 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncFrameIntraOnlyQVGA) {
   frame_to_start_decode_ = 0;
   frame_to_sync_ = 20;
   decode_to_layer_before_sync_ = 2;
-  // The superframe containing intra-only layer will have 4 frames. Thus set the
-  // layer to decode after sync frame to 3.
+  // The superframe containing intra-only layer will have +1 frames. Thus set
+  // the layer to decode after sync frame to +1 from
+  // decode_to_layer_before_sync.
   decode_to_layer_after_sync_ = 3;
   intra_only_test_ = true;
 
@@ -407,6 +579,7 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncFrameIntraOnlyQVGA) {
 
   ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
   cfg_.rc_target_bitrate = 600;
+  flexible_mode_ = false;
   AssignLayerBitrates();
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 #if CONFIG_VP9_DECODER
@@ -423,8 +596,9 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncFrameIntraOnlyVGA) {
   frame_to_start_decode_ = 0;
   frame_to_sync_ = 20;
   decode_to_layer_before_sync_ = 2;
-  // The superframe containing intra-only layer will have 4 frames. Thus set the
-  // layer to decode after sync frame to 3.
+  // The superframe containing intra-only layer will have +1 frames. Thus set
+  // the layer to decode after sync frame to +1 from
+  // decode_to_layer_before_sync.
   decode_to_layer_after_sync_ = 3;
   intra_only_test_ = true;
 
@@ -436,6 +610,7 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncFrameIntraOnlyVGA) {
 
   ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
   cfg_.rc_target_bitrate = 600;
+  flexible_mode_ = false;
   AssignLayerBitrates();
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 #if CONFIG_VP9_DECODER
@@ -461,6 +636,7 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc1SL3TLSyncFrameIntraOnlyQVGA) {
 
   ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
   cfg_.rc_target_bitrate = 600;
+  flexible_mode_ = false;
   AssignLayerBitrates();
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 #if CONFIG_VP9_DECODER
@@ -470,8 +646,176 @@ TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc1SL3TLSyncFrameIntraOnlyQVGA) {
 #endif
 }
 
+// Params: Loopfilter modes.
+class LoopfilterOnePassCbrSvc : public OnePassCbrSvc,
+                                public ::libvpx_test::CodecTestWithParam<int> {
+ public:
+  LoopfilterOnePassCbrSvc()
+      : OnePassCbrSvc(GET_PARAM(0)), loopfilter_off_(GET_PARAM(1)),
+        mismatch_nframes_(0), num_nonref_frames_(0) {
+    SetMode(::libvpx_test::kRealTime);
+  }
+
+ protected:
+  ~LoopfilterOnePassCbrSvc() override = default;
+
+  void SetUp() override {
+    InitializeConfig();
+    speed_setting_ = 7;
+  }
+
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    PreEncodeFrameHookSetup(video, encoder);
+    if (number_temporal_layers_ > 1 || number_spatial_layers_ > 1) {
+      // Consider 3 cases:
+      if (loopfilter_off_ == 0) {
+        // loopfilter is on for all spatial layers on every superrframe.
+        for (int i = 0; i < VPX_SS_MAX_LAYERS; ++i) {
+          svc_params_.loopfilter_ctrl[i] = 0;
+        }
+      } else if (loopfilter_off_ == 1) {
+        // loopfilter is off for non-reference frames for all spatial layers.
+        for (int i = 0; i < VPX_SS_MAX_LAYERS; ++i) {
+          svc_params_.loopfilter_ctrl[i] = 1;
+        }
+      } else {
+        // loopfilter is off for all SL0 frames, and off only for non-reference
+        // frames for SL > 0.
+        svc_params_.loopfilter_ctrl[0] = 2;
+        for (int i = 1; i < VPX_SS_MAX_LAYERS; ++i) {
+          svc_params_.loopfilter_ctrl[i] = 1;
+        }
+      }
+      encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_);
+    } else if (number_temporal_layers_ == 1 && number_spatial_layers_ == 1) {
+      // For non-SVC mode use the single layer control.
+      encoder->Control(VP9E_SET_DISABLE_LOOPFILTER, loopfilter_off_);
+    }
+  }
+
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
+    // Keep track of number of non-reference frames, needed for mismatch check.
+    // Non-reference frames are top spatial and temporal layer frames,
+    // for TL > 0.
+    if (temporal_layer_id_ == number_temporal_layers_ - 1 &&
+        temporal_layer_id_ > 0 &&
+        pkt->data.frame.spatial_layer_encoded[number_spatial_layers_ - 1])
+      num_nonref_frames_++;
+  }
+
+  void MismatchHook(const vpx_image_t * /*img1*/,
+                    const vpx_image_t * /*img2*/) override {
+    ++mismatch_nframes_;
+  }
+
+  void SetConfig(const int /*num_temporal_layer*/) override {}
+
+  int GetMismatchFrames() const { return mismatch_nframes_; }
+  int GetNonRefFrames() const { return num_nonref_frames_; }
+
+  int loopfilter_off_;
+
+ private:
+  int mismatch_nframes_;
+  int num_nonref_frames_;
+};
+
+TEST_P(LoopfilterOnePassCbrSvc, OnePassCbrSvc1SL1TLLoopfilterOff) {
+  SetSvcConfig(1, 1);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.rc_target_bitrate = 800;
+  cfg_.kf_max_dist = 9999;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.g_error_resilient = 1;
+  cfg_.ts_rate_decimator[0] = 1;
+  cfg_.temporal_layering_mode = 0;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  cfg_.rc_target_bitrate = 600;
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+#if CONFIG_VP9_DECODER
+  if (loopfilter_off_ == 0)
+    EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+  else
+    EXPECT_EQ(GetMismatchFrames(), 0);
+#endif
+}
+
+TEST_P(LoopfilterOnePassCbrSvc, OnePassCbrSvc1SL3TLLoopfilterOff) {
+  SetSvcConfig(1, 3);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.rc_target_bitrate = 800;
+  cfg_.kf_max_dist = 9999;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.g_error_resilient = 1;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+  cfg_.temporal_layering_mode = 3;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  cfg_.rc_target_bitrate = 600;
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+#if CONFIG_VP9_DECODER
+  if (loopfilter_off_ == 0)
+    EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+  else
+    EXPECT_EQ(GetMismatchFrames(), 0);
+#endif
+}
+
+TEST_P(LoopfilterOnePassCbrSvc, OnePassCbrSvc3SL3TLLoopfilterOff) {
+  SetSvcConfig(3, 3);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.rc_target_bitrate = 800;
+  cfg_.kf_max_dist = 9999;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.g_error_resilient = 1;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+  cfg_.temporal_layering_mode = 3;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  cfg_.rc_target_bitrate = 600;
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+#if CONFIG_VP9_DECODER
+  if (loopfilter_off_ == 0)
+    EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+  else
+    EXPECT_EQ(GetMismatchFrames(), 0);
+#endif
+}
+
 VP9_INSTANTIATE_TEST_SUITE(SyncFrameOnePassCbrSvc, ::testing::Range(0, 3));
 
+VP9_INSTANTIATE_TEST_SUITE(LoopfilterOnePassCbrSvc, ::testing::Range(0, 3));
+
 INSTANTIATE_TEST_SUITE_P(
     VP9, ScalePartitionOnePassCbrSvc,
     ::testing::Values(
diff --git a/test/svc_test.cc b/test/svc_test.cc
index 4798c77183..cbc0abe032 100644
--- a/test/svc_test.cc
+++ b/test/svc_test.cc
@@ -43,13 +43,14 @@ void OnePassCbrSvc::PreEncodeFrameHookSetup(::libvpx_test::VideoSource *video,
       svc_params_.max_quantizers[i] = 63;
       svc_params_.min_quantizers[i] = 0;
     }
-    svc_params_.speed_per_layer[0] = base_speed_setting_;
-    for (int i = 1; i < VPX_SS_MAX_LAYERS; ++i) {
-      svc_params_.speed_per_layer[i] = speed_setting_;
+    if (number_temporal_layers_ > 1 || number_spatial_layers_ > 1) {
+      svc_params_.speed_per_layer[0] = base_speed_setting_;
+      for (int i = 1; i < VPX_SS_MAX_LAYERS; ++i) {
+        svc_params_.speed_per_layer[i] = speed_setting_;
+      }
+      encoder->Control(VP9E_SET_SVC, 1);
+      encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_);
     }
-
-    encoder->Control(VP9E_SET_SVC, 1);
-    encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_);
     encoder->Control(VP8E_SET_CPUUSED, speed_setting_);
     encoder->Control(VP9E_SET_AQ_MODE, 3);
     encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 300);
diff --git a/test/svc_test.h b/test/svc_test.h
index f1d727fd9d..0026372de5 100644
--- a/test/svc_test.h
+++ b/test/svc_test.h
@@ -36,7 +36,7 @@ class OnePassCbrSvc : public ::libvpx_test::EncoderTest {
   }
 
  protected:
-  virtual ~OnePassCbrSvc() {}
+  ~OnePassCbrSvc() override {}
 
   virtual void SetConfig(const int num_temporal_layer) = 0;
 
@@ -46,11 +46,11 @@ class OnePassCbrSvc : public ::libvpx_test::EncoderTest {
   virtual void PreEncodeFrameHookSetup(::libvpx_test::VideoSource *video,
                                        ::libvpx_test::Encoder *encoder);
 
-  virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder);
+  void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override;
 
   virtual void AssignLayerBitrates();
 
-  virtual void MismatchHook(const vpx_image_t *, const vpx_image_t *) {}
+  void MismatchHook(const vpx_image_t *, const vpx_image_t *) override {}
 
   vpx_svc_extra_cfg_t svc_params_;
   int64_t bits_in_buffer_model_[VPX_MAX_LAYERS];
diff --git a/test/test-data.mk b/test/test-data.mk
index ca2e11442e..9eabffae3e 100644
--- a/test/test-data.mk
+++ b/test/test-data.mk
@@ -6,6 +6,7 @@ LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_collage_w352h288_nv12.yuv
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_odd.yuv
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += desktop_office1.1280_720-020.yuv
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += slides_code_term_web_plot.1920_1080.yuv
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += desktopqvga.320_240.yuv
 
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_420_20f.y4m
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_422_20f.y4m
@@ -27,7 +28,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += noisy_clip_640_360.y4m
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rush_hour_444.y4m
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += screendata.y4m
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_640_480_30.yuv
-LIBVPX_TEST_DATA-$(CONFIG_RATE_CTRL) += bus_352x288_420_f20_b8.yuv
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += bus_352x288_420_f20_b8.yuv
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += crowd_run_360p_10_150f.y4m
 
 # Test vectors
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf
diff --git a/test/test-data.sha1 b/test/test-data.sha1
index 668992fba2..a9decc6b6b 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -869,3 +869,5 @@ bac455906360b45338a16dd626ac5f19bc36a307 *desktop_office1.1280_720-020.yuv
 518a0be998afece76d3df76047d51e256c591ff2 *invalid-bug-148271109.ivf
 d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-bug-148271109.ivf.res
 ad18ca16f0a249fb3b7c38de0d9b327fed273f96 *hantro_collage_w352h288_nv12.yuv
+8a0b2c350539859463d3546a67876c83ff6ff0ac *desktopqvga.320_240.yuv
+ad9942a073e245585c93f764ea299382a65939a7 *crowd_run_360p_10_150f.y4m
diff --git a/test/test.mk b/test/test.mk
index 5aecb013f3..d4521f08bf 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -7,6 +7,8 @@ LIBVPX_TEST_SRCS-yes += codec_factory.h
 LIBVPX_TEST_SRCS-yes += md5_helper.h
 LIBVPX_TEST_SRCS-yes += register_state_check.h
 LIBVPX_TEST_SRCS-yes += test.mk
+LIBVPX_TEST_SRCS-yes += init_vpx_test.cc
+LIBVPX_TEST_SRCS-yes += init_vpx_test.h
 LIBVPX_TEST_SRCS-yes += test_libvpx.cc
 LIBVPX_TEST_SRCS-yes += test_vectors.cc
 LIBVPX_TEST_SRCS-yes += test_vectors.h
@@ -22,10 +24,6 @@ LIBVPX_TEST_SRCS-yes                   += ../md5_utils.h ../md5_utils.c
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ivf_video_source.h
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += ../y4minput.h ../y4minput.c
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += altref_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += aq_segment_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += alt_ref_aq_segment_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += vp8_datarate_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += vp9_datarate_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += encode_api_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += error_resilience_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += i420_video_source.h
@@ -37,6 +35,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += yuv_video_source.h
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += config_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += cq_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_datarate_test.cc
 
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += byte_alignment_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += decode_svc_test.cc
@@ -44,6 +43,8 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += external_frame_buffer_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += user_priv_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_refresh_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += alt_ref_aq_segment_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += aq_segment_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += cpu_speed_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += frame_size_tests.cc
@@ -58,6 +59,9 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_test.h
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_end_to_end_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += timestamp_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_datarate_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ext_ratectrl_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += ../vp9/simple_encode.h
 
 LIBVPX_TEST_SRCS-yes                   += decode_test_driver.cc
 LIBVPX_TEST_SRCS-yes                   += decode_test_driver.h
@@ -83,6 +87,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ../webmdec.cc
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ../webmdec.h
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += webm_video_source.h
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_skip_loopfilter_test.cc
+$(BUILD_PFX)third_party/libwebm/%.cc.o: CXXFLAGS += $(LIBWEBM_CXXFLAGS)
 endif
 
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += decode_api_test.cc
@@ -121,11 +126,13 @@ ifeq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),yesyes)
 LIBVPX_TEST_SRCS-yes                   += vp8_boolcoder_test.cc
 LIBVPX_TEST_SRCS-yes                   += vp8_fragments_test.cc
 endif
-
 LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC)    += add_noise_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC)    += pp_filter_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += vp8_decrypt_test.cc
+ifneq (, $(filter yes, $(HAVE_SSE2) $(HAVE_SSSE3) $(HAVE_SSE4_1) $(HAVE_NEON) \
+                       $(HAVE_MSA) $(HAVE_MMI)))
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += quantize_test.cc
+endif
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += set_roi.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += variance_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc
@@ -153,6 +160,7 @@ LIBVPX_TEST_SRCS-yes                   += superframe_test.cc
 LIBVPX_TEST_SRCS-yes                   += tile_independence_test.cc
 LIBVPX_TEST_SRCS-yes                   += vp9_boolcoder_test.cc
 LIBVPX_TEST_SRCS-yes                   += vp9_encoder_parms_get_to_decoder.cc
+LIBVPX_TEST_SRCS-yes                   += vp9_roi_test.cc
 endif
 
 LIBVPX_TEST_SRCS-yes                   += convolve_test.cc
@@ -174,7 +182,9 @@ ifneq ($(CONFIG_REALTIME_ONLY),yes)
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += yuv_temporal_filter_test.cc
 endif
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
+ifneq (, $(filter yes, $(HAVE_SSE2) $(HAVE_AVX2) $(HAVE_NEON)))
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_block_error_test.cc
+endif
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
 
@@ -201,12 +211,23 @@ endif # VP9
 ## Multi-codec / unconditional whitebox tests.
 
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc
+ifneq (, $(filter yes, $(HAVE_NEON) $(HAVE_SSE2) $(HAVE_MSA)))
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sum_squares_test.cc
+endif
 
 TEST_INTRA_PRED_SPEED_SRCS-yes := test_intra_pred_speed.cc
 TEST_INTRA_PRED_SPEED_SRCS-yes += ../md5_utils.h ../md5_utils.c
-
-RC_INTERFACE_TEST_SRCS-$(CONFIG_VP9_ENCODER) := ratectrl_rtc_test.cc
+TEST_INTRA_PRED_SPEED_SRCS-yes += init_vpx_test.cc
+TEST_INTRA_PRED_SPEED_SRCS-yes += init_vpx_test.h
+
+RC_INTERFACE_TEST_SRCS-yes := test_rc_interface.cc
+RC_INTERFACE_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ratectrl_rtc_test.cc
+RC_INTERFACE_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_ratectrl_rtc_test.cc
+RC_INTERFACE_TEST_SRCS-$(CONFIG_ENCODERS) += encode_test_driver.cc
+RC_INTERFACE_TEST_SRCS-$(CONFIG_ENCODERS) += encode_test_driver.h
+RC_INTERFACE_TEST_SRCS-yes += decode_test_driver.cc
+RC_INTERFACE_TEST_SRCS-yes += decode_test_driver.h
+RC_INTERFACE_TEST_SRCS-yes += codec_factory.h
 
 endif # CONFIG_SHARED
 
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 0be9feefd9..4c464a262f 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -14,9 +14,11 @@
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
+#include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
+#include "test/init_vpx_test.h"
 #include "test/md5_helper.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
@@ -48,11 +50,9 @@ struct IntraPredTestMem {
     for (int i = 0; i < kBPS; ++i) left[i] = rnd.Rand16() & mask;
     for (int i = -1; i < kBPS; ++i) above[i] = rnd.Rand16() & mask;
 
-    // some code assumes the top row has been extended:
-    // d45/d63 C-code, for instance, but not the assembly.
-    // TODO(jzern): this style of extension isn't strictly necessary.
+    // d45/d63 require the top row to be extended.
     ASSERT_LE(block_size, kBPS);
-    for (int i = block_size; i < 2 * kBPS; ++i) {
+    for (int i = block_size; i < 2 * block_size; ++i) {
       above[i] = above[block_size - 1];
     }
   }
@@ -85,7 +85,7 @@ void TestIntraPred(const char name[], VpxPredFunc const *pred_funcs,
   intra_pred_test_mem.Init(block_size, 8);
 
   for (int k = 0; k < kNumVp9IntraPredFuncs; ++k) {
-    if (pred_funcs[k] == NULL) continue;
+    if (pred_funcs[k] == nullptr) continue;
     memcpy(intra_pred_test_mem.src, intra_pred_test_mem.ref_src,
            sizeof(intra_pred_test_mem.src));
     vpx_usec_timer timer;
@@ -206,58 +206,64 @@ INTRA_PRED_TEST(C, TestIntraPred32, vpx_dc_predictor_32x32_c,
 INTRA_PRED_TEST(SSE2, TestIntraPred4, vpx_dc_predictor_4x4_sse2,
                 vpx_dc_left_predictor_4x4_sse2, vpx_dc_top_predictor_4x4_sse2,
                 vpx_dc_128_predictor_4x4_sse2, vpx_v_predictor_4x4_sse2,
-                vpx_h_predictor_4x4_sse2, vpx_d45_predictor_4x4_sse2, NULL,
-                NULL, NULL, vpx_d207_predictor_4x4_sse2, NULL,
+                vpx_h_predictor_4x4_sse2, vpx_d45_predictor_4x4_sse2, nullptr,
+                nullptr, nullptr, vpx_d207_predictor_4x4_sse2, nullptr,
                 vpx_tm_predictor_4x4_sse2)
 
 INTRA_PRED_TEST(SSE2, TestIntraPred8, vpx_dc_predictor_8x8_sse2,
                 vpx_dc_left_predictor_8x8_sse2, vpx_dc_top_predictor_8x8_sse2,
                 vpx_dc_128_predictor_8x8_sse2, vpx_v_predictor_8x8_sse2,
-                vpx_h_predictor_8x8_sse2, vpx_d45_predictor_8x8_sse2, NULL,
-                NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_sse2)
+                vpx_h_predictor_8x8_sse2, vpx_d45_predictor_8x8_sse2, nullptr,
+                nullptr, nullptr, nullptr, nullptr, vpx_tm_predictor_8x8_sse2)
 
 INTRA_PRED_TEST(SSE2, TestIntraPred16, vpx_dc_predictor_16x16_sse2,
                 vpx_dc_left_predictor_16x16_sse2,
                 vpx_dc_top_predictor_16x16_sse2,
                 vpx_dc_128_predictor_16x16_sse2, vpx_v_predictor_16x16_sse2,
-                vpx_h_predictor_16x16_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
-                vpx_tm_predictor_16x16_sse2)
+                vpx_h_predictor_16x16_sse2, nullptr, nullptr, nullptr, nullptr,
+                nullptr, nullptr, vpx_tm_predictor_16x16_sse2)
 
 INTRA_PRED_TEST(SSE2, TestIntraPred32, vpx_dc_predictor_32x32_sse2,
                 vpx_dc_left_predictor_32x32_sse2,
                 vpx_dc_top_predictor_32x32_sse2,
                 vpx_dc_128_predictor_32x32_sse2, vpx_v_predictor_32x32_sse2,
-                vpx_h_predictor_32x32_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
-                vpx_tm_predictor_32x32_sse2)
+                vpx_h_predictor_32x32_sse2, nullptr, nullptr, nullptr, nullptr,
+                nullptr, nullptr, vpx_tm_predictor_32x32_sse2)
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
-INTRA_PRED_TEST(SSSE3, TestIntraPred4, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-                NULL, NULL, vpx_d153_predictor_4x4_ssse3, NULL,
-                vpx_d63_predictor_4x4_ssse3, NULL)
-INTRA_PRED_TEST(SSSE3, TestIntraPred8, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-                NULL, NULL, vpx_d153_predictor_8x8_ssse3,
-                vpx_d207_predictor_8x8_ssse3, vpx_d63_predictor_8x8_ssse3, NULL)
-INTRA_PRED_TEST(SSSE3, TestIntraPred16, NULL, NULL, NULL, NULL, NULL, NULL,
-                vpx_d45_predictor_16x16_ssse3, NULL, NULL,
-                vpx_d153_predictor_16x16_ssse3, vpx_d207_predictor_16x16_ssse3,
-                vpx_d63_predictor_16x16_ssse3, NULL)
-INTRA_PRED_TEST(SSSE3, TestIntraPred32, NULL, NULL, NULL, NULL, NULL, NULL,
-                vpx_d45_predictor_32x32_ssse3, NULL, NULL,
-                vpx_d153_predictor_32x32_ssse3, vpx_d207_predictor_32x32_ssse3,
-                vpx_d63_predictor_32x32_ssse3, NULL)
+INTRA_PRED_TEST(SSSE3, TestIntraPred4, nullptr, nullptr, nullptr, nullptr,
+                nullptr, nullptr, nullptr, nullptr, nullptr,
+                vpx_d153_predictor_4x4_ssse3, nullptr,
+                vpx_d63_predictor_4x4_ssse3, nullptr)
+INTRA_PRED_TEST(SSSE3, TestIntraPred8, nullptr, nullptr, nullptr, nullptr,
+                nullptr, nullptr, nullptr, nullptr, nullptr,
+                vpx_d153_predictor_8x8_ssse3, vpx_d207_predictor_8x8_ssse3,
+                vpx_d63_predictor_8x8_ssse3, nullptr)
+INTRA_PRED_TEST(SSSE3, TestIntraPred16, nullptr, nullptr, nullptr, nullptr,
+                nullptr, nullptr, vpx_d45_predictor_16x16_ssse3, nullptr,
+                nullptr, vpx_d153_predictor_16x16_ssse3,
+                vpx_d207_predictor_16x16_ssse3, vpx_d63_predictor_16x16_ssse3,
+                nullptr)
+INTRA_PRED_TEST(SSSE3, TestIntraPred32, nullptr, nullptr, nullptr, nullptr,
+                nullptr, nullptr, vpx_d45_predictor_32x32_ssse3, nullptr,
+                nullptr, vpx_d153_predictor_32x32_ssse3,
+                vpx_d207_predictor_32x32_ssse3, vpx_d63_predictor_32x32_ssse3,
+                nullptr)
 #endif  // HAVE_SSSE3
 
 #if HAVE_DSPR2
-INTRA_PRED_TEST(DSPR2, TestIntraPred4, vpx_dc_predictor_4x4_dspr2, NULL, NULL,
-                NULL, NULL, vpx_h_predictor_4x4_dspr2, NULL, NULL, NULL, NULL,
-                NULL, NULL, vpx_tm_predictor_4x4_dspr2)
-INTRA_PRED_TEST(DSPR2, TestIntraPred8, vpx_dc_predictor_8x8_dspr2, NULL, NULL,
-                NULL, NULL, vpx_h_predictor_8x8_dspr2, NULL, NULL, NULL, NULL,
-                NULL, NULL, vpx_tm_predictor_8x8_c)
-INTRA_PRED_TEST(DSPR2, TestIntraPred16, vpx_dc_predictor_16x16_dspr2, NULL,
-                NULL, NULL, NULL, vpx_h_predictor_16x16_dspr2, NULL, NULL, NULL,
-                NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(DSPR2, TestIntraPred4, vpx_dc_predictor_4x4_dspr2, nullptr,
+                nullptr, nullptr, nullptr, vpx_h_predictor_4x4_dspr2, nullptr,
+                nullptr, nullptr, nullptr, nullptr, nullptr,
+                vpx_tm_predictor_4x4_dspr2)
+INTRA_PRED_TEST(DSPR2, TestIntraPred8, vpx_dc_predictor_8x8_dspr2, nullptr,
+                nullptr, nullptr, nullptr, vpx_h_predictor_8x8_dspr2, nullptr,
+                nullptr, nullptr, nullptr, nullptr, nullptr,
+                vpx_tm_predictor_8x8_c)
+INTRA_PRED_TEST(DSPR2, TestIntraPred16, vpx_dc_predictor_16x16_dspr2, nullptr,
+                nullptr, nullptr, nullptr, vpx_h_predictor_16x16_dspr2, nullptr,
+                nullptr, nullptr, nullptr, nullptr, nullptr, nullptr)
 #endif  // HAVE_DSPR2
 
 #if HAVE_NEON
@@ -265,81 +271,94 @@ INTRA_PRED_TEST(NEON, TestIntraPred4, vpx_dc_predictor_4x4_neon,
                 vpx_dc_left_predictor_4x4_neon, vpx_dc_top_predictor_4x4_neon,
                 vpx_dc_128_predictor_4x4_neon, vpx_v_predictor_4x4_neon,
                 vpx_h_predictor_4x4_neon, vpx_d45_predictor_4x4_neon,
-                vpx_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL,
-                vpx_tm_predictor_4x4_neon)
+                vpx_d135_predictor_4x4_neon, vpx_d117_predictor_4x4_neon,
+                vpx_d153_predictor_4x4_neon, vpx_d207_predictor_4x4_neon,
+                vpx_d63_predictor_4x4_neon, vpx_tm_predictor_4x4_neon)
 INTRA_PRED_TEST(NEON, TestIntraPred8, vpx_dc_predictor_8x8_neon,
                 vpx_dc_left_predictor_8x8_neon, vpx_dc_top_predictor_8x8_neon,
                 vpx_dc_128_predictor_8x8_neon, vpx_v_predictor_8x8_neon,
                 vpx_h_predictor_8x8_neon, vpx_d45_predictor_8x8_neon,
-                vpx_d135_predictor_8x8_neon, NULL, NULL, NULL, NULL,
-                vpx_tm_predictor_8x8_neon)
+                vpx_d135_predictor_8x8_neon, vpx_d117_predictor_8x8_neon,
+                vpx_d153_predictor_8x8_neon, vpx_d207_predictor_8x8_neon,
+                vpx_d63_predictor_8x8_neon, vpx_tm_predictor_8x8_neon)
 INTRA_PRED_TEST(NEON, TestIntraPred16, vpx_dc_predictor_16x16_neon,
                 vpx_dc_left_predictor_16x16_neon,
                 vpx_dc_top_predictor_16x16_neon,
                 vpx_dc_128_predictor_16x16_neon, vpx_v_predictor_16x16_neon,
                 vpx_h_predictor_16x16_neon, vpx_d45_predictor_16x16_neon,
-                vpx_d135_predictor_16x16_neon, NULL, NULL, NULL, NULL,
-                vpx_tm_predictor_16x16_neon)
+                vpx_d135_predictor_16x16_neon, vpx_d117_predictor_16x16_neon,
+                vpx_d153_predictor_16x16_neon, vpx_d207_predictor_16x16_neon,
+                vpx_d63_predictor_16x16_neon, vpx_tm_predictor_16x16_neon)
 INTRA_PRED_TEST(NEON, TestIntraPred32, vpx_dc_predictor_32x32_neon,
                 vpx_dc_left_predictor_32x32_neon,
                 vpx_dc_top_predictor_32x32_neon,
                 vpx_dc_128_predictor_32x32_neon, vpx_v_predictor_32x32_neon,
                 vpx_h_predictor_32x32_neon, vpx_d45_predictor_32x32_neon,
-                vpx_d135_predictor_32x32_neon, NULL, NULL, NULL, NULL,
-                vpx_tm_predictor_32x32_neon)
+                vpx_d135_predictor_32x32_neon, vpx_d117_predictor_32x32_neon,
+                vpx_d153_predictor_32x32_neon, vpx_d207_predictor_32x32_neon,
+                vpx_d63_predictor_32x32_neon, vpx_tm_predictor_32x32_neon)
 #endif  // HAVE_NEON
 
 #if HAVE_MSA
 INTRA_PRED_TEST(MSA, TestIntraPred4, vpx_dc_predictor_4x4_msa,
                 vpx_dc_left_predictor_4x4_msa, vpx_dc_top_predictor_4x4_msa,
                 vpx_dc_128_predictor_4x4_msa, vpx_v_predictor_4x4_msa,
-                vpx_h_predictor_4x4_msa, NULL, NULL, NULL, NULL, NULL, NULL,
-                vpx_tm_predictor_4x4_msa)
+                vpx_h_predictor_4x4_msa, nullptr, nullptr, nullptr, nullptr,
+                nullptr, nullptr, vpx_tm_predictor_4x4_msa)
 INTRA_PRED_TEST(MSA, TestIntraPred8, vpx_dc_predictor_8x8_msa,
                 vpx_dc_left_predictor_8x8_msa, vpx_dc_top_predictor_8x8_msa,
                 vpx_dc_128_predictor_8x8_msa, vpx_v_predictor_8x8_msa,
-                vpx_h_predictor_8x8_msa, NULL, NULL, NULL, NULL, NULL, NULL,
-                vpx_tm_predictor_8x8_msa)
+                vpx_h_predictor_8x8_msa, nullptr, nullptr, nullptr, nullptr,
+                nullptr, nullptr, vpx_tm_predictor_8x8_msa)
 INTRA_PRED_TEST(MSA, TestIntraPred16, vpx_dc_predictor_16x16_msa,
                 vpx_dc_left_predictor_16x16_msa, vpx_dc_top_predictor_16x16_msa,
                 vpx_dc_128_predictor_16x16_msa, vpx_v_predictor_16x16_msa,
-                vpx_h_predictor_16x16_msa, NULL, NULL, NULL, NULL, NULL, NULL,
-                vpx_tm_predictor_16x16_msa)
+                vpx_h_predictor_16x16_msa, nullptr, nullptr, nullptr, nullptr,
+                nullptr, nullptr, vpx_tm_predictor_16x16_msa)
 INTRA_PRED_TEST(MSA, TestIntraPred32, vpx_dc_predictor_32x32_msa,
                 vpx_dc_left_predictor_32x32_msa, vpx_dc_top_predictor_32x32_msa,
                 vpx_dc_128_predictor_32x32_msa, vpx_v_predictor_32x32_msa,
-                vpx_h_predictor_32x32_msa, NULL, NULL, NULL, NULL, NULL, NULL,
-                vpx_tm_predictor_32x32_msa)
+                vpx_h_predictor_32x32_msa, nullptr, nullptr, nullptr, nullptr,
+                nullptr, nullptr, vpx_tm_predictor_32x32_msa)
 #endif  // HAVE_MSA
 
 #if HAVE_VSX
 // TODO(crbug.com/webm/1522): Fix test failures.
 #if 0
-INTRA_PRED_TEST(VSX, TestIntraPred4, NULL, NULL, NULL, NULL, NULL,
-                vpx_h_predictor_4x4_vsx, NULL, NULL, NULL, NULL, NULL, NULL,
-                vpx_tm_predictor_4x4_vsx)
-
-INTRA_PRED_TEST(VSX, TestIntraPred8, vpx_dc_predictor_8x8_vsx, NULL, NULL, NULL,
-                NULL, vpx_h_predictor_8x8_vsx, vpx_d45_predictor_8x8_vsx, NULL,
-                NULL, NULL, NULL, vpx_d63_predictor_8x8_vsx,
-                vpx_tm_predictor_8x8_vsx)
+INTRA_PRED_TEST(VSX, TestIntraPred4, nullptr, nullptr, nullptr, nullptr,
+                nullptr, vpx_h_predictor_4x4_vsx, nullptr, nullptr, nullptr,
+                nullptr, nullptr, nullptr, vpx_tm_predictor_4x4_vsx)
+
+INTRA_PRED_TEST(VSX, TestIntraPred8, vpx_dc_predictor_8x8_vsx, nullptr, nullptr,
+                nullptr, nullptr, vpx_h_predictor_8x8_vsx,
+                vpx_d45_predictor_8x8_vsx, nullptr, nullptr, nullptr, nullptr,
+                vpx_d63_predictor_8x8_vsx, vpx_tm_predictor_8x8_vsx)
 #endif
 
 INTRA_PRED_TEST(VSX, TestIntraPred16, vpx_dc_predictor_16x16_vsx,
                 vpx_dc_left_predictor_16x16_vsx, vpx_dc_top_predictor_16x16_vsx,
                 vpx_dc_128_predictor_16x16_vsx, vpx_v_predictor_16x16_vsx,
-                vpx_h_predictor_16x16_vsx, vpx_d45_predictor_16x16_vsx, NULL,
-                NULL, NULL, NULL, vpx_d63_predictor_16x16_vsx,
+                vpx_h_predictor_16x16_vsx, vpx_d45_predictor_16x16_vsx, nullptr,
+                nullptr, nullptr, nullptr, vpx_d63_predictor_16x16_vsx,
                 vpx_tm_predictor_16x16_vsx)
 
 INTRA_PRED_TEST(VSX, TestIntraPred32, vpx_dc_predictor_32x32_vsx,
                 vpx_dc_left_predictor_32x32_vsx, vpx_dc_top_predictor_32x32_vsx,
                 vpx_dc_128_predictor_32x32_vsx, vpx_v_predictor_32x32_vsx,
-                vpx_h_predictor_32x32_vsx, vpx_d45_predictor_32x32_vsx, NULL,
-                NULL, NULL, NULL, vpx_d63_predictor_32x32_vsx,
+                vpx_h_predictor_32x32_vsx, vpx_d45_predictor_32x32_vsx, nullptr,
+                nullptr, nullptr, nullptr, vpx_d63_predictor_32x32_vsx,
                 vpx_tm_predictor_32x32_vsx)
 #endif  // HAVE_VSX
 
+#if HAVE_LSX
+INTRA_PRED_TEST(LSX, TestIntraPred8, vpx_dc_predictor_8x8_lsx, nullptr, nullptr,
+                nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
+                nullptr, nullptr, nullptr)
+INTRA_PRED_TEST(LSX, TestIntraPred16, vpx_dc_predictor_16x16_lsx, nullptr,
+                nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
+                nullptr, nullptr, nullptr, nullptr)
+#endif  // HAVE_LSX
+
 // -----------------------------------------------------------------------------
 
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -361,7 +380,7 @@ void TestHighbdIntraPred(const char name[], VpxHighbdPredFunc const *pred_funcs,
   intra_pred_test_mem.Init(block_size, 12);
 
   for (int k = 0; k < kNumVp9IntraPredFuncs; ++k) {
-    if (pred_funcs[k] == NULL) continue;
+    if (pred_funcs[k] == nullptr) continue;
     memcpy(intra_pred_test_mem.src, intra_pred_test_mem.ref_src,
            sizeof(intra_pred_test_mem.src));
     vpx_usec_timer timer;
@@ -487,19 +506,17 @@ HIGHBD_INTRA_PRED_TEST(
     SSE2, TestHighbdIntraPred4, vpx_highbd_dc_predictor_4x4_sse2,
     vpx_highbd_dc_left_predictor_4x4_sse2, vpx_highbd_dc_top_predictor_4x4_sse2,
     vpx_highbd_dc_128_predictor_4x4_sse2, vpx_highbd_v_predictor_4x4_sse2,
-    vpx_highbd_h_predictor_4x4_sse2, NULL, vpx_highbd_d135_predictor_4x4_sse2,
-    vpx_highbd_d117_predictor_4x4_sse2, vpx_highbd_d153_predictor_4x4_sse2,
-    vpx_highbd_d207_predictor_4x4_sse2, vpx_highbd_d63_predictor_4x4_sse2,
-    vpx_highbd_tm_predictor_4x4_c)
-
-HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred8,
-                       vpx_highbd_dc_predictor_8x8_sse2,
-                       vpx_highbd_dc_left_predictor_8x8_sse2,
-                       vpx_highbd_dc_top_predictor_8x8_sse2,
-                       vpx_highbd_dc_128_predictor_8x8_sse2,
-                       vpx_highbd_v_predictor_8x8_sse2,
-                       vpx_highbd_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL,
-                       NULL, NULL, vpx_highbd_tm_predictor_8x8_sse2)
+    vpx_highbd_h_predictor_4x4_sse2, nullptr,
+    vpx_highbd_d135_predictor_4x4_sse2, vpx_highbd_d117_predictor_4x4_sse2,
+    vpx_highbd_d153_predictor_4x4_sse2, vpx_highbd_d207_predictor_4x4_sse2,
+    vpx_highbd_d63_predictor_4x4_sse2, vpx_highbd_tm_predictor_4x4_c)
+
+HIGHBD_INTRA_PRED_TEST(
+    SSE2, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_sse2,
+    vpx_highbd_dc_left_predictor_8x8_sse2, vpx_highbd_dc_top_predictor_8x8_sse2,
+    vpx_highbd_dc_128_predictor_8x8_sse2, vpx_highbd_v_predictor_8x8_sse2,
+    vpx_highbd_h_predictor_8x8_sse2, nullptr, nullptr, nullptr, nullptr,
+    nullptr, nullptr, vpx_highbd_tm_predictor_8x8_sse2)
 
 HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred16,
                        vpx_highbd_dc_predictor_16x16_sse2,
@@ -507,8 +524,9 @@ HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred16,
                        vpx_highbd_dc_top_predictor_16x16_sse2,
                        vpx_highbd_dc_128_predictor_16x16_sse2,
                        vpx_highbd_v_predictor_16x16_sse2,
-                       vpx_highbd_h_predictor_16x16_sse2, NULL, NULL, NULL,
-                       NULL, NULL, NULL, vpx_highbd_tm_predictor_16x16_sse2)
+                       vpx_highbd_h_predictor_16x16_sse2, nullptr, nullptr,
+                       nullptr, nullptr, nullptr, nullptr,
+                       vpx_highbd_tm_predictor_16x16_sse2)
 
 HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred32,
                        vpx_highbd_dc_predictor_32x32_sse2,
@@ -516,35 +534,40 @@ HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred32,
                        vpx_highbd_dc_top_predictor_32x32_sse2,
                        vpx_highbd_dc_128_predictor_32x32_sse2,
                        vpx_highbd_v_predictor_32x32_sse2,
-                       vpx_highbd_h_predictor_32x32_sse2, NULL, NULL, NULL,
-                       NULL, NULL, NULL, vpx_highbd_tm_predictor_32x32_sse2)
+                       vpx_highbd_h_predictor_32x32_sse2, nullptr, nullptr,
+                       nullptr, nullptr, nullptr, nullptr,
+                       vpx_highbd_tm_predictor_32x32_sse2)
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
-HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred4, NULL, NULL, NULL, NULL,
-                       NULL, NULL, vpx_highbd_d45_predictor_4x4_ssse3, NULL,
-                       NULL, NULL, NULL, NULL, NULL)
-HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred8, NULL, NULL, NULL, NULL,
-                       NULL, NULL, vpx_highbd_d45_predictor_8x8_ssse3,
+HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred4, nullptr, nullptr, nullptr,
+                       nullptr, nullptr, nullptr,
+                       vpx_highbd_d45_predictor_4x4_ssse3, nullptr, nullptr,
+                       nullptr, nullptr, nullptr, nullptr)
+HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred8, nullptr, nullptr, nullptr,
+                       nullptr, nullptr, nullptr,
+                       vpx_highbd_d45_predictor_8x8_ssse3,
                        vpx_highbd_d135_predictor_8x8_ssse3,
                        vpx_highbd_d117_predictor_8x8_ssse3,
                        vpx_highbd_d153_predictor_8x8_ssse3,
                        vpx_highbd_d207_predictor_8x8_ssse3,
-                       vpx_highbd_d63_predictor_8x8_ssse3, NULL)
-HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred16, NULL, NULL, NULL, NULL,
-                       NULL, NULL, vpx_highbd_d45_predictor_16x16_ssse3,
+                       vpx_highbd_d63_predictor_8x8_ssse3, nullptr)
+HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred16, nullptr, nullptr, nullptr,
+                       nullptr, nullptr, nullptr,
+                       vpx_highbd_d45_predictor_16x16_ssse3,
                        vpx_highbd_d135_predictor_16x16_ssse3,
                        vpx_highbd_d117_predictor_16x16_ssse3,
                        vpx_highbd_d153_predictor_16x16_ssse3,
                        vpx_highbd_d207_predictor_16x16_ssse3,
-                       vpx_highbd_d63_predictor_16x16_ssse3, NULL)
-HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred32, NULL, NULL, NULL, NULL,
-                       NULL, NULL, vpx_highbd_d45_predictor_32x32_ssse3,
+                       vpx_highbd_d63_predictor_16x16_ssse3, nullptr)
+HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred32, nullptr, nullptr, nullptr,
+                       nullptr, nullptr, nullptr,
+                       vpx_highbd_d45_predictor_32x32_ssse3,
                        vpx_highbd_d135_predictor_32x32_ssse3,
                        vpx_highbd_d117_predictor_32x32_ssse3,
                        vpx_highbd_d153_predictor_32x32_ssse3,
                        vpx_highbd_d207_predictor_32x32_ssse3,
-                       vpx_highbd_d63_predictor_32x32_ssse3, NULL)
+                       vpx_highbd_d63_predictor_32x32_ssse3, nullptr)
 #endif  // HAVE_SSSE3
 
 #if HAVE_NEON
@@ -553,37 +576,41 @@ HIGHBD_INTRA_PRED_TEST(
     vpx_highbd_dc_left_predictor_4x4_neon, vpx_highbd_dc_top_predictor_4x4_neon,
     vpx_highbd_dc_128_predictor_4x4_neon, vpx_highbd_v_predictor_4x4_neon,
     vpx_highbd_h_predictor_4x4_neon, vpx_highbd_d45_predictor_4x4_neon,
-    vpx_highbd_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL,
-    vpx_highbd_tm_predictor_4x4_neon)
+    vpx_highbd_d135_predictor_4x4_neon, vpx_highbd_d117_predictor_4x4_neon,
+    vpx_highbd_d153_predictor_4x4_neon, vpx_highbd_d207_predictor_4x4_neon,
+    vpx_highbd_d63_predictor_4x4_neon, vpx_highbd_tm_predictor_4x4_neon)
 HIGHBD_INTRA_PRED_TEST(
     NEON, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_neon,
     vpx_highbd_dc_left_predictor_8x8_neon, vpx_highbd_dc_top_predictor_8x8_neon,
     vpx_highbd_dc_128_predictor_8x8_neon, vpx_highbd_v_predictor_8x8_neon,
     vpx_highbd_h_predictor_8x8_neon, vpx_highbd_d45_predictor_8x8_neon,
-    vpx_highbd_d135_predictor_8x8_neon, NULL, NULL, NULL, NULL,
-    vpx_highbd_tm_predictor_8x8_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred16,
-                       vpx_highbd_dc_predictor_16x16_neon,
-                       vpx_highbd_dc_left_predictor_16x16_neon,
-                       vpx_highbd_dc_top_predictor_16x16_neon,
-                       vpx_highbd_dc_128_predictor_16x16_neon,
-                       vpx_highbd_v_predictor_16x16_neon,
-                       vpx_highbd_h_predictor_16x16_neon,
-                       vpx_highbd_d45_predictor_16x16_neon,
-                       vpx_highbd_d135_predictor_16x16_neon, NULL, NULL, NULL,
-                       NULL, vpx_highbd_tm_predictor_16x16_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred32,
-                       vpx_highbd_dc_predictor_32x32_neon,
-                       vpx_highbd_dc_left_predictor_32x32_neon,
-                       vpx_highbd_dc_top_predictor_32x32_neon,
-                       vpx_highbd_dc_128_predictor_32x32_neon,
-                       vpx_highbd_v_predictor_32x32_neon,
-                       vpx_highbd_h_predictor_32x32_neon,
-                       vpx_highbd_d45_predictor_32x32_neon,
-                       vpx_highbd_d135_predictor_32x32_neon, NULL, NULL, NULL,
-                       NULL, vpx_highbd_tm_predictor_32x32_neon)
+    vpx_highbd_d135_predictor_8x8_neon, vpx_highbd_d117_predictor_8x8_neon,
+    vpx_highbd_d153_predictor_8x8_neon, vpx_highbd_d207_predictor_8x8_neon,
+    vpx_highbd_d63_predictor_8x8_neon, vpx_highbd_tm_predictor_8x8_neon)
+HIGHBD_INTRA_PRED_TEST(
+    NEON, TestHighbdIntraPred16, vpx_highbd_dc_predictor_16x16_neon,
+    vpx_highbd_dc_left_predictor_16x16_neon,
+    vpx_highbd_dc_top_predictor_16x16_neon,
+    vpx_highbd_dc_128_predictor_16x16_neon, vpx_highbd_v_predictor_16x16_neon,
+    vpx_highbd_h_predictor_16x16_neon, vpx_highbd_d45_predictor_16x16_neon,
+    vpx_highbd_d135_predictor_16x16_neon, vpx_highbd_d117_predictor_16x16_neon,
+    vpx_highbd_d153_predictor_16x16_neon, vpx_highbd_d207_predictor_16x16_neon,
+    vpx_highbd_d63_predictor_16x16_neon, vpx_highbd_tm_predictor_16x16_neon)
+HIGHBD_INTRA_PRED_TEST(
+    NEON, TestHighbdIntraPred32, vpx_highbd_dc_predictor_32x32_neon,
+    vpx_highbd_dc_left_predictor_32x32_neon,
+    vpx_highbd_dc_top_predictor_32x32_neon,
+    vpx_highbd_dc_128_predictor_32x32_neon, vpx_highbd_v_predictor_32x32_neon,
+    vpx_highbd_h_predictor_32x32_neon, vpx_highbd_d45_predictor_32x32_neon,
+    vpx_highbd_d135_predictor_32x32_neon, vpx_highbd_d117_predictor_32x32_neon,
+    vpx_highbd_d153_predictor_32x32_neon, vpx_highbd_d207_predictor_32x32_neon,
+    vpx_highbd_d63_predictor_32x32_neon, vpx_highbd_tm_predictor_32x32_neon)
 #endif  // HAVE_NEON
 
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-#include "test/test_libvpx.cc"
+int main(int argc, char **argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  ::libvpx_test::init_vpx_test();
+  return RUN_ALL_TESTS();
+}
diff --git a/test/test_libvpx.cc b/test/test_libvpx.cc
index 222a83f8c7..c1798b8b8b 100644
--- a/test/test_libvpx.cc
+++ b/test/test_libvpx.cc
@@ -7,69 +7,12 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#include <string>
 
+#include "test/init_vpx_test.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
-#include "./vpx_config.h"
-#if VPX_ARCH_X86 || VPX_ARCH_X86_64
-#include "vpx_ports/x86.h"
-#endif
-extern "C" {
-#if CONFIG_VP8
-extern void vp8_rtcd();
-#endif  // CONFIG_VP8
-#if CONFIG_VP9
-extern void vp9_rtcd();
-#endif  // CONFIG_VP9
-extern void vpx_dsp_rtcd();
-extern void vpx_scale_rtcd();
-}
-
-#if VPX_ARCH_X86 || VPX_ARCH_X86_64
-static void append_negative_gtest_filter(const char *str) {
-  std::string filter = ::testing::FLAGS_gtest_filter;
-  // Negative patterns begin with one '-' followed by a ':' separated list.
-  if (filter.find('-') == std::string::npos) filter += '-';
-  filter += str;
-  ::testing::FLAGS_gtest_filter = filter;
-}
-#endif  // VPX_ARCH_X86 || VPX_ARCH_X86_64
-
 int main(int argc, char **argv) {
   ::testing::InitGoogleTest(&argc, argv);
-
-#if VPX_ARCH_X86 || VPX_ARCH_X86_64
-  const int simd_caps = x86_simd_caps();
-  if (!(simd_caps & HAS_MMX)) append_negative_gtest_filter(":MMX.*:MMX/*");
-  if (!(simd_caps & HAS_SSE)) append_negative_gtest_filter(":SSE.*:SSE/*");
-  if (!(simd_caps & HAS_SSE2)) append_negative_gtest_filter(":SSE2.*:SSE2/*");
-  if (!(simd_caps & HAS_SSE3)) append_negative_gtest_filter(":SSE3.*:SSE3/*");
-  if (!(simd_caps & HAS_SSSE3)) {
-    append_negative_gtest_filter(":SSSE3.*:SSSE3/*");
-  }
-  if (!(simd_caps & HAS_SSE4_1)) {
-    append_negative_gtest_filter(":SSE4_1.*:SSE4_1/*");
-  }
-  if (!(simd_caps & HAS_AVX)) append_negative_gtest_filter(":AVX.*:AVX/*");
-  if (!(simd_caps & HAS_AVX2)) append_negative_gtest_filter(":AVX2.*:AVX2/*");
-  if (!(simd_caps & HAS_AVX512)) {
-    append_negative_gtest_filter(":AVX512.*:AVX512/*");
-  }
-#endif  // VPX_ARCH_X86 || VPX_ARCH_X86_64
-
-#if !CONFIG_SHARED
-// Shared library builds don't support whitebox tests
-// that exercise internal symbols.
-#if CONFIG_VP8
-  vp8_rtcd();
-#endif  // CONFIG_VP8
-#if CONFIG_VP9
-  vp9_rtcd();
-#endif  // CONFIG_VP9
-  vpx_dsp_rtcd();
-  vpx_scale_rtcd();
-#endif  // !CONFIG_SHARED
-
+  ::libvpx_test::init_vpx_test();
   return RUN_ALL_TESTS();
 }
diff --git a/test/test_rc_interface.cc b/test/test_rc_interface.cc
new file mode 100644
index 0000000000..ec75700f73
--- /dev/null
+++ b/test/test_rc_interface.cc
@@ -0,0 +1,6 @@
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+int main(int argc, char **argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/test/test_vector_test.cc b/test/test_vector_test.cc
index 6adcb221ad..ee552113ce 100644
--- a/test/test_vector_test.cc
+++ b/test/test_vector_test.cc
@@ -40,7 +40,7 @@ typedef std::tuple<int, int, const char *> DecodeParam;
 class TestVectorTest : public ::libvpx_test::DecoderTest,
                        public ::libvpx_test::CodecTestWithParam<DecodeParam> {
  protected:
-  TestVectorTest() : DecoderTest(GET_PARAM(0)), md5_file_(NULL) {
+  TestVectorTest() : DecoderTest(GET_PARAM(0)), md5_file_(nullptr) {
 #if CONFIG_VP9_DECODER
     resize_clips_.insert(::libvpx_test::kVP9TestVectorsResize,
                          ::libvpx_test::kVP9TestVectorsResize +
@@ -48,20 +48,19 @@ class TestVectorTest : public ::libvpx_test::DecoderTest,
 #endif
   }
 
-  virtual ~TestVectorTest() {
+  ~TestVectorTest() override {
     if (md5_file_) fclose(md5_file_);
   }
 
   void OpenMD5File(const std::string &md5_file_name_) {
     md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_);
-    ASSERT_TRUE(md5_file_ != NULL)
+    ASSERT_NE(md5_file_, nullptr)
         << "Md5 file open failed. Filename: " << md5_file_name_;
   }
 
 #if CONFIG_VP9_DECODER
-  virtual void PreDecodeFrameHook(
-      const libvpx_test::CompressedVideoSource &video,
-      libvpx_test::Decoder *decoder) {
+  void PreDecodeFrameHook(const libvpx_test::CompressedVideoSource &video,
+                          libvpx_test::Decoder *decoder) override {
     if (video.frame_number() == 0 && mt_mode_ >= 0) {
       if (mt_mode_ == 1) {
         decoder->Control(VP9D_SET_LOOP_FILTER_OPT, 1);
@@ -77,9 +76,9 @@ class TestVectorTest : public ::libvpx_test::DecoderTest,
   }
 #endif
 
-  virtual void DecompressedFrameHook(const vpx_image_t &img,
-                                     const unsigned int frame_number) {
-    ASSERT_TRUE(md5_file_ != NULL);
+  void DecompressedFrameHook(const vpx_image_t &img,
+                             const unsigned int frame_number) override {
+    ASSERT_NE(md5_file_, nullptr);
     char expected_md5[33];
     char junk[128];
 
@@ -137,7 +136,7 @@ TEST_P(TestVectorTest, MD5Match) {
     return;
 #endif
   }
-  ASSERT_TRUE(video.get() != NULL);
+  ASSERT_NE(video.get(), nullptr);
   video->Init();
 
   // Construct md5 file name.
diff --git a/test/tile_independence_test.cc b/test/tile_independence_test.cc
index d92c13f88e..dab6e531b7 100644
--- a/test/tile_independence_test.cc
+++ b/test/tile_independence_test.cc
@@ -36,18 +36,18 @@ class TileIndependenceTest : public ::libvpx_test::EncoderTest,
     inv_dec_->Control(VP9_INVERT_TILE_DECODE_ORDER, 1);
   }
 
-  virtual ~TileIndependenceTest() {
+  ~TileIndependenceTest() override {
     delete fw_dec_;
     delete inv_dec_;
   }
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(libvpx_test::kTwoPassGood);
   }
 
-  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
-                                  libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                          libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP9E_SET_TILE_COLUMNS, n_tiles_);
     }
@@ -65,7 +65,7 @@ class TileIndependenceTest : public ::libvpx_test::EncoderTest,
     md5->Add(img);
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     UpdateMD5(fw_dec_, pkt, &md5_fw_order_);
     UpdateMD5(inv_dec_, pkt, &md5_inv_order_);
   }
diff --git a/test/timestamp_test.cc b/test/timestamp_test.cc
index 645a9f2ff8..00abf8f31c 100644
--- a/test/timestamp_test.cc
+++ b/test/timestamp_test.cc
@@ -42,16 +42,16 @@ class DummyTimebaseVideoSource : public ::libvpx_test::DummyVideoSource {
            (static_cast<double>(framerate_numerator_) / framerate_denominator_);
   }
 
-  virtual vpx_codec_pts_t pts() const {
+  vpx_codec_pts_t pts() const override {
     return static_cast<vpx_codec_pts_t>(frame_ * FrameDuration() +
                                         starting_pts_ + 0.5);
   }
 
-  virtual unsigned long duration() const {
+  unsigned long duration() const override {
     return static_cast<unsigned long>(FrameDuration() + 0.5);
   }
 
-  virtual vpx_rational_t timebase() const { return timebase_; }
+  vpx_rational_t timebase() const override { return timebase_; }
 
   void set_starting_pts(int64_t starting_pts) { starting_pts_ = starting_pts; }
 
@@ -67,9 +67,9 @@ class TimestampTest
       public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
  protected:
   TimestampTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~TimestampTest() {}
+  ~TimestampTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
   }
diff --git a/test/tools_common.sh b/test/tools_common.sh
index 844a12534d..d0dd24df36 100755
--- a/test/tools_common.sh
+++ b/test/tools_common.sh
@@ -133,7 +133,7 @@ vpx_config_option_enabled() {
   vpx_config_option="${1}"
   vpx_config_file="${LIBVPX_CONFIG_PATH}/vpx_config.h"
   config_line=$(grep "${vpx_config_option}" "${vpx_config_file}")
-  if echo "${config_line}" | egrep -q '1$'; then
+  if echo "${config_line}" | grep -E -q '1$'; then
     echo yes
   fi
 }
@@ -222,7 +222,7 @@ filter_strings() {
 
   if [ -n "${filter}" ]; then
     for s in ${strings}; do
-      if echo "${s}" | egrep -q ${exclude} "${filter}" > /dev/null 2>&1; then
+      if echo "${s}" | grep -E -q ${exclude} "${filter}" > /dev/null 2>&1; then
         filtered_strings="${filtered_strings} ${s}"
       fi
     done
@@ -280,7 +280,12 @@ run_tests() {
     test_end "${test}"
   done
 
-  local tested_config="$(test_configuration_target) @ $(current_hash)"
+  # C vs SIMD tests are run for x86 32-bit, 64-bit and ARM platform
+  if [ "${test_name}" = "vp9_c_vs_simd_encode" ]; then
+    local tested_config="$(current_hash)"
+  else
+    local tested_config="$(test_configuration_target) @ $(current_hash)"
+  fi
   echo "${test_name}: Done, all tests pass for ${tested_config}."
 }
 
diff --git a/test/user_priv_test.cc b/test/user_priv_test.cc
index 7bea76b0a9..20741f8268 100644
--- a/test/user_priv_test.cc
+++ b/test/user_priv_test.cc
@@ -57,18 +57,18 @@ string DecodeFile(const string &filename) {
     void *user_priv = reinterpret_cast<void *>(&frame_num);
     const vpx_codec_err_t res =
         decoder.DecodeFrame(video.cxdata(), video.frame_size(),
-                            (frame_num == 0) ? NULL : user_priv);
+                            (frame_num == 0) ? nullptr : user_priv);
     if (res != VPX_CODEC_OK) {
       EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
       break;
     }
     libvpx_test::DxDataIterator dec_iter = decoder.GetDxData();
-    const vpx_image_t *img = NULL;
+    const vpx_image_t *img = nullptr;
 
     // Get decompressed data.
     while ((img = dec_iter.Next())) {
       if (frame_num == 0) {
-        CheckUserPrivateData(img->user_priv, NULL);
+        CheckUserPrivateData(img->user_priv, nullptr);
       } else {
         CheckUserPrivateData(img->user_priv, &frame_num);
 
@@ -78,7 +78,7 @@ string DecodeFile(const string &filename) {
         ref.idx = rnd.Rand8() % 3;
         decoder.Control(VP9_GET_REFERENCE, &ref);
 
-        CheckUserPrivateData(ref.img.user_priv, NULL);
+        CheckUserPrivateData(ref.img.user_priv, nullptr);
       }
       md5.Add(img);
     }
diff --git a/test/variance_test.cc b/test/variance_test.cc
index a6db9cff42..b8320e9ceb 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -210,7 +210,7 @@ class SumOfSquaresTest : public ::testing::TestWithParam<SumOfSquaresFunction> {
  public:
   SumOfSquaresTest() : func_(GetParam()) {}
 
-  virtual ~SumOfSquaresTest() { libvpx_test::ClearSystemState(); }
+  ~SumOfSquaresTest() override { libvpx_test::ClearSystemState(); }
 
  protected:
   void ConstTest();
@@ -253,7 +253,7 @@ void SumOfSquaresTest::RefTest() {
 
 template <typename Func>
 struct TestParams {
-  TestParams(int log2w = 0, int log2h = 0, Func function = NULL,
+  TestParams(int log2w = 0, int log2h = 0, Func function = nullptr,
              int bit_depth_value = 0)
       : log2width(log2w), log2height(log2h), func(function) {
     use_high_bit_depth = (bit_depth_value > 0);
@@ -289,7 +289,7 @@ template <typename FunctionType>
 class MainTestClass
     : public ::testing::TestWithParam<TestParams<FunctionType> > {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     params_ = this->GetParam();
 
     rnd_.Reset(ACMRandom::DeterministicSeed());
@@ -297,8 +297,8 @@ class MainTestClass
         use_high_bit_depth() ? sizeof(uint16_t) : sizeof(uint8_t);
     src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size() * unit));
     ref_ = new uint8_t[block_size() * unit];
-    ASSERT_TRUE(src_ != NULL);
-    ASSERT_TRUE(ref_ != NULL);
+    ASSERT_NE(src_, nullptr);
+    ASSERT_NE(ref_, nullptr);
 #if CONFIG_VP9_HIGHBITDEPTH
     if (use_high_bit_depth()) {
       // TODO(skal): remove!
@@ -308,7 +308,7 @@ class MainTestClass
 #endif
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
 #if CONFIG_VP9_HIGHBITDEPTH
     if (use_high_bit_depth()) {
       // TODO(skal): remove!
@@ -319,8 +319,8 @@ class MainTestClass
 
     vpx_free(src_);
     delete[] ref_;
-    src_ = NULL;
-    ref_ = NULL;
+    src_ = nullptr;
+    ref_ = nullptr;
     libvpx_test::ClearSystemState();
   }
 
@@ -488,8 +488,8 @@ void MainTestClass<VarianceFunctionType>::SpeedTest() {
   }
   vpx_usec_timer_mark(&timer);
   const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
-  printf("Variance %dx%d time: %5d ms\n", width(), height(),
-         elapsed_time / 1000);
+  printf("Variance %dx%d %dbpp time: %5d ms\n", width(), height(),
+         params_.bit_depth, elapsed_time / 1000);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -499,14 +499,21 @@ template <typename FunctionType>
 void MainTestClass<FunctionType>::RefTestMse() {
   for (int i = 0; i < 10; ++i) {
     for (int j = 0; j < block_size(); ++j) {
-      src_[j] = rnd_.Rand8();
-      ref_[j] = rnd_.Rand8();
+      if (!use_high_bit_depth()) {
+        src_[j] = rnd_.Rand8();
+        ref_[j] = rnd_.Rand8();
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
+        CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
     }
     unsigned int sse1, sse2;
     const int stride = width();
     ASM_REGISTER_STATE_CHECK(params_.func(src_, stride, ref_, stride, &sse1));
     variance_ref(src_, ref_, params_.log2width, params_.log2height, stride,
-                 stride, &sse2, false, VPX_BITS_8);
+                 stride, &sse2, use_high_bit_depth(), params_.bit_depth);
     EXPECT_EQ(sse1, sse2);
   }
 }
@@ -530,8 +537,15 @@ void MainTestClass<FunctionType>::RefTestSse() {
 
 template <typename FunctionType>
 void MainTestClass<FunctionType>::MaxTestMse() {
-  memset(src_, 255, block_size());
-  memset(ref_, 0, block_size());
+  if (!use_high_bit_depth()) {
+    memset(src_, 255, block_size());
+    memset(ref_, 0, block_size());
+#if CONFIG_VP9_HIGHBITDEPTH
+  } else {
+    vpx_memset16(CONVERT_TO_SHORTPTR(src_), 255 << byte_shift(), block_size());
+    vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 0, block_size());
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
   unsigned int sse;
   ASM_REGISTER_STATE_CHECK(params_.func(src_, width(), ref_, width(), &sse));
   const unsigned int expected = block_size() * 255 * 255;
@@ -554,7 +568,7 @@ template <typename FunctionType>
 class SubpelVarianceTest
     : public ::testing::TestWithParam<TestParams<FunctionType> > {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     params_ = this->GetParam();
 
     rnd_.Reset(ACMRandom::DeterministicSeed());
@@ -573,12 +587,12 @@ class SubpelVarianceTest
           (block_size() + width() + height() + 1) * sizeof(uint16_t))));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
     }
-    ASSERT_TRUE(src_ != NULL);
-    ASSERT_TRUE(sec_ != NULL);
-    ASSERT_TRUE(ref_ != NULL);
+    ASSERT_NE(src_, nullptr);
+    ASSERT_NE(sec_, nullptr);
+    ASSERT_NE(ref_, nullptr);
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     if (!use_high_bit_depth()) {
       vpx_free(src_);
       vpx_free(sec_);
@@ -596,6 +610,7 @@ class SubpelVarianceTest
  protected:
   void RefTest();
   void ExtremeRefTest();
+  void SpeedTest();
 
   ACMRandom rnd_;
   uint8_t *src_;
@@ -681,6 +696,37 @@ void SubpelVarianceTest<SubpelVarianceFunctionType>::ExtremeRefTest() {
   }
 }
 
+template <typename SubpelVarianceFunctionType>
+void SubpelVarianceTest<SubpelVarianceFunctionType>::SpeedTest() {
+  // The only interesting points are 0, 4, and anything else. To make the loops
+  // simple we will use 0, 2 and 4.
+  for (int x = 0; x <= 4; x += 2) {
+    for (int y = 0; y <= 4; y += 2) {
+      if (!use_high_bit_depth()) {
+        memset(src_, 25, block_size());
+        memset(ref_, 50, block_size());
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        vpx_memset16(CONVERT_TO_SHORTPTR(src_), 25, block_size());
+        vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 50, block_size());
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
+      unsigned int sse;
+      vpx_usec_timer timer;
+      vpx_usec_timer_start(&timer);
+      for (int i = 0; i < 1000000000 / block_size(); ++i) {
+        const uint32_t variance =
+            params_.func(ref_, width() + 1, x, y, src_, width(), &sse);
+        (void)variance;
+      }
+      vpx_usec_timer_mark(&timer);
+      const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+      printf("SubpelVariance %dx%d xoffset: %d yoffset: %d time: %5d ms\n",
+             width(), height(), x, y, elapsed_time / 1000);
+    }
+  }
+}
+
 template <>
 void SubpelVarianceTest<vpx_subp_avg_variance_fn_t>::RefTest() {
   for (int x = 0; x < 8; ++x) {
@@ -727,6 +773,7 @@ TEST_P(VpxSseTest, RefSse) { RefTestSse(); }
 TEST_P(VpxSseTest, MaxSse) { MaxTestSse(); }
 TEST_P(VpxMseTest, RefMse) { RefTestMse(); }
 TEST_P(VpxMseTest, MaxMse) { MaxTestMse(); }
+TEST_P(VpxMseTest, DISABLED_Speed) { SpeedTest(); }
 TEST_P(VpxVarianceTest, Zero) { ZeroTest(); }
 TEST_P(VpxVarianceTest, Ref) { RefTest(); }
 TEST_P(VpxVarianceTest, RefStride) { RefStrideTest(); }
@@ -736,6 +783,7 @@ TEST_P(SumOfSquaresTest, Const) { ConstTest(); }
 TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
 TEST_P(VpxSubpelVarianceTest, Ref) { RefTest(); }
 TEST_P(VpxSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
+TEST_P(VpxSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); }
 TEST_P(VpxSubpelAvgVarianceTest, Ref) { RefTest(); }
 
 INSTANTIATE_TEST_SUITE_P(C, SumOfSquaresTest,
@@ -807,14 +855,11 @@ INSTANTIATE_TEST_SUITE_P(
         SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_c, 0)));
 
 #if CONFIG_VP9_HIGHBITDEPTH
-typedef MainTestClass<vpx_variance_fn_t> VpxHBDMseTest;
 typedef MainTestClass<vpx_variance_fn_t> VpxHBDVarianceTest;
 typedef SubpelVarianceTest<vpx_subpixvariance_fn_t> VpxHBDSubpelVarianceTest;
 typedef SubpelVarianceTest<vpx_subp_avg_variance_fn_t>
     VpxHBDSubpelAvgVarianceTest;
 
-TEST_P(VpxHBDMseTest, RefMse) { RefTestMse(); }
-TEST_P(VpxHBDMseTest, MaxMse) { MaxTestMse(); }
 TEST_P(VpxHBDVarianceTest, Zero) { ZeroTest(); }
 TEST_P(VpxHBDVarianceTest, Ref) { RefTest(); }
 TEST_P(VpxHBDVarianceTest, RefStride) { RefStrideTest(); }
@@ -824,22 +869,26 @@ TEST_P(VpxHBDSubpelVarianceTest, Ref) { RefTest(); }
 TEST_P(VpxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
 TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); }
 
-/* TODO(debargha): This test does not support the highbd version
+typedef MainTestClass<vpx_variance_fn_t> VpxHBDMseTest;
+TEST_P(VpxHBDMseTest, RefMse) { RefTestMse(); }
+TEST_P(VpxHBDMseTest, MaxMse) { MaxTestMse(); }
+TEST_P(VpxHBDMseTest, DISABLED_Speed) { SpeedTest(); }
 INSTANTIATE_TEST_SUITE_P(
     C, VpxHBDMseTest,
-    ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_c),
-                      MseParams(4, 4, &vpx_highbd_12_mse16x8_c),
-                      MseParams(4, 4, &vpx_highbd_12_mse8x16_c),
-                      MseParams(4, 4, &vpx_highbd_12_mse8x8_c),
-                      MseParams(4, 4, &vpx_highbd_10_mse16x16_c),
-                      MseParams(4, 4, &vpx_highbd_10_mse16x8_c),
-                      MseParams(4, 4, &vpx_highbd_10_mse8x16_c),
-                      MseParams(4, 4, &vpx_highbd_10_mse8x8_c),
-                      MseParams(4, 4, &vpx_highbd_8_mse16x16_c),
-                      MseParams(4, 4, &vpx_highbd_8_mse16x8_c),
-                      MseParams(4, 4, &vpx_highbd_8_mse8x16_c),
-                      MseParams(4, 4, &vpx_highbd_8_mse8x8_c)));
-*/
+    ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_c, VPX_BITS_12),
+                      MseParams(4, 3, &vpx_highbd_12_mse16x8_c, VPX_BITS_12),
+                      MseParams(3, 4, &vpx_highbd_12_mse8x16_c, VPX_BITS_12),
+                      MseParams(3, 3, &vpx_highbd_12_mse8x8_c, VPX_BITS_12),
+                      MseParams(4, 4, &vpx_highbd_10_mse16x16_c, VPX_BITS_10),
+                      MseParams(4, 3, &vpx_highbd_10_mse16x8_c, VPX_BITS_10),
+                      MseParams(3, 4, &vpx_highbd_10_mse8x16_c, VPX_BITS_10),
+                      MseParams(3, 3, &vpx_highbd_10_mse8x8_c, VPX_BITS_10),
+                      MseParams(4, 4, &vpx_highbd_8_mse16x16_c, VPX_BITS_8),
+                      MseParams(4, 3, &vpx_highbd_8_mse16x8_c, VPX_BITS_8),
+                      MseParams(3, 4, &vpx_highbd_8_mse8x16_c, VPX_BITS_8),
+                      MseParams(3, 3, &vpx_highbd_8_mse8x8_c, VPX_BITS_8)));
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(VpxHBDMseTest);
 
 INSTANTIATE_TEST_SUITE_P(
     C, VpxHBDVarianceTest,
@@ -1104,22 +1153,15 @@ INSTANTIATE_TEST_SUITE_P(
         SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0)));
 
 #if CONFIG_VP9_HIGHBITDEPTH
-/* TODO(debargha): This test does not support the highbd version
 INSTANTIATE_TEST_SUITE_P(
     SSE2, VpxHBDMseTest,
-    ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_sse2),
-                      MseParams(4, 3, &vpx_highbd_12_mse16x8_sse2),
-                      MseParams(3, 4, &vpx_highbd_12_mse8x16_sse2),
-                      MseParams(3, 3, &vpx_highbd_12_mse8x8_sse2),
-                      MseParams(4, 4, &vpx_highbd_10_mse16x16_sse2),
-                      MseParams(4, 3, &vpx_highbd_10_mse16x8_sse2),
-                      MseParams(3, 4, &vpx_highbd_10_mse8x16_sse2),
-                      MseParams(3, 3, &vpx_highbd_10_mse8x8_sse2),
-                      MseParams(4, 4, &vpx_highbd_8_mse16x16_sse2),
-                      MseParams(4, 3, &vpx_highbd_8_mse16x8_sse2),
-                      MseParams(3, 4, &vpx_highbd_8_mse8x16_sse2),
-                      MseParams(3, 3, &vpx_highbd_8_mse8x8_sse2)));
-*/
+    ::testing::Values(
+        MseParams(4, 4, &vpx_highbd_12_mse16x16_sse2, VPX_BITS_12),
+        MseParams(3, 3, &vpx_highbd_12_mse8x8_sse2, VPX_BITS_12),
+        MseParams(4, 4, &vpx_highbd_10_mse16x16_sse2, VPX_BITS_10),
+        MseParams(3, 3, &vpx_highbd_10_mse8x8_sse2, VPX_BITS_10),
+        MseParams(4, 4, &vpx_highbd_8_mse16x16_sse2, VPX_BITS_8),
+        MseParams(3, 3, &vpx_highbd_8_mse8x8_sse2, VPX_BITS_8)));
 
 INSTANTIATE_TEST_SUITE_P(
     SSE2, VpxHBDVarianceTest,
@@ -1387,7 +1429,10 @@ INSTANTIATE_TEST_SUITE_P(
                       VarianceParams(5, 4, &vpx_variance32x16_avx2),
                       VarianceParams(4, 5, &vpx_variance16x32_avx2),
                       VarianceParams(4, 4, &vpx_variance16x16_avx2),
-                      VarianceParams(4, 3, &vpx_variance16x8_avx2)));
+                      VarianceParams(4, 3, &vpx_variance16x8_avx2),
+                      VarianceParams(3, 4, &vpx_variance8x16_avx2),
+                      VarianceParams(3, 3, &vpx_variance8x8_avx2),
+                      VarianceParams(3, 2, &vpx_variance8x4_avx2)));
 
 INSTANTIATE_TEST_SUITE_P(
     AVX2, VpxSubpelVarianceTest,
@@ -1409,8 +1454,10 @@ INSTANTIATE_TEST_SUITE_P(NEON, VpxSseTest,
                                                      &vpx_get4x4sse_cs_neon)));
 
 INSTANTIATE_TEST_SUITE_P(NEON, VpxMseTest,
-                         ::testing::Values(MseParams(4, 4,
-                                                     &vpx_mse16x16_neon)));
+                         ::testing::Values(MseParams(4, 4, &vpx_mse16x16_neon),
+                                           MseParams(4, 3, &vpx_mse16x8_neon),
+                                           MseParams(3, 4, &vpx_mse8x16_neon),
+                                           MseParams(3, 3, &vpx_mse8x8_neon)));
 
 INSTANTIATE_TEST_SUITE_P(
     NEON, VpxVarianceTest,
@@ -1428,6 +1475,35 @@ INSTANTIATE_TEST_SUITE_P(
                       VarianceParams(2, 3, &vpx_variance4x8_neon),
                       VarianceParams(2, 2, &vpx_variance4x4_neon)));
 
+#if HAVE_NEON_DOTPROD
+INSTANTIATE_TEST_SUITE_P(
+    NEON_DOTPROD, VpxSseTest,
+    ::testing::Values(SseParams(2, 2, &vpx_get4x4sse_cs_neon_dotprod)));
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON_DOTPROD, VpxMseTest,
+    ::testing::Values(MseParams(4, 4, &vpx_mse16x16_neon_dotprod),
+                      MseParams(4, 3, &vpx_mse16x8_neon_dotprod),
+                      MseParams(3, 4, &vpx_mse8x16_neon_dotprod),
+                      MseParams(3, 3, &vpx_mse8x8_neon_dotprod)));
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON_DOTPROD, VpxVarianceTest,
+    ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_neon_dotprod),
+                      VarianceParams(6, 5, &vpx_variance64x32_neon_dotprod),
+                      VarianceParams(5, 6, &vpx_variance32x64_neon_dotprod),
+                      VarianceParams(5, 5, &vpx_variance32x32_neon_dotprod),
+                      VarianceParams(5, 4, &vpx_variance32x16_neon_dotprod),
+                      VarianceParams(4, 5, &vpx_variance16x32_neon_dotprod),
+                      VarianceParams(4, 4, &vpx_variance16x16_neon_dotprod),
+                      VarianceParams(4, 3, &vpx_variance16x8_neon_dotprod),
+                      VarianceParams(3, 4, &vpx_variance8x16_neon_dotprod),
+                      VarianceParams(3, 3, &vpx_variance8x8_neon_dotprod),
+                      VarianceParams(3, 2, &vpx_variance8x4_neon_dotprod),
+                      VarianceParams(2, 3, &vpx_variance4x8_neon_dotprod),
+                      VarianceParams(2, 2, &vpx_variance4x4_neon_dotprod)));
+#endif  // HAVE_NEON_DOTPROD
+
 INSTANTIATE_TEST_SUITE_P(
     NEON, VpxSubpelVarianceTest,
     ::testing::Values(
@@ -1461,6 +1537,282 @@ INSTANTIATE_TEST_SUITE_P(
         SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_neon, 0),
         SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_neon, 0),
         SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_neon, 0)));
+
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+    NEON, VpxHBDMseTest,
+    ::testing::Values(
+        MseParams(4, 4, &vpx_highbd_12_mse16x16_neon, VPX_BITS_12),
+        MseParams(4, 3, &vpx_highbd_12_mse16x8_neon, VPX_BITS_12),
+        MseParams(3, 4, &vpx_highbd_12_mse8x16_neon, VPX_BITS_12),
+        MseParams(3, 3, &vpx_highbd_12_mse8x8_neon, VPX_BITS_12),
+        MseParams(4, 4, &vpx_highbd_10_mse16x16_neon, VPX_BITS_10),
+        MseParams(4, 3, &vpx_highbd_10_mse16x8_neon, VPX_BITS_10),
+        MseParams(3, 4, &vpx_highbd_10_mse8x16_neon, VPX_BITS_10),
+        MseParams(3, 3, &vpx_highbd_10_mse8x8_neon, VPX_BITS_10),
+        MseParams(4, 4, &vpx_highbd_8_mse16x16_neon, VPX_BITS_8),
+        MseParams(4, 3, &vpx_highbd_8_mse16x8_neon, VPX_BITS_8),
+        MseParams(3, 4, &vpx_highbd_8_mse8x16_neon, VPX_BITS_8),
+        MseParams(3, 3, &vpx_highbd_8_mse8x8_neon, VPX_BITS_8)));
+
+// TODO(webm:1819): Re-enable when vpx_highbd_8_mse16x16_neon_dotprod, etc. can
+// be used again.
+#if 0
+#if HAVE_NEON_DOTPROD
+INSTANTIATE_TEST_SUITE_P(
+    NEON_DOTPROD, VpxHBDMseTest,
+    ::testing::Values(
+        MseParams(4, 4, &vpx_highbd_8_mse16x16_neon_dotprod, VPX_BITS_8),
+        MseParams(4, 3, &vpx_highbd_8_mse16x8_neon_dotprod, VPX_BITS_8),
+        MseParams(3, 4, &vpx_highbd_8_mse8x16_neon_dotprod, VPX_BITS_8),
+        MseParams(3, 3, &vpx_highbd_8_mse8x8_neon_dotprod, VPX_BITS_8)));
+#endif  // HAVE_NEON_DOTPROD
+#endif  // 0
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON, VpxHBDVarianceTest,
+    ::testing::Values(
+        VarianceParams(6, 6, &vpx_highbd_12_variance64x64_neon, 12),
+        VarianceParams(6, 5, &vpx_highbd_12_variance64x32_neon, 12),
+        VarianceParams(5, 6, &vpx_highbd_12_variance32x64_neon, 12),
+        VarianceParams(5, 5, &vpx_highbd_12_variance32x32_neon, 12),
+        VarianceParams(5, 4, &vpx_highbd_12_variance32x16_neon, 12),
+        VarianceParams(4, 5, &vpx_highbd_12_variance16x32_neon, 12),
+        VarianceParams(4, 4, &vpx_highbd_12_variance16x16_neon, 12),
+        VarianceParams(4, 3, &vpx_highbd_12_variance16x8_neon, 12),
+        VarianceParams(3, 4, &vpx_highbd_12_variance8x16_neon, 12),
+        VarianceParams(3, 3, &vpx_highbd_12_variance8x8_neon, 12),
+        VarianceParams(3, 2, &vpx_highbd_12_variance8x4_neon, 12),
+        VarianceParams(2, 3, &vpx_highbd_12_variance4x8_neon, 12),
+        VarianceParams(2, 2, &vpx_highbd_12_variance4x4_neon, 12),
+        VarianceParams(6, 6, &vpx_highbd_10_variance64x64_neon, 10),
+        VarianceParams(6, 5, &vpx_highbd_10_variance64x32_neon, 10),
+        VarianceParams(5, 6, &vpx_highbd_10_variance32x64_neon, 10),
+        VarianceParams(5, 5, &vpx_highbd_10_variance32x32_neon, 10),
+        VarianceParams(5, 4, &vpx_highbd_10_variance32x16_neon, 10),
+        VarianceParams(4, 5, &vpx_highbd_10_variance16x32_neon, 10),
+        VarianceParams(4, 4, &vpx_highbd_10_variance16x16_neon, 10),
+        VarianceParams(4, 3, &vpx_highbd_10_variance16x8_neon, 10),
+        VarianceParams(3, 4, &vpx_highbd_10_variance8x16_neon, 10),
+        VarianceParams(3, 3, &vpx_highbd_10_variance8x8_neon, 10),
+        VarianceParams(3, 2, &vpx_highbd_10_variance8x4_neon, 10),
+        VarianceParams(2, 3, &vpx_highbd_10_variance4x8_neon, 10),
+        VarianceParams(2, 2, &vpx_highbd_10_variance4x4_neon, 10),
+        VarianceParams(6, 6, &vpx_highbd_8_variance64x64_neon, 8),
+        VarianceParams(6, 5, &vpx_highbd_8_variance64x32_neon, 8),
+        VarianceParams(5, 6, &vpx_highbd_8_variance32x64_neon, 8),
+        VarianceParams(5, 5, &vpx_highbd_8_variance32x32_neon, 8),
+        VarianceParams(5, 4, &vpx_highbd_8_variance32x16_neon, 8),
+        VarianceParams(4, 5, &vpx_highbd_8_variance16x32_neon, 8),
+        VarianceParams(4, 4, &vpx_highbd_8_variance16x16_neon, 8),
+        VarianceParams(4, 3, &vpx_highbd_8_variance16x8_neon, 8),
+        VarianceParams(3, 4, &vpx_highbd_8_variance8x16_neon, 8),
+        VarianceParams(3, 3, &vpx_highbd_8_variance8x8_neon, 8),
+        VarianceParams(3, 2, &vpx_highbd_8_variance8x4_neon, 8),
+        VarianceParams(2, 3, &vpx_highbd_8_variance4x8_neon, 8),
+        VarianceParams(2, 2, &vpx_highbd_8_variance4x4_neon, 8)));
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON, VpxHBDSubpelVarianceTest,
+    ::testing::Values(
+        SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_neon,
+                             12),
+        SubpelVarianceParams(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_neon,
+                             12),
+        SubpelVarianceParams(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_neon,
+                             12),
+        SubpelVarianceParams(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_neon,
+                             12),
+        SubpelVarianceParams(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_neon,
+                             12),
+        SubpelVarianceParams(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_neon,
+                             12),
+        SubpelVarianceParams(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_neon,
+                             12),
+        SubpelVarianceParams(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_neon,
+                             12),
+        SubpelVarianceParams(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_neon,
+                             12),
+        SubpelVarianceParams(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_neon,
+                             12),
+        SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_neon,
+                             12),
+        SubpelVarianceParams(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_neon,
+                             12),
+        SubpelVarianceParams(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_neon,
+                             12),
+        SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_neon,
+                             10),
+        SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_neon,
+                             10),
+        SubpelVarianceParams(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_neon,
+                             10),
+        SubpelVarianceParams(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_neon,
+                             10),
+        SubpelVarianceParams(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_neon,
+                             10),
+        SubpelVarianceParams(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_neon,
+                             10),
+        SubpelVarianceParams(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_neon,
+                             10),
+        SubpelVarianceParams(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_neon,
+                             10),
+        SubpelVarianceParams(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_neon,
+                             10),
+        SubpelVarianceParams(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_neon,
+                             10),
+        SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_neon,
+                             10),
+        SubpelVarianceParams(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_neon,
+                             10),
+        SubpelVarianceParams(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_neon,
+                             10),
+        SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_neon,
+                             8),
+        SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_neon,
+                             8),
+        SubpelVarianceParams(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_neon,
+                             8),
+        SubpelVarianceParams(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_neon,
+                             8),
+        SubpelVarianceParams(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_neon,
+                             8),
+        SubpelVarianceParams(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_neon,
+                             8),
+        SubpelVarianceParams(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_neon,
+                             8),
+        SubpelVarianceParams(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_neon,
+                             8),
+        SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_neon,
+                             8),
+        SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_neon, 8),
+        SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_neon, 8),
+        SubpelVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_neon, 8),
+        SubpelVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_neon,
+                             8)));
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON, VpxHBDSubpelAvgVarianceTest,
+    ::testing::Values(
+        SubpelAvgVarianceParams(6, 6,
+                                &vpx_highbd_12_sub_pixel_avg_variance64x64_neon,
+                                12),
+        SubpelAvgVarianceParams(6, 5,
+                                &vpx_highbd_12_sub_pixel_avg_variance64x32_neon,
+                                12),
+        SubpelAvgVarianceParams(5, 6,
+                                &vpx_highbd_12_sub_pixel_avg_variance32x64_neon,
+                                12),
+        SubpelAvgVarianceParams(5, 5,
+                                &vpx_highbd_12_sub_pixel_avg_variance32x32_neon,
+                                12),
+        SubpelAvgVarianceParams(5, 4,
+                                &vpx_highbd_12_sub_pixel_avg_variance32x16_neon,
+                                12),
+        SubpelAvgVarianceParams(4, 5,
+                                &vpx_highbd_12_sub_pixel_avg_variance16x32_neon,
+                                12),
+        SubpelAvgVarianceParams(4, 4,
+                                &vpx_highbd_12_sub_pixel_avg_variance16x16_neon,
+                                12),
+        SubpelAvgVarianceParams(4, 3,
+                                &vpx_highbd_12_sub_pixel_avg_variance16x8_neon,
+                                12),
+        SubpelAvgVarianceParams(3, 4,
+                                &vpx_highbd_12_sub_pixel_avg_variance8x16_neon,
+                                12),
+        SubpelAvgVarianceParams(3, 3,
+                                &vpx_highbd_12_sub_pixel_avg_variance8x8_neon,
+                                12),
+        SubpelAvgVarianceParams(3, 2,
+                                &vpx_highbd_12_sub_pixel_avg_variance8x4_neon,
+                                12),
+        SubpelAvgVarianceParams(2, 3,
+                                &vpx_highbd_12_sub_pixel_avg_variance4x8_neon,
+                                12),
+        SubpelAvgVarianceParams(2, 2,
+                                &vpx_highbd_12_sub_pixel_avg_variance4x4_neon,
+                                12),
+        SubpelAvgVarianceParams(6, 6,
+                                &vpx_highbd_10_sub_pixel_avg_variance64x64_neon,
+                                10),
+        SubpelAvgVarianceParams(6, 5,
+                                &vpx_highbd_10_sub_pixel_avg_variance64x32_neon,
+                                10),
+        SubpelAvgVarianceParams(5, 6,
+                                &vpx_highbd_10_sub_pixel_avg_variance32x64_neon,
+                                10),
+        SubpelAvgVarianceParams(5, 5,
+                                &vpx_highbd_10_sub_pixel_avg_variance32x32_neon,
+                                10),
+        SubpelAvgVarianceParams(5, 4,
+                                &vpx_highbd_10_sub_pixel_avg_variance32x16_neon,
+                                10),
+        SubpelAvgVarianceParams(4, 5,
+                                &vpx_highbd_10_sub_pixel_avg_variance16x32_neon,
+                                10),
+        SubpelAvgVarianceParams(4, 4,
+                                &vpx_highbd_10_sub_pixel_avg_variance16x16_neon,
+                                10),
+        SubpelAvgVarianceParams(4, 3,
+                                &vpx_highbd_10_sub_pixel_avg_variance16x8_neon,
+                                10),
+        SubpelAvgVarianceParams(3, 4,
+                                &vpx_highbd_10_sub_pixel_avg_variance8x16_neon,
+                                10),
+        SubpelAvgVarianceParams(3, 3,
+                                &vpx_highbd_10_sub_pixel_avg_variance8x8_neon,
+                                10),
+        SubpelAvgVarianceParams(3, 2,
+                                &vpx_highbd_10_sub_pixel_avg_variance8x4_neon,
+                                10),
+        SubpelAvgVarianceParams(2, 3,
+                                &vpx_highbd_10_sub_pixel_avg_variance4x8_neon,
+                                10),
+        SubpelAvgVarianceParams(2, 2,
+                                &vpx_highbd_10_sub_pixel_avg_variance4x4_neon,
+                                10),
+        SubpelAvgVarianceParams(6, 6,
+                                &vpx_highbd_8_sub_pixel_avg_variance64x64_neon,
+                                8),
+        SubpelAvgVarianceParams(6, 5,
+                                &vpx_highbd_8_sub_pixel_avg_variance64x32_neon,
+                                8),
+        SubpelAvgVarianceParams(5, 6,
+                                &vpx_highbd_8_sub_pixel_avg_variance32x64_neon,
+                                8),
+        SubpelAvgVarianceParams(5, 5,
+                                &vpx_highbd_8_sub_pixel_avg_variance32x32_neon,
+                                8),
+        SubpelAvgVarianceParams(5, 4,
+                                &vpx_highbd_8_sub_pixel_avg_variance32x16_neon,
+                                8),
+        SubpelAvgVarianceParams(4, 5,
+                                &vpx_highbd_8_sub_pixel_avg_variance16x32_neon,
+                                8),
+        SubpelAvgVarianceParams(4, 4,
+                                &vpx_highbd_8_sub_pixel_avg_variance16x16_neon,
+                                8),
+        SubpelAvgVarianceParams(4, 3,
+                                &vpx_highbd_8_sub_pixel_avg_variance16x8_neon,
+                                8),
+        SubpelAvgVarianceParams(3, 4,
+                                &vpx_highbd_8_sub_pixel_avg_variance8x16_neon,
+                                8),
+        SubpelAvgVarianceParams(3, 3,
+                                &vpx_highbd_8_sub_pixel_avg_variance8x8_neon,
+                                8),
+        SubpelAvgVarianceParams(3, 2,
+                                &vpx_highbd_8_sub_pixel_avg_variance8x4_neon,
+                                8),
+        SubpelAvgVarianceParams(2, 3,
+                                &vpx_highbd_8_sub_pixel_avg_variance4x8_neon,
+                                8),
+        SubpelAvgVarianceParams(2, 2,
+                                &vpx_highbd_8_sub_pixel_avg_variance4x4_neon,
+                                8)));
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_NEON
 
 #if HAVE_MSA
@@ -1615,4 +1967,27 @@ INSTANTIATE_TEST_SUITE_P(
         SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_mmi, 0),
         SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_mmi, 0)));
 #endif  // HAVE_MMI
+
+#if HAVE_LSX
+INSTANTIATE_TEST_SUITE_P(LSX, VpxMseTest,
+                         ::testing::Values(MseParams(4, 4, &vpx_mse16x16_lsx)));
+
+INSTANTIATE_TEST_SUITE_P(
+    LSX, VpxVarianceTest,
+    ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_lsx),
+                      VarianceParams(5, 5, &vpx_variance32x32_lsx),
+                      VarianceParams(4, 4, &vpx_variance16x16_lsx),
+                      VarianceParams(3, 3, &vpx_variance8x8_lsx)));
+
+INSTANTIATE_TEST_SUITE_P(
+    LSX, VpxSubpelVarianceTest,
+    ::testing::Values(
+        SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_lsx, 0),
+        SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_lsx, 0),
+        SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_lsx, 0)));
+
+INSTANTIATE_TEST_SUITE_P(LSX, VpxSubpelAvgVarianceTest,
+                         ::testing::Values(SubpelAvgVarianceParams(
+                             6, 6, &vpx_sub_pixel_avg_variance64x64_lsx, 0)));
+#endif
 }  // namespace
diff --git a/test/video_source.h b/test/video_source.h
index e9340f21e9..2194126f1f 100644
--- a/test/video_source.h
+++ b/test/video_source.h
@@ -20,8 +20,14 @@
 #endif
 #include <cstdio>
 #include <cstdlib>
+#include <cstring>
+#include <memory>
 #include <string>
+
 #include "test/acm_random.h"
+#if !defined(_WIN32)
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#endif
 #include "vpx/vpx_encoder.h"
 
 namespace libvpx_test {
@@ -36,7 +42,7 @@ namespace libvpx_test {
 // A simple function to encapsulate cross platform retrieval of test data path
 static std::string GetDataPath() {
   const char *const data_path = getenv("LIBVPX_TEST_DATA_PATH");
-  if (data_path == NULL) {
+  if (data_path == nullptr) {
 #ifdef LIBVPX_TEST_DATA_PATH
     // In some environments, we cannot set environment variables
     // Instead, we set the data path by using a preprocessor symbol
@@ -58,7 +64,7 @@ inline FILE *OpenTestDataFile(const std::string &file_name) {
   return fopen(path_to_source.c_str(), "rb");
 }
 
-static FILE *GetTempOutFile(std::string *file_name) {
+static FILE *GetTempOutFile(std::string *file_name, const char *io_mode) {
   file_name->clear();
 #if defined(_WIN32)
   char fname[MAX_PATH];
@@ -67,18 +73,37 @@ static FILE *GetTempOutFile(std::string *file_name) {
     // Assume for now that the filename generated is unique per process
     if (GetTempFileNameA(tmppath, "lvx", 0, fname)) {
       file_name->assign(fname);
-      return fopen(fname, "wb+");
+      return fopen(fname, io_mode);
     }
   }
-  return NULL;
+  return nullptr;
 #else
-  return tmpfile();
+  std::string temp_dir = testing::TempDir();
+  if (temp_dir.empty()) return nullptr;
+  // Versions of testing::TempDir() prior to release-1.11.0-214-g5e6a5336 may
+  // use the value of an environment variable without checking for a trailing
+  // path delimiter.
+  if (temp_dir[temp_dir.size() - 1] != '/') temp_dir += '/';
+  const char name_template[] = "libvpxtest.XXXXXX";
+  std::unique_ptr<char[]> temp_file_name(
+      new char[temp_dir.size() + sizeof(name_template)]);
+  if (temp_file_name == nullptr) return nullptr;
+  memcpy(temp_file_name.get(), temp_dir.data(), temp_dir.size());
+  memcpy(temp_file_name.get() + temp_dir.size(), name_template,
+         sizeof(name_template));
+  const int fd = mkstemp(temp_file_name.get());
+  if (fd == -1) return nullptr;
+  *file_name = temp_file_name.get();
+  return fdopen(fd, io_mode);
 #endif
 }
 
 class TempOutFile {
  public:
-  TempOutFile() { file_ = GetTempOutFile(&file_name_); }
+  TempOutFile() { file_ = GetTempOutFile(&file_name_, "wb+"); }
+  TempOutFile(const char *io_mode) {
+    file_ = GetTempOutFile(&file_name_, io_mode);
+  }
   ~TempOutFile() {
     CloseFile();
     if (!file_name_.empty()) {
@@ -92,7 +117,7 @@ class TempOutFile {
   void CloseFile() {
     if (file_) {
       fclose(file_);
-      file_ = NULL;
+      file_ = nullptr;
     }
   }
   FILE *file_;
@@ -111,7 +136,7 @@ class VideoSource {
   // Advance the cursor to the next frame
   virtual void Next() = 0;
 
-  // Get the current video frame, or NULL on End-Of-Stream.
+  // Get the current video frame, or nullptr on End-Of-Stream.
   virtual vpx_image_t *img() const = 0;
 
   // Get the presentation timestamp of the current frame.
@@ -133,38 +158,40 @@ class VideoSource {
 class DummyVideoSource : public VideoSource {
  public:
   DummyVideoSource()
-      : img_(NULL), limit_(100), width_(80), height_(64),
+      : img_(nullptr), limit_(100), width_(80), height_(64),
         format_(VPX_IMG_FMT_I420) {
     ReallocImage();
   }
 
-  virtual ~DummyVideoSource() { vpx_img_free(img_); }
+  ~DummyVideoSource() override { vpx_img_free(img_); }
 
-  virtual void Begin() {
+  void Begin() override {
     frame_ = 0;
     FillFrame();
   }
 
-  virtual void Next() {
+  void Next() override {
     ++frame_;
     FillFrame();
   }
 
-  virtual vpx_image_t *img() const { return (frame_ < limit_) ? img_ : NULL; }
+  vpx_image_t *img() const override {
+    return (frame_ < limit_) ? img_ : nullptr;
+  }
 
   // Models a stream where Timebase = 1/FPS, so pts == frame.
-  virtual vpx_codec_pts_t pts() const { return frame_; }
+  vpx_codec_pts_t pts() const override { return frame_; }
 
-  virtual unsigned long duration() const { return 1; }
+  unsigned long duration() const override { return 1; }
 
-  virtual vpx_rational_t timebase() const {
+  vpx_rational_t timebase() const override {
     const vpx_rational_t t = { 1, 30 };
     return t;
   }
 
-  virtual unsigned int frame() const { return frame_; }
+  unsigned int frame() const override { return frame_; }
 
-  virtual unsigned int limit() const { return limit_; }
+  unsigned int limit() const override { return limit_; }
 
   void set_limit(unsigned int limit) { limit_ = limit; }
 
@@ -190,8 +217,9 @@ class DummyVideoSource : public VideoSource {
 
   void ReallocImage() {
     vpx_img_free(img_);
-    img_ = vpx_img_alloc(NULL, format_, width_, height_, 32);
-    raw_sz_ = ((img_->w + 31) & ~31) * img_->h * img_->bps / 8;
+    img_ = vpx_img_alloc(nullptr, format_, width_, height_, 32);
+    ASSERT_NE(img_, nullptr);
+    raw_sz_ = ((img_->w + 31) & ~31u) * img_->h * img_->bps / 8;
   }
 
   vpx_image_t *img_;
@@ -210,7 +238,7 @@ class RandomVideoSource : public DummyVideoSource {
 
  protected:
   // Reset the RNG to get a matching stream for the second pass
-  virtual void Begin() {
+  void Begin() override {
     frame_ = 0;
     rnd_.Reset(seed_);
     FillFrame();
@@ -218,7 +246,7 @@ class RandomVideoSource : public DummyVideoSource {
 
   // 15 frames of noise, followed by 15 static frames. Reset to 0 rather
   // than holding previous frames to encourage keyframes to be thrown.
-  virtual void FillFrame() {
+  void FillFrame() override {
     if (img_) {
       if (frame_ % 30 < 15) {
         for (size_t i = 0; i < raw_sz_; ++i) img_->img_data[i] = rnd_.Rand8();
diff --git a/test/vp8_datarate_test.cc b/test/vp8_datarate_test.cc
index dcd68a2d4c..aee27af66e 100644
--- a/test/vp8_datarate_test.cc
+++ b/test/vp8_datarate_test.cc
@@ -24,10 +24,10 @@ class DatarateTestLarge
  public:
   DatarateTestLarge() : EncoderTest(GET_PARAM(0)) {}
 
-  virtual ~DatarateTestLarge() {}
+  ~DatarateTestLarge() override = default;
 
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
     set_cpu_used_ = GET_PARAM(2);
@@ -47,8 +47,8 @@ class DatarateTestLarge
     use_roi_ = false;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_);
       encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
@@ -74,7 +74,7 @@ class DatarateTestLarge
     duration_ = 0;
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     // Time since last timestamp = duration.
     vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
 
@@ -121,7 +121,7 @@ class DatarateTestLarge
     ++frame_number_;
   }
 
-  virtual void EndPassHook(void) {
+  void EndPassHook() override {
     if (bits_total_) {
       const double file_size_in_kb = bits_total_ / 1000.;  // bits per kilobit
 
@@ -301,7 +301,7 @@ TEST_P(DatarateTestLarge, DropFramesMultiThreads) {
 
 class DatarateTestRealTime : public DatarateTestLarge {
  public:
-  virtual ~DatarateTestRealTime() {}
+  ~DatarateTestRealTime() override = default;
 };
 
 #if CONFIG_TEMPORAL_DENOISING
diff --git a/test/vp8_denoiser_sse2_test.cc b/test/vp8_denoiser_sse2_test.cc
index 0197f143f3..7fa867d8bb 100644
--- a/test/vp8_denoiser_sse2_test.cc
+++ b/test/vp8_denoiser_sse2_test.cc
@@ -30,17 +30,22 @@ namespace {
 const int kNumPixels = 16 * 16;
 class VP8DenoiserTest : public ::testing::TestWithParam<int> {
  public:
-  virtual ~VP8DenoiserTest() {}
+  ~VP8DenoiserTest() override = default;
 
-  virtual void SetUp() { increase_denoising_ = GetParam(); }
+  void SetUp() override { increase_denoising_ = GetParam(); }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
   int increase_denoising_;
 };
 
+// TODO(https://crbug.com/webm/1718): This test fails with gcc 8-10.
+#if defined(__GNUC__) && __GNUC__ >= 8
+TEST_P(VP8DenoiserTest, DISABLED_BitexactCheck) {
+#else
 TEST_P(VP8DenoiserTest, BitexactCheck) {
+#endif
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   const int count_test_block = 4000;
   const int stride = 16;
@@ -87,7 +92,7 @@ TEST_P(VP8DenoiserTest, BitexactCheck) {
     // Check bitexactness.
     for (int h = 0; h < 16; ++h) {
       for (int w = 0; w < 16; ++w) {
-        EXPECT_EQ(avg_block_c[h * stride + w], avg_block_sse2[h * stride + w]);
+        ASSERT_EQ(avg_block_c[h * stride + w], avg_block_sse2[h * stride + w]);
       }
     }
 
@@ -103,7 +108,7 @@ TEST_P(VP8DenoiserTest, BitexactCheck) {
     // Check bitexactness.
     for (int h = 0; h < 16; ++h) {
       for (int w = 0; w < 16; ++w) {
-        EXPECT_EQ(avg_block_c[h * stride + w], avg_block_sse2[h * stride + w]);
+        ASSERT_EQ(avg_block_c[h * stride + w], avg_block_sse2[h * stride + w]);
       }
     }
   }
diff --git a/test/vp8_fdct4x4_test.cc b/test/vp8_fdct4x4_test.cc
index d5ac253003..66d5c151c5 100644
--- a/test/vp8_fdct4x4_test.cc
+++ b/test/vp8_fdct4x4_test.cc
@@ -74,7 +74,7 @@ using libvpx_test::ACMRandom;
 
 class FdctTest : public ::testing::TestWithParam<FdctFunc> {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     fdct_func_ = GetParam();
     rnd_.Reset(ACMRandom::DeterministicSeed());
   }
@@ -148,7 +148,7 @@ TEST_P(FdctTest, SignBiasCheck) {
 
   EXPECT_EQ(true, bias_acceptable)
       << "Error: 4x4 FDCT has a sign bias > 10% for input range [-15, 15]";
-};
+}
 
 TEST_P(FdctTest, RoundTripErrorCheck) {
   int max_error = 0;
@@ -181,7 +181,7 @@ TEST_P(FdctTest, RoundTripErrorCheck) {
 
   EXPECT_GE(count_test_block, total_error)
       << "Error: FDCT/IDCT has average roundtrip error > 1 per block";
-};
+}
 
 INSTANTIATE_TEST_SUITE_P(C, FdctTest, ::testing::Values(vp8_short_fdct4x4_c));
 
@@ -203,4 +203,9 @@ INSTANTIATE_TEST_SUITE_P(MSA, FdctTest,
 INSTANTIATE_TEST_SUITE_P(MMI, FdctTest,
                          ::testing::Values(vp8_short_fdct4x4_mmi));
 #endif  // HAVE_MMI
+
+#if HAVE_LSX
+INSTANTIATE_TEST_SUITE_P(LSX, FdctTest,
+                         ::testing::Values(vp8_short_fdct4x4_lsx));
+#endif  // HAVE_LSX
 }  // namespace
diff --git a/test/vp8_fragments_test.cc b/test/vp8_fragments_test.cc
index 6e5baf229d..01b4c2120e 100644
--- a/test/vp8_fragments_test.cc
+++ b/test/vp8_fragments_test.cc
@@ -17,9 +17,9 @@ class VP8FragmentsTest : public ::libvpx_test::EncoderTest,
                          public ::testing::Test {
  protected:
   VP8FragmentsTest() : EncoderTest(&::libvpx_test::kVP8) {}
-  virtual ~VP8FragmentsTest() {}
+  ~VP8FragmentsTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     const unsigned long init_flags =  // NOLINT(runtime/int)
         VPX_CODEC_USE_OUTPUT_PARTITION;
     InitializeConfig();
diff --git a/test/vp8_ratectrl_rtc_test.cc b/test/vp8_ratectrl_rtc_test.cc
new file mode 100644
index 0000000000..50478f7635
--- /dev/null
+++ b/test/vp8_ratectrl_rtc_test.cc
@@ -0,0 +1,417 @@
+/*
+ *  Copyright (c) 2021 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <fstream>  // NOLINT
+#include <string>
+
+#include "./vpx_config.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/video_source.h"
+#include "vp8/vp8_ratectrl_rtc.h"
+#include "vpx/vpx_codec.h"
+#include "vpx_ports/bitops.h"
+
+namespace {
+
+struct Vp8RCTestVideo {
+  Vp8RCTestVideo() = default;
+  Vp8RCTestVideo(const char *name_, int width_, int height_,
+                 unsigned int frames_)
+      : name(name_), width(width_), height(height_), frames(frames_) {}
+
+  friend std::ostream &operator<<(std::ostream &os,
+                                  const Vp8RCTestVideo &video) {
+    os << video.name << " " << video.width << " " << video.height << " "
+       << video.frames;
+    return os;
+  }
+  const char *name;
+  int width;
+  int height;
+  unsigned int frames;
+};
+
+const Vp8RCTestVideo kVp8RCTestVectors[] = {
+  Vp8RCTestVideo("niklas_640_480_30.yuv", 640, 480, 470),
+  Vp8RCTestVideo("desktop_office1.1280_720-020.yuv", 1280, 720, 300),
+  Vp8RCTestVideo("hantro_collage_w352h288.yuv", 352, 288, 100),
+};
+
+class Vp8RcInterfaceTest
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<int, Vp8RCTestVideo> {
+ public:
+  Vp8RcInterfaceTest()
+      : EncoderTest(GET_PARAM(0)), key_interval_(3000), encoder_exit_(false),
+        frame_drop_thresh_(0) {}
+  ~Vp8RcInterfaceTest() override = default;
+
+ protected:
+  void SetUp() override {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+  }
+
+  // From error_resilience_test.cc
+  int SetFrameFlags(int frame_num, int num_temp_layers) {
+    int frame_flags = 0;
+    if (num_temp_layers == 2) {
+      if (frame_num % 2 == 0) {
+        // Layer 0: predict from L and ARF, update L.
+        frame_flags =
+            VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
+      } else {
+        // Layer 1: predict from L, G and ARF, and update G.
+        frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST |
+                      VP8_EFLAG_NO_UPD_ENTROPY;
+      }
+    } else if (num_temp_layers == 3) {
+      if (frame_num % 4 == 0) {
+        // Layer 0: predict from L, update L.
+        frame_flags = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
+                      VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF;
+      } else if ((frame_num - 2) % 4 == 0) {
+        // Layer 1: predict from L, G,  update G.
+        frame_flags =
+            VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_REF_ARF;
+      } else if ((frame_num - 1) % 2 == 0) {
+        // Layer 2: predict from L, G, ARF; update ARG.
+        frame_flags = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_LAST;
+      }
+    }
+    return frame_flags;
+  }
+
+  int SetLayerId(int frame_num, int num_temp_layers) {
+    int layer_id = 0;
+    if (num_temp_layers == 2) {
+      if (frame_num % 2 == 0) {
+        layer_id = 0;
+      } else {
+        layer_id = 1;
+      }
+    } else if (num_temp_layers == 3) {
+      if (frame_num % 4 == 0) {
+        layer_id = 0;
+      } else if ((frame_num - 2) % 4 == 0) {
+        layer_id = 1;
+      } else if ((frame_num - 1) % 2 == 0) {
+        layer_id = 2;
+      }
+    }
+    return layer_id;
+  }
+
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    if (rc_cfg_.ts_number_layers > 1) {
+      const int layer_id = SetLayerId(video->frame(), cfg_.ts_number_layers);
+      const int frame_flags =
+          SetFrameFlags(video->frame(), cfg_.ts_number_layers);
+      frame_params_.temporal_layer_id = layer_id;
+      if (video->frame() > 0) {
+        encoder->Control(VP8E_SET_TEMPORAL_LAYER_ID, layer_id);
+        encoder->Control(VP8E_SET_FRAME_FLAGS, frame_flags);
+      }
+    } else {
+      if (video->frame() == 0) {
+        encoder->Control(VP8E_SET_CPUUSED, -6);
+        encoder->Control(VP8E_SET_RTC_EXTERNAL_RATECTRL, 1);
+        encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000);
+        if (rc_cfg_.is_screen) {
+          encoder->Control(VP8E_SET_SCREEN_CONTENT_MODE, 1);
+        }
+      } else if (frame_params_.frame_type == libvpx::RcFrameType::kInterFrame) {
+        // Disable golden frame update.
+        frame_flags_ |= VP8_EFLAG_NO_UPD_GF;
+        frame_flags_ |= VP8_EFLAG_NO_UPD_ARF;
+      }
+    }
+    frame_params_.frame_type = video->frame() % key_interval_ == 0
+                                   ? libvpx::RcFrameType::kKeyFrame
+                                   : libvpx::RcFrameType::kInterFrame;
+    encoder_exit_ = video->frame() == test_video_.frames;
+  }
+
+  void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override {
+    if (encoder_exit_) {
+      return;
+    }
+    int qp;
+    encoder->Control(VP8E_GET_LAST_QUANTIZER, &qp);
+    if (rc_api_->ComputeQP(frame_params_) == libvpx::FrameDropDecision::kOk) {
+      ASSERT_EQ(rc_api_->GetQP(), qp);
+    } else {
+      num_drops_++;
+    }
+  }
+
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
+    rc_api_->PostEncodeUpdate(pkt->data.frame.sz);
+  }
+
+  void RunOneLayer() {
+    test_video_ = GET_PARAM(2);
+    target_bitrate_ = GET_PARAM(1);
+    SetConfig();
+    rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_);
+    ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_));
+
+    ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width,
+                                         test_video_.height, 30, 1, 0,
+                                         test_video_.frames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  void RunOneLayerScreen() {
+    test_video_ = GET_PARAM(2);
+    target_bitrate_ = GET_PARAM(1);
+    SetConfig();
+    rc_cfg_.is_screen = true;
+    rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_);
+    ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_));
+
+    ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width,
+                                         test_video_.height, 30, 1, 0,
+                                         test_video_.frames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  void RunOneLayerDropFrames() {
+    test_video_ = GET_PARAM(2);
+    target_bitrate_ = GET_PARAM(1);
+    frame_drop_thresh_ = 30;
+    num_drops_ = 0;
+    // Use lower target_bitrate and max_quantizer to trigger drops.
+    target_bitrate_ = target_bitrate_ >> 2;
+    SetConfig();
+    rc_cfg_.max_quantizer = 56;
+    cfg_.rc_max_quantizer = 56;
+    rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_);
+    ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_));
+
+    ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width,
+                                         test_video_.height, 30, 1, 0,
+                                         test_video_.frames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    // Check that some frames were dropped, otherwise test has no value.
+    ASSERT_GE(num_drops_, 1);
+  }
+
+  void RunPeriodicKey() {
+    test_video_ = GET_PARAM(2);
+    target_bitrate_ = GET_PARAM(1);
+    key_interval_ = 100;
+    frame_drop_thresh_ = 30;
+    SetConfig();
+    rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_);
+    ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_));
+
+    ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width,
+                                         test_video_.height, 30, 1, 0,
+                                         test_video_.frames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  void RunTemporalLayers2TL() {
+    test_video_ = GET_PARAM(2);
+    target_bitrate_ = GET_PARAM(1);
+    SetConfigTemporalLayers(2);
+    rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_);
+    ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_));
+
+    ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width,
+                                         test_video_.height, 30, 1, 0,
+                                         test_video_.frames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  void RunTemporalLayers3TL() {
+    test_video_ = GET_PARAM(2);
+    target_bitrate_ = GET_PARAM(1);
+    SetConfigTemporalLayers(3);
+    rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_);
+    ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_));
+
+    ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width,
+                                         test_video_.height, 30, 1, 0,
+                                         test_video_.frames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  void RunTemporalLayers3TLDropFrames() {
+    test_video_ = GET_PARAM(2);
+    target_bitrate_ = GET_PARAM(1);
+    frame_drop_thresh_ = 30;
+    num_drops_ = 0;
+    // Use lower target_bitrate and max_quantizer to trigger drops.
+    target_bitrate_ = target_bitrate_ >> 2;
+    SetConfigTemporalLayers(3);
+    rc_cfg_.max_quantizer = 56;
+    cfg_.rc_max_quantizer = 56;
+    rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_);
+    ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_));
+
+    ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width,
+                                         test_video_.height, 30, 1, 0,
+                                         test_video_.frames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    // Check that some frames were dropped, otherwise test has no value.
+    ASSERT_GE(num_drops_, 1);
+  }
+
+ private:
+  void SetConfig() {
+    rc_cfg_.width = test_video_.width;
+    rc_cfg_.height = test_video_.height;
+    rc_cfg_.max_quantizer = 60;
+    rc_cfg_.min_quantizer = 2;
+    rc_cfg_.target_bandwidth = target_bitrate_;
+    rc_cfg_.buf_initial_sz = 600;
+    rc_cfg_.buf_optimal_sz = 600;
+    rc_cfg_.buf_sz = target_bitrate_;
+    rc_cfg_.undershoot_pct = 50;
+    rc_cfg_.overshoot_pct = 50;
+    rc_cfg_.max_intra_bitrate_pct = 1000;
+    rc_cfg_.framerate = 30.0;
+    rc_cfg_.layer_target_bitrate[0] = target_bitrate_;
+    rc_cfg_.frame_drop_thresh = frame_drop_thresh_;
+
+    // Encoder settings for ground truth.
+    cfg_.g_w = test_video_.width;
+    cfg_.g_h = test_video_.height;
+    cfg_.rc_undershoot_pct = 50;
+    cfg_.rc_overshoot_pct = 50;
+    cfg_.rc_buf_initial_sz = 600;
+    cfg_.rc_buf_optimal_sz = 600;
+    cfg_.rc_buf_sz = target_bitrate_;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 2;
+    cfg_.rc_max_quantizer = 60;
+    cfg_.rc_end_usage = VPX_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 1;
+    cfg_.rc_target_bitrate = target_bitrate_;
+    cfg_.kf_min_dist = key_interval_;
+    cfg_.kf_max_dist = key_interval_;
+    cfg_.rc_dropframe_thresh = frame_drop_thresh_;
+  }
+
+  void SetConfigTemporalLayers(int temporal_layers) {
+    rc_cfg_.width = test_video_.width;
+    rc_cfg_.height = test_video_.height;
+    rc_cfg_.max_quantizer = 60;
+    rc_cfg_.min_quantizer = 2;
+    rc_cfg_.target_bandwidth = target_bitrate_;
+    rc_cfg_.buf_initial_sz = 600;
+    rc_cfg_.buf_optimal_sz = 600;
+    rc_cfg_.buf_sz = target_bitrate_;
+    rc_cfg_.undershoot_pct = 50;
+    rc_cfg_.overshoot_pct = 50;
+    rc_cfg_.max_intra_bitrate_pct = 1000;
+    rc_cfg_.framerate = 30.0;
+    rc_cfg_.frame_drop_thresh = frame_drop_thresh_;
+    if (temporal_layers == 2) {
+      rc_cfg_.layer_target_bitrate[0] = 60 * target_bitrate_ / 100;
+      rc_cfg_.layer_target_bitrate[1] = target_bitrate_;
+      rc_cfg_.ts_rate_decimator[0] = 2;
+      rc_cfg_.ts_rate_decimator[1] = 1;
+    } else if (temporal_layers == 3) {
+      rc_cfg_.layer_target_bitrate[0] = 40 * target_bitrate_ / 100;
+      rc_cfg_.layer_target_bitrate[1] = 60 * target_bitrate_ / 100;
+      rc_cfg_.layer_target_bitrate[2] = target_bitrate_;
+      rc_cfg_.ts_rate_decimator[0] = 4;
+      rc_cfg_.ts_rate_decimator[1] = 2;
+      rc_cfg_.ts_rate_decimator[2] = 1;
+    }
+
+    rc_cfg_.ts_number_layers = temporal_layers;
+
+    // Encoder settings for ground truth.
+    cfg_.g_w = test_video_.width;
+    cfg_.g_h = test_video_.height;
+    cfg_.rc_undershoot_pct = 50;
+    cfg_.rc_overshoot_pct = 50;
+    cfg_.rc_buf_initial_sz = 600;
+    cfg_.rc_buf_optimal_sz = 600;
+    cfg_.rc_buf_sz = target_bitrate_;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 2;
+    cfg_.rc_max_quantizer = 60;
+    cfg_.rc_end_usage = VPX_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 1;
+    cfg_.rc_target_bitrate = target_bitrate_;
+    cfg_.kf_min_dist = key_interval_;
+    cfg_.kf_max_dist = key_interval_;
+    cfg_.rc_dropframe_thresh = frame_drop_thresh_;
+    // 2 Temporal layers, no spatial layers, CBR mode.
+    cfg_.ss_number_layers = 1;
+    cfg_.ts_number_layers = temporal_layers;
+    if (temporal_layers == 2) {
+      cfg_.ts_rate_decimator[0] = 2;
+      cfg_.ts_rate_decimator[1] = 1;
+      cfg_.ts_periodicity = 2;
+      cfg_.ts_target_bitrate[0] = 60 * cfg_.rc_target_bitrate / 100;
+      cfg_.ts_target_bitrate[1] = cfg_.rc_target_bitrate;
+    } else if (temporal_layers == 3) {
+      cfg_.ts_rate_decimator[0] = 4;
+      cfg_.ts_rate_decimator[1] = 2;
+      cfg_.ts_rate_decimator[2] = 1;
+      cfg_.ts_periodicity = 4;
+      cfg_.ts_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
+      cfg_.ts_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
+      cfg_.ts_target_bitrate[2] = cfg_.rc_target_bitrate;
+    }
+  }
+
+  std::unique_ptr<libvpx::VP8RateControlRTC> rc_api_;
+  libvpx::VP8RateControlRtcConfig rc_cfg_;
+  int key_interval_;
+  int target_bitrate_;
+  Vp8RCTestVideo test_video_;
+  libvpx::VP8FrameParamsQpRTC frame_params_;
+  bool encoder_exit_;
+  int frame_drop_thresh_;
+  int num_drops_;
+};
+
+TEST_P(Vp8RcInterfaceTest, OneLayer) { RunOneLayer(); }
+
+TEST_P(Vp8RcInterfaceTest, OneLayerScreen) { RunOneLayerScreen(); }
+
+TEST_P(Vp8RcInterfaceTest, OneLayerDropFrames) { RunOneLayerDropFrames(); }
+
+TEST_P(Vp8RcInterfaceTest, OneLayerPeriodicKey) { RunPeriodicKey(); }
+
+TEST_P(Vp8RcInterfaceTest, TemporalLayers2TL) { RunTemporalLayers2TL(); }
+
+TEST_P(Vp8RcInterfaceTest, TemporalLayers3TL) { RunTemporalLayers3TL(); }
+
+TEST_P(Vp8RcInterfaceTest, TemporalLayers3TLDropFrames) {
+  RunTemporalLayers3TLDropFrames();
+}
+
+VP8_INSTANTIATE_TEST_SUITE(Vp8RcInterfaceTest,
+                           ::testing::Values(200, 400, 1000),
+                           ::testing::ValuesIn(kVp8RCTestVectors));
+
+}  // namespace
diff --git a/test/vp9_arf_freq_test.cc b/test/vp9_arf_freq_test.cc
index c7e6f1af02..3882326d2f 100644
--- a/test/vp9_arf_freq_test.cc
+++ b/test/vp9_arf_freq_test.cc
@@ -86,9 +86,9 @@ class ArfFreqTest
       : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
         test_encode_param_(GET_PARAM(2)), min_arf_requested_(GET_PARAM(3)) {}
 
-  virtual ~ArfFreqTest() {}
+  ~ArfFreqTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(test_encode_param_.mode);
     if (test_encode_param_.mode != ::libvpx_test::kRealTime) {
@@ -104,7 +104,7 @@ class ArfFreqTest
     dec_cfg_.threads = 4;
   }
 
-  virtual void BeginPassHook(unsigned int) {
+  void BeginPassHook(unsigned int) override {
     min_run_ = ARF_NOT_SEEN;
     run_of_visible_frames_ = 0;
   }
@@ -126,7 +126,7 @@ class ArfFreqTest
     return frames;
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return;
     const int frames = GetNumFramesInPkt(pkt);
     if (frames == 1) {
@@ -145,8 +145,8 @@ class ArfFreqTest
     }
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1);
       encoder->Control(VP9E_SET_TILE_COLUMNS, 4);
diff --git a/test/vp9_block_error_test.cc b/test/vp9_block_error_test.cc
index dc1ff49c68..0645341ac1 100644
--- a/test/vp9_block_error_test.cc
+++ b/test/vp9_block_error_test.cc
@@ -53,20 +53,21 @@ int64_t BlockError8BitWrapper(const tran_low_t *coeff,
 
 class BlockErrorTest : public ::testing::TestWithParam<BlockErrorParam> {
  public:
-  virtual ~BlockErrorTest() {}
-  virtual void SetUp() {
+  ~BlockErrorTest() override = default;
+  void SetUp() override {
     error_block_op_ = GET_PARAM(0);
     ref_error_block_op_ = GET_PARAM(1);
     bit_depth_ = GET_PARAM(2);
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
   vpx_bit_depth_t bit_depth_;
   HBDBlockErrorFunc error_block_op_;
   HBDBlockErrorFunc ref_error_block_op_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BlockErrorTest);
 
 TEST_P(BlockErrorTest, OperationCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -196,4 +197,22 @@ INSTANTIATE_TEST_SUITE_P(
                                  &BlockError8BitWrapper<vp9_block_error_c>,
                                  VPX_BITS_8)));
 #endif  // HAVE_AVX2
+
+#if HAVE_NEON
+const BlockErrorParam neon_block_error_tests[] = {
+#if CONFIG_VP9_HIGHBITDEPTH
+  make_tuple(&vp9_highbd_block_error_neon, &vp9_highbd_block_error_c,
+             VPX_BITS_10),
+  make_tuple(&vp9_highbd_block_error_neon, &vp9_highbd_block_error_c,
+             VPX_BITS_12),
+  make_tuple(&vp9_highbd_block_error_neon, &vp9_highbd_block_error_c,
+             VPX_BITS_8),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  make_tuple(&BlockError8BitWrapper<vp9_block_error_neon>,
+             &BlockError8BitWrapper<vp9_block_error_c>, VPX_BITS_8)
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, BlockErrorTest,
+                         ::testing::ValuesIn(neon_block_error_tests));
+#endif  // HAVE_NEON
 }  // namespace
diff --git a/test/vp9_boolcoder_test.cc b/test/vp9_boolcoder_test.cc
index 0cafa6730e..6ba171a000 100644
--- a/test/vp9_boolcoder_test.cc
+++ b/test/vp9_boolcoder_test.cc
@@ -74,7 +74,7 @@ TEST(VP9, TestBitIO) {
         GTEST_ASSERT_EQ(bw_buffer[0] & 0x80, 0);
 
         vpx_reader br;
-        vpx_reader_init(&br, bw_buffer, kBufferSize, NULL, NULL);
+        vpx_reader_init(&br, bw_buffer, kBufferSize, nullptr, nullptr);
         bit_rnd.Reset(random_seed);
         for (int i = 0; i < kBitsToTest; ++i) {
           if (bit_method == 2) {
diff --git a/test/vp9_c_vs_simd_encode.sh b/test/vp9_c_vs_simd_encode.sh
new file mode 100755
index 0000000000..03843610dc
--- /dev/null
+++ b/test/vp9_c_vs_simd_encode.sh
@@ -0,0 +1,420 @@
+#!/bin/sh
+##
+##  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+##  This script checks the bit exactness between C and SIMD
+##  implementations of VP9 encoder.
+##
+. $(dirname $0)/tools_common.sh
+
+TEST_BITRATES="1600 6400"
+PRESETS="good rt"
+TEST_CLIPS="yuv_raw_input y4m_360p_10bit_input yuv_480p_raw_input y4m_720p_input"
+OUT_FILE_SUFFIX=".ivf"
+SCRIPT_DIR=$(dirname "$0")
+LIBVPX_SOURCE_DIR=$(cd "${SCRIPT_DIR}/.."; pwd)
+
+# Clips used in test.
+YUV_RAW_INPUT="${LIBVPX_TEST_DATA_PATH}/hantro_collage_w352h288.yuv"
+YUV_480P_RAW_INPUT="${LIBVPX_TEST_DATA_PATH}/niklas_640_480_30.yuv"
+Y4M_360P_10BIT_INPUT="${LIBVPX_TEST_DATA_PATH}/crowd_run_360p_10_150f.y4m"
+Y4M_720P_INPUT="${LIBVPX_TEST_DATA_PATH}/niklas_1280_720_30.y4m"
+
+# Number of frames to test.
+VP9_ENCODE_C_VS_SIMD_TEST_FRAME_LIMIT=20
+
+# Create a temporary directory for output files.
+if [ -n "${TMPDIR}" ]; then
+  VPX_TEST_TEMP_ROOT="${TMPDIR}"
+elif [ -n "${TEMPDIR}" ]; then
+  VPX_TEST_TEMP_ROOT="${TEMPDIR}"
+else
+  VPX_TEST_TEMP_ROOT=/tmp
+fi
+
+VPX_TEST_OUTPUT_DIR="${VPX_TEST_TEMP_ROOT}/vpx_test_$$"
+
+if ! mkdir -p "${VPX_TEST_OUTPUT_DIR}" || \
+   [ ! -d "${VPX_TEST_OUTPUT_DIR}" ]; then
+  echo "${0##*/}: Cannot create output directory, giving up."
+  echo "${0##*/}:   VPX_TEST_OUTPUT_DIR=${VPX_TEST_OUTPUT_DIR}"
+  exit 1
+fi
+
+elog() {
+  echo "$@" 1>&2
+}
+
+# Echoes path to $1 when it's executable and exists in ${VPX_TEST_OUTPUT_DIR},
+# or an empty string. Caller is responsible for testing the string once the
+# function returns.
+vp9_enc_tool_path() {
+  local target="$1"
+  local tool_path="${VPX_TEST_OUTPUT_DIR}/build_target_${target}/vpxenc"
+
+  if [ ! -x "${tool_path}" ]; then
+    tool_path=""
+  fi
+  echo "${tool_path}"
+}
+
+# Environment check: Make sure input and source directories are available.
+vp9_c_vs_simd_enc_verify_environment() {
+  if [ ! -e "${YUV_RAW_INPUT}" ]; then
+    elog "libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+    return 1
+  fi
+  if [ ! -e "${YUV_480P_RAW_INPUT}" ]; then
+    elog "libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+    return 1
+  fi
+  if [ ! -e "${Y4M_720P_INPUT}" ]; then
+    elog "libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+    return 1
+  fi
+  if [ ! -e "${Y4M_360P_10BIT_INPUT}" ]; then
+    elog "libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+    return 1
+  fi
+  if [ ! -d "$LIBVPX_SOURCE_DIR" ]; then
+    elog "LIBVPX_SOURCE_DIR does not exist."
+    return 1
+  fi
+}
+
+# This is not needed since tools_common.sh does the same cleanup.
+# Keep the code here for our reference.
+# cleanup() {
+#   rm -rf  ${VPX_TEST_OUTPUT_DIR}
+# }
+
+# Echo VPX_SIMD_CAPS_MASK for different instruction set architecture.
+avx512f() {
+   echo "0x1FF"
+}
+
+avx2() {
+   echo "0x0FF"
+}
+
+sse4_1() {
+   echo "0x03F"
+}
+
+ssse3() {
+   echo "0x01F"
+}
+
+sse2() {
+   echo "0x007"
+}
+
+# Echo clip details to be used as input to vpxenc.
+yuv_raw_input() {
+  echo ""${YUV_RAW_INPUT}"
+       --width=352
+       --height=288
+       --bit-depth=8
+       --profile=0"
+}
+
+yuv_480p_raw_input() {
+  echo ""${YUV_480P_RAW_INPUT}"
+       --width=640
+       --height=480
+       --bit-depth=8
+       --profile=0"
+}
+
+y4m_720p_input() {
+  echo ""${Y4M_720P_INPUT}"
+       --bit-depth=8
+       --profile=0"
+}
+
+y4m_360p_10bit_input() {
+  echo ""${Y4M_360P_10BIT_INPUT}"
+       --bit-depth=10
+       --profile=2"
+}
+
+has_x86_isa_extn() {
+  instruction_set=$1
+  if ! grep -q "$instruction_set" /proc/cpuinfo; then
+    # This instruction_set is not supported.
+    return 1
+  fi
+  # This instruction_set is supported.
+  return 0
+}
+
+# Echo good encode params for use with VP9 encoder.
+vp9_encode_good_params() {
+  echo "--codec=vp9 \
+  --good \
+  --test-decode=fatal \
+  --ivf \
+  --threads=1 \
+  --static-thresh=0 \
+  --tile-columns=0 \
+  --end-usage=vbr \
+  --kf-max-dist=160 \
+  --kf-min-dist=0 \
+  --lag-in-frames=19 \
+  --max-q=63 \
+  --min-q=0 \
+  --passes=2 \
+  --undershoot-pct=100 \
+  --overshoot-pct=100 \
+  --verbose \
+  --auto-alt-ref=1 \
+  --drop-frame=0 \
+  --bias-pct=50 \
+  --minsection-pct=0 \
+  --maxsection-pct=2000 \
+  --arnr-maxframes=7 \
+  --arnr-strength=5 \
+  --sharpness=0 \
+  --frame-parallel=0"
+}
+
+# Echo realtime encode params for use with VP9 encoder.
+vp9_encode_rt_params() {
+  echo "--codec=vp9 \
+  --rt \
+  --test-decode=fatal \
+  --ivf \
+  --threads=1 \
+  --static-thresh=0 \
+  --tile-columns=0 \
+  --tile-rows=0 \
+  --end-usage=cbr \
+  --kf-max-dist=90000 \
+  --lag-in-frames=0 \
+  --max-q=58 \
+  --min-q=2 \
+  --passes=1 \
+  --undershoot-pct=50 \
+  --overshoot-pct=50 \
+  --verbose \
+  --row-mt=0 \
+  --buf-sz=1000 \
+  --buf-initial-sz=500 \
+  --buf-optimal-sz=600 \
+  --max-intra-rate=300 \
+  --resize-allowed=0 \
+  --noise-sensitivity=0 \
+  --aq-mode=3 \
+  --error-resilient=0"
+}
+
+# Configures for the given target in the
+# ${VPX_TEST_OUTPUT_DIR}/build_target_${target} directory.
+vp9_enc_build() {
+  local target=$1
+  local configure="$2"
+  local tmp_build_dir=${VPX_TEST_OUTPUT_DIR}/build_target_${target}
+  mkdir -p "$tmp_build_dir"
+  local save_dir="$PWD"
+  cd "$tmp_build_dir"
+
+  echo "Building target: ${target}"
+  local config_args="--disable-install-docs \
+             --enable-unit-tests \
+             --enable-debug \
+             --enable-postproc \
+             --enable-vp9-postproc \
+             --enable-vp9-temporal-denoising \
+             --enable-vp9-highbitdepth"
+
+  eval "$configure" --target="${target}" "${config_args}" ${devnull}
+  eval make -j$(nproc) ${devnull}
+  echo "Done building target: ${target}"
+  cd "${save_dir}"
+}
+
+compare_enc_output() {
+  local target=$1
+  local cpu=$2
+  local clip=$3
+  local bitrate=$4
+  local preset=$5
+  if ! diff -q ${VPX_TEST_OUTPUT_DIR}/Out-generic-gnu-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX} \
+       ${VPX_TEST_OUTPUT_DIR}/Out-${target}-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX}; then
+    elog "C vs ${target} encode mismatches for ${clip}, at ${bitrate} kbps, speed ${cpu}, ${preset} preset"
+    return 1
+  fi
+}
+
+vp9_enc_test() {
+  local encoder="$1"
+  local target=$2
+  if [ -z "$(vp9_enc_tool_path "${target}")" ]; then
+    elog "vpxenc not found. It must exist in ${VPX_TEST_OUTPUT_DIR}/build_target_${target} path"
+    return 1
+  fi
+
+  local tmp_build_dir=${VPX_TEST_OUTPUT_DIR}/build_target_${target}
+  local save_dir="$PWD"
+  cd "$tmp_build_dir"
+  for preset in ${PRESETS}; do
+    if [ "${preset}" = "good" ]; then
+      local max_cpu_used=5
+      local test_params=vp9_encode_good_params
+    elif [ "${preset}" = "rt" ]; then
+      local max_cpu_used=9
+      local test_params=vp9_encode_rt_params
+    else
+      elog "Invalid preset"
+      cd "${save_dir}"
+      return 1
+    fi
+
+    # Enable armv8 test for real-time only
+    if [ "${preset}" = "good" ] && [ "${target}" = "armv8-linux-gcc" ]; then
+      continue
+    fi
+
+    for cpu in $(seq 0 $max_cpu_used); do
+      for clip in ${TEST_CLIPS}; do
+        for bitrate in ${TEST_BITRATES}; do
+          eval "${encoder}" $($clip) $($test_params) \
+          "--limit=${VP9_ENCODE_C_VS_SIMD_TEST_FRAME_LIMIT}" \
+          "--cpu-used=${cpu}" "--target-bitrate=${bitrate}" "-o" \
+          ${VPX_TEST_OUTPUT_DIR}/Out-${target}-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX} \
+          ${devnull}
+
+          if [ "${target}" != "generic-gnu" ]; then
+            if ! compare_enc_output ${target} $cpu ${clip} $bitrate ${preset}; then
+              # Find the mismatch
+              cd "${save_dir}"
+              return 1
+            fi
+          fi
+        done
+      done
+    done
+  done
+  cd "${save_dir}"
+}
+
+vp9_test_generic() {
+  local configure="$LIBVPX_SOURCE_DIR/configure"
+  local target="generic-gnu"
+
+  echo "Build for: ${target}"
+  vp9_enc_build ${target} ${configure}
+  local encoder="$(vp9_enc_tool_path "${target}")"
+  vp9_enc_test $encoder "${target}"
+}
+
+# This function encodes VP9 bitstream by enabling SSE2, SSSE3, SSE4_1, AVX2, AVX512f as there are
+# no functions with MMX, SSE, SSE3 and AVX specialization.
+# The value of environment variable 'VPX_SIMD_CAPS' controls enabling of different instruction
+# set extension optimizations. The value of the flag 'VPX_SIMD_CAPS' and the corresponding
+# instruction set extension optimization enabled are as follows:
+# AVX512 AVX2 AVX SSE4_1 SSSE3 SSE3 SSE2 SSE MMX
+#   1     1    1    1      1    1    1    1   1  -> 0x1FF -> Enable AVX512 and lower variants
+#   0     1    1    1      1    1    1    1   1  -> 0x0FF -> Enable AVX2 and lower variants
+#   0     0    1    1      1    1    1    1   1  -> 0x07F -> Enable AVX and lower variants
+#   0     0    0    1      1    1    1    1   1  -> 0x03F  -> Enable SSE4_1 and lower variants
+#   0     0    0    0      1    1    1    1   1  -> 0x01F  -> Enable SSSE3 and lower variants
+#   0     0    0    0      0    1    1    1   1  -> 0x00F  -> Enable SSE3 and lower variants
+#   0     0    0    0      0    0    1    1   1  -> 0x007  -> Enable SSE2 and lower variants
+#   0     0    0    0      0    0    0    1   1  -> 0x003  -> Enable SSE and lower variants
+#   0     0    0    0      0    0    0    0   1  -> 0x001  -> Enable MMX
+## NOTE: In x86_64 platform, it is not possible to enable sse/mmx/c using "VPX_SIMD_CAPS_MASK" as
+#  all x86_64 platforms implement sse2.
+vp9_test_x86() {
+  local arch=$1
+
+  if ! uname -m | grep -q "x86"; then
+    elog "Machine architecture is not x86 or x86_64"
+    return 0
+  fi
+
+  if [ $arch = "x86" ]; then
+    local target="x86-linux-gcc"
+  elif [ $arch = "x86_64" ]; then
+    local target="x86_64-linux-gcc"
+  fi
+
+  local x86_isa_variants="avx512f avx2 sse4_1 ssse3 sse2"
+  local configure="$LIBVPX_SOURCE_DIR/configure"
+
+  echo "Build for x86: ${target}"
+  vp9_enc_build ${target} ${configure}
+  local encoder="$(vp9_enc_tool_path "${target}")"
+  for isa in $x86_isa_variants; do
+    # Note that if has_x86_isa_extn returns 1, it is false, and vice versa.
+    if ! has_x86_isa_extn $isa; then
+      echo "${isa} is not supported in this machine"
+      continue
+    fi
+    export VPX_SIMD_CAPS_MASK=$($isa)
+    if ! vp9_enc_test $encoder ${target}; then
+      # Find the mismatch
+      return 1
+    fi
+    unset VPX_SIMD_CAPS_MASK
+  done
+}
+
+vp9_test_arm() {
+  local target="armv8-linux-gcc"
+  local configure="CROSS=aarch64-linux-gnu- $LIBVPX_SOURCE_DIR/configure --extra-cflags=-march=armv8.4-a \
+          --extra-cxxflags=-march=armv8.4-a"
+  echo "Build for arm64: ${target}"
+  vp9_enc_build ${target} "${configure}"
+
+  local encoder="$(vp9_enc_tool_path "${target}")"
+  if ! vp9_enc_test "qemu-aarch64 -L /usr/aarch64-linux-gnu ${encoder}" ${target}; then
+    # Find the mismatch
+    return 1
+  fi
+}
+
+vp9_c_vs_simd_enc_test() {
+  # Test Generic
+  vp9_test_generic
+
+  # Test x86 (32 bit)
+  echo "vp9 test for x86 (32 bit): Started."
+  if ! vp9_test_x86 "x86"; then
+    echo "vp9 test for x86 (32 bit): Done, test failed."
+    return 1
+  else
+    echo "vp9 test for x86 (32 bit): Done, all tests passed."
+  fi
+
+  # Test x86_64 (64 bit)
+  if [ "$(eval uname -m)" = "x86_64" ]; then
+    echo "vp9 test for x86_64 (64 bit): Started."
+    if ! vp9_test_x86 "x86_64"; then
+      echo "vp9 test for x86_64 (64 bit): Done, test failed."
+      return 1
+    else
+      echo "vp9 test for x86_64 (64 bit): Done, all tests passed."
+    fi
+  fi
+
+  # Test ARM
+  echo "vp9_test_arm: Started."
+  if ! vp9_test_arm; then
+    echo "vp9 test for arm: Done, test failed."
+    return 1
+  else
+    echo "vp9 test for arm: Done, all tests passed."
+  fi
+}
+
+# Setup a trap function to clean up build, and output files after tests complete.
+# trap cleanup EXIT
+
+run_tests vp9_c_vs_simd_enc_verify_environment vp9_c_vs_simd_enc_test
diff --git a/test/vp9_datarate_test.cc b/test/vp9_datarate_test.cc
index 7e4d08f111..4bc9099206 100644
--- a/test/vp9_datarate_test.cc
+++ b/test/vp9_datarate_test.cc
@@ -9,6 +9,7 @@
  */
 #include "./vpx_config.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/i420_video_source.h"
@@ -27,7 +28,7 @@ class DatarateTestVP9 : public ::libvpx_test::EncoderTest {
   }
 
  protected:
-  virtual ~DatarateTestVP9() {}
+  ~DatarateTestVP9() override = default;
 
   virtual void ResetModel() {
     last_pts_ = 0;
@@ -112,8 +113,8 @@ class DatarateTestVP9 : public ::libvpx_test::EncoderTest {
     return layer_id;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
       encoder->Control(VP9E_SET_AQ_MODE, aq_mode_);
@@ -147,21 +148,23 @@ class DatarateTestVP9 : public ::libvpx_test::EncoderTest {
       if (video->frame() == 0) {
         encoder->Control(VP9E_SET_SVC, 1);
       }
-      vpx_svc_layer_id_t layer_id;
-      layer_id.spatial_layer_id = 0;
-      frame_flags_ = GetFrameFlags(video->frame(), cfg_.ts_number_layers);
-      layer_id.temporal_layer_id =
-          SetLayerId(video->frame(), cfg_.ts_number_layers);
-      layer_id.temporal_layer_id_per_spatial[0] =
-          SetLayerId(video->frame(), cfg_.ts_number_layers);
-      encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id);
+      if (cfg_.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
+        vpx_svc_layer_id_t layer_id;
+        frame_flags_ = GetFrameFlags(video->frame(), cfg_.ts_number_layers);
+        layer_id.spatial_layer_id = 0;
+        layer_id.temporal_layer_id =
+            SetLayerId(video->frame(), cfg_.ts_number_layers);
+        layer_id.temporal_layer_id_per_spatial[0] =
+            SetLayerId(video->frame(), cfg_.ts_number_layers);
+        encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id);
+      }
     }
     const vpx_rational_t tb = video->timebase();
     timebase_ = static_cast<double>(tb.num) / tb.den;
     duration_ = 0;
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     // Time since last timestamp = duration.
     vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
 
@@ -199,7 +202,7 @@ class DatarateTestVP9 : public ::libvpx_test::EncoderTest {
     ++tot_frame_number_;
   }
 
-  virtual void EndPassHook(void) {
+  void EndPassHook() override {
     for (int layer = 0; layer < static_cast<int>(cfg_.ts_number_layers);
          ++layer) {
       duration_ = (last_pts_ + 1) * timebase_;
@@ -240,7 +243,7 @@ class DatarateTestVP9RealTimeMultiBR
   DatarateTestVP9RealTimeMultiBR() : DatarateTestVP9(GET_PARAM(0)) {}
 
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(::libvpx_test::kRealTime);
     set_cpu_used_ = GET_PARAM(1);
@@ -256,7 +259,7 @@ class DatarateTestVP9LargeVBR
   DatarateTestVP9LargeVBR() : DatarateTestVP9(GET_PARAM(0)) {}
 
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(::libvpx_test::kRealTime);
     set_cpu_used_ = GET_PARAM(1);
@@ -576,10 +579,10 @@ class DatarateTestVP9RealTime : public DatarateTestVP9,
                                 public ::libvpx_test::CodecTestWithParam<int> {
  public:
   DatarateTestVP9RealTime() : DatarateTestVP9(GET_PARAM(0)) {}
-  virtual ~DatarateTestVP9RealTime() {}
+  ~DatarateTestVP9RealTime() override = default;
 
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(::libvpx_test::kRealTime);
     set_cpu_used_ = GET_PARAM(1);
@@ -701,7 +704,7 @@ TEST_P(DatarateTestVP9RealTime, RegionOfInterest) {
   // Use 2 states: 1 is center square, 0 is the rest.
   roi_.roi_map = reinterpret_cast<uint8_t *>(
       calloc(roi_.rows * roi_.cols, sizeof(*roi_.roi_map)));
-  ASSERT_TRUE(roi_.roi_map != NULL);
+  ASSERT_NE(roi_.roi_map, nullptr);
 
   for (unsigned int i = 0; i < roi_.rows; ++i) {
     for (unsigned int j = 0; j < roi_.cols; ++j) {
@@ -728,10 +731,10 @@ class DatarateTestVP9RealTimeDeltaQUV
       public ::libvpx_test::CodecTestWith2Params<int, int> {
  public:
   DatarateTestVP9RealTimeDeltaQUV() : DatarateTestVP9(GET_PARAM(0)) {}
-  virtual ~DatarateTestVP9RealTimeDeltaQUV() {}
+  ~DatarateTestVP9RealTimeDeltaQUV() override = default;
 
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(::libvpx_test::kRealTime);
     set_cpu_used_ = GET_PARAM(1);
@@ -776,7 +779,7 @@ class DatarateTestVP9PostEncodeDrop
   DatarateTestVP9PostEncodeDrop() : DatarateTestVP9(GET_PARAM(0)) {}
 
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(::libvpx_test::kRealTime);
     set_cpu_used_ = GET_PARAM(1);
@@ -809,11 +812,140 @@ TEST_P(DatarateTestVP9PostEncodeDrop, PostEncodeDropScreenContent) {
       << " The datarate for the file is greater than target by too much!";
 }
 
+using libvpx_test::ACMRandom;
+
+class DatarateTestVP9FrameQp
+    : public DatarateTestVP9,
+      public ::testing::TestWithParam<const libvpx_test::CodecFactory *> {
+ public:
+  DatarateTestVP9FrameQp() : DatarateTestVP9(GetParam()), frame_(0) {}
+  ~DatarateTestVP9FrameQp() override = default;
+
+ protected:
+  void SetUp() override {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    ResetModel();
+  }
+
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    set_cpu_used_ = 7;
+    DatarateTestVP9::PreEncodeFrameHook(video, encoder);
+    frame_qp_ = static_cast<int>(rnd_.RandRange(64));
+    encoder->Control(VP9E_SET_QUANTIZER_ONE_PASS, frame_qp_);
+    frame_++;
+  }
+
+  void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override {
+    int qp = 0;
+    vpx_svc_layer_id_t layer_id;
+    if (frame_ >= total_frame_) return;
+    encoder->Control(VP8E_GET_LAST_QUANTIZER_64, &qp);
+    ASSERT_EQ(frame_qp_, qp);
+    encoder->Control(VP9E_GET_SVC_LAYER_ID, &layer_id);
+    temporal_layer_id_ = layer_id.temporal_layer_id;
+  }
+
+  void MismatchHook(const vpx_image_t * /*img1*/,
+                    const vpx_image_t * /*img2*/) override {
+    if (frame_ >= total_frame_) return;
+    ASSERT_TRUE(cfg_.temporal_layering_mode ==
+                    VP9E_TEMPORAL_LAYERING_MODE_0212 &&
+                temporal_layer_id_ == 2);
+  }
+
+ protected:
+  int total_frame_;
+
+ private:
+  ACMRandom rnd_;
+  int frame_qp_;
+  int frame_;
+  int temporal_layer_id_;
+};
+
+TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+
+  total_frame_ = 400;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, total_frame_);
+  ResetModel();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayersBypass) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+
+  // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1).
+  cfg_.ss_number_layers = 1;
+  cfg_.ts_number_layers = 3;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+
+  cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
+  cfg_.rc_target_bitrate = 200;
+  total_frame_ = 400;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, total_frame_);
+  ResetModel();
+  cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
+  cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
+  cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayersFixedMode) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+
+  // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1).
+  cfg_.ss_number_layers = 1;
+  cfg_.ts_number_layers = 3;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+
+  cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_0212;
+  cfg_.rc_target_bitrate = 200;
+  cfg_.g_error_resilient = 1;
+  total_frame_ = 400;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, total_frame_);
+  ResetModel();
+  cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
+  cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
+  cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
 #if CONFIG_VP9_TEMPORAL_DENOISING
 // Params: speed setting.
 class DatarateTestVP9RealTimeDenoiser : public DatarateTestVP9RealTime {
  public:
-  virtual ~DatarateTestVP9RealTimeDenoiser() {}
+  ~DatarateTestVP9RealTimeDenoiser() override = default;
 };
 
 // Check basic datarate targeting, for a single bitrate, when denoiser is on.
@@ -943,6 +1075,13 @@ VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9LargeVBR, ::testing::Range(5, 9),
 
 VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9RealTime, ::testing::Range(5, 10));
 
+#if CONFIG_VP9
+INSTANTIATE_TEST_SUITE_P(
+    VP9, DatarateTestVP9FrameQp,
+    ::testing::Values(
+        static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP9)));
+#endif
+
 VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9RealTimeDeltaQUV,
                            ::testing::Range(5, 10),
                            ::testing::Values(-5, -10, -15));
diff --git a/test/vp9_denoiser_test.cc b/test/vp9_denoiser_test.cc
index 3d76edfaa2..831f83305c 100644
--- a/test/vp9_denoiser_test.cc
+++ b/test/vp9_denoiser_test.cc
@@ -42,15 +42,16 @@ class VP9DenoiserTest
     : public ::testing::Test,
       public ::testing::WithParamInterface<VP9DenoiserTestParam> {
  public:
-  virtual ~VP9DenoiserTest() {}
+  ~VP9DenoiserTest() override = default;
 
-  virtual void SetUp() { bs_ = GET_PARAM(1); }
+  void SetUp() override { bs_ = GET_PARAM(1); }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
   BLOCK_SIZE bs_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(VP9DenoiserTest);
 
 TEST_P(VP9DenoiserTest, BitexactCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
diff --git a/test/vp9_encoder_parms_get_to_decoder.cc b/test/vp9_encoder_parms_get_to_decoder.cc
index 5286d7a85f..0e182c76db 100644
--- a/test/vp9_encoder_parms_get_to_decoder.cc
+++ b/test/vp9_encoder_parms_get_to_decoder.cc
@@ -62,9 +62,9 @@ class VpxEncoderParmsGetToDecoder
   VpxEncoderParmsGetToDecoder()
       : EncoderTest(GET_PARAM(0)), encode_parms(GET_PARAM(1)) {}
 
-  virtual ~VpxEncoderParmsGetToDecoder() {}
+  ~VpxEncoderParmsGetToDecoder() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(::libvpx_test::kTwoPassGood);
     cfg_.g_lag_in_frames = 25;
@@ -74,8 +74,8 @@ class VpxEncoderParmsGetToDecoder
     cfg_.rc_target_bitrate = test_video_.bitrate;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP9E_SET_COLOR_SPACE, encode_parms.cs);
       encoder->Control(VP9E_SET_COLOR_RANGE, encode_parms.color_range);
@@ -95,9 +95,9 @@ class VpxEncoderParmsGetToDecoder
     }
   }
 
-  virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
-                                  const libvpx_test::VideoSource & /*video*/,
-                                  libvpx_test::Decoder *decoder) {
+  bool HandleDecodeResult(const vpx_codec_err_t res_dec,
+                          const libvpx_test::VideoSource & /*video*/,
+                          libvpx_test::Decoder *decoder) override {
     vpx_codec_ctx_t *const vp9_decoder = decoder->GetDecoder();
     vpx_codec_alg_priv_t *const priv =
         reinterpret_cast<vpx_codec_alg_priv_t *>(vp9_decoder->priv);
@@ -142,7 +142,7 @@ TEST_P(VpxEncoderParmsGetToDecoder, BitstreamParms) {
 
   std::unique_ptr<libvpx_test::VideoSource> video(
       new libvpx_test::Y4mVideoSource(test_video_.name, 0, test_video_.frames));
-  ASSERT_TRUE(video.get() != NULL);
+  ASSERT_NE(video.get(), nullptr);
 
   ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
 }
diff --git a/test/vp9_end_to_end_test.cc b/test/vp9_end_to_end_test.cc
index 315ce219ed..79be4ee146 100644
--- a/test/vp9_end_to_end_test.cc
+++ b/test/vp9_end_to_end_test.cc
@@ -31,7 +31,7 @@ const double kPsnrThreshold[][5] = {
   { 36.0, 37.0, 37.0, 37.0, 37.0 }, { 35.0, 36.0, 36.0, 36.0, 36.0 },
   { 34.0, 35.0, 35.0, 35.0, 35.0 }, { 33.0, 34.0, 34.0, 34.0, 34.0 },
   { 32.0, 33.0, 33.0, 33.0, 33.0 }, { 28.0, 32.0, 32.0, 32.0, 32.0 },
-  { 28.5, 31.0, 31.0, 31.0, 31.0 }, { 27.5, 30.0, 30.0, 30.0, 30.0 },
+  { 28.4, 31.0, 31.0, 31.0, 31.0 }, { 27.5, 30.0, 30.0, 30.0, 30.0 },
 };
 
 typedef struct {
@@ -89,9 +89,9 @@ class EndToEndTestAdaptiveRDThresh
       : EncoderTest(GET_PARAM(0)), cpu_used_start_(GET_PARAM(1)),
         cpu_used_end_(GET_PARAM(2)) {}
 
-  virtual ~EndToEndTestAdaptiveRDThresh() {}
+  ~EndToEndTestAdaptiveRDThresh() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(::libvpx_test::kRealTime);
     cfg_.g_lag_in_frames = 0;
@@ -102,8 +102,8 @@ class EndToEndTestAdaptiveRDThresh
     dec_cfg_.threads = 4;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, cpu_used_start_);
       encoder->Control(VP9E_SET_ROW_MT, 1);
@@ -131,9 +131,9 @@ class EndToEndTestLarge
     denoiser_on_ = 0;
   }
 
-  virtual ~EndToEndTestLarge() {}
+  ~EndToEndTestLarge() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(encoding_mode_);
     if (encoding_mode_ != ::libvpx_test::kRealTime) {
@@ -149,18 +149,18 @@ class EndToEndTestLarge
     dec_cfg_.threads = 4;
   }
 
-  virtual void BeginPassHook(unsigned int) {
+  void BeginPassHook(unsigned int) override {
     psnr_ = 0.0;
     nframes_ = 0;
   }
 
-  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override {
     psnr_ += pkt->data.psnr.psnr[0];
     nframes_++;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1);
       encoder->Control(VP9E_SET_TILE_COLUMNS, 4);
@@ -207,9 +207,9 @@ class EndToEndTestLoopFilterThreading
   EndToEndTestLoopFilterThreading()
       : EncoderTest(GET_PARAM(0)), use_loop_filter_opt_(GET_PARAM(1)) {}
 
-  virtual ~EndToEndTestLoopFilterThreading() {}
+  ~EndToEndTestLoopFilterThreading() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(::libvpx_test::kRealTime);
     cfg_.g_threads = 2;
@@ -221,16 +221,16 @@ class EndToEndTestLoopFilterThreading
     dec_cfg_.threads = GET_PARAM(2);
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, 8);
     }
     encoder->Control(VP9E_SET_TILE_COLUMNS, 4 - video->frame() % 5);
   }
 
-  virtual void PreDecodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Decoder *decoder) {
+  void PreDecodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Decoder *decoder) override {
     if (video->frame() == 0) {
       decoder->Control(VP9D_SET_LOOP_FILTER_OPT, use_loop_filter_opt_ ? 1 : 0);
     }
@@ -257,7 +257,7 @@ TEST_P(EndToEndNV12, EndtoEndNV12Test) {
   video.reset(new libvpx_test::YUVVideoSource(test_video_param_.filename,
                                               test_video_param_.fmt, 352, 288,
                                               30, 1, 0, 100));
-  ASSERT_TRUE(video.get() != NULL);
+  ASSERT_NE(video.get(), nullptr);
 
   ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
 }
@@ -280,7 +280,7 @@ TEST_P(EndToEndTestLarge, EndtoEndPSNRTest) {
         test_video_param_.filename, test_video_param_.fmt, kWidth, kHeight,
         kFramerate, 1, 0, kFrames));
   }
-  ASSERT_TRUE(video.get() != NULL);
+  ASSERT_NE(video.get(), nullptr);
 
   ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
   const double psnr = GetAveragePsnr();
@@ -307,7 +307,7 @@ TEST_P(EndToEndTestLarge, EndtoEndPSNRDenoiserAQTest) {
         test_video_param_.filename, test_video_param_.fmt, kWidth, kHeight,
         kFramerate, 1, 0, kFrames));
   }
-  ASSERT_TRUE(video.get() != NULL);
+  ASSERT_NE(video.get(), nullptr);
 
   ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
   const double psnr = GetAveragePsnr();
@@ -342,7 +342,7 @@ VP9_INSTANTIATE_TEST_SUITE(EndToEndTestLarge,
 VP9_INSTANTIATE_TEST_SUITE(EndToEndNV12,
                            ::testing::Values(::libvpx_test::kRealTime),
                            ::testing::ValuesIn(kTestVectorsNv12),
-                           ::testing::ValuesIn({ 6, 7, 8 }));
+                           ::testing::Values(6, 7, 8));
 
 VP9_INSTANTIATE_TEST_SUITE(EndToEndTestAdaptiveRDThresh,
                            ::testing::Values(5, 6, 7), ::testing::Values(8, 9));
diff --git a/test/vp9_ethread_test.cc b/test/vp9_ethread_test.cc
index 1041dd78c0..c8d3cba7fb 100644
--- a/test/vp9_ethread_test.cc
+++ b/test/vp9_ethread_test.cc
@@ -21,12 +21,12 @@
 namespace {
 // FIRSTPASS_STATS struct:
 // {
-//   25 double members;
+//   26 double members;
 //   1 int64_t member;
 // }
 // Whenever FIRSTPASS_STATS struct is modified, the following constants need to
 // be revisited.
-const int kDbl = 25;
+const int kDbl = 26;
 const int kInt = 1;
 const size_t kFirstPassStatsSz = kDbl * sizeof(double) + kInt * sizeof(int64_t);
 
@@ -41,12 +41,12 @@ class VPxFirstPassEncoderThreadTest
 
     row_mt_mode_ = 1;
     first_pass_only_ = true;
-    firstpass_stats_.buf = NULL;
+    firstpass_stats_.buf = nullptr;
     firstpass_stats_.sz = 0;
   }
-  virtual ~VPxFirstPassEncoderThreadTest() { free(firstpass_stats_.buf); }
+  ~VPxFirstPassEncoderThreadTest() override { free(firstpass_stats_.buf); }
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(encoding_mode_);
 
@@ -57,19 +57,19 @@ class VPxFirstPassEncoderThreadTest
     cfg_.rc_min_quantizer = 0;
   }
 
-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
     encoder_initialized_ = false;
     abort_ = false;
   }
 
-  virtual void EndPassHook() {
+  void EndPassHook() override {
     // For first pass stats test, only run first pass encoder.
     if (first_pass_only_ && cfg_.g_pass == VPX_RC_FIRST_PASS)
       abort_ |= first_pass_only_;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource * /*video*/,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource * /*video*/,
+                          ::libvpx_test::Encoder *encoder) override {
     if (!encoder_initialized_) {
       // Encode in 2-pass mode.
       encoder->Control(VP9E_SET_TILE_COLUMNS, tiles_);
@@ -87,7 +87,7 @@ class VPxFirstPassEncoderThreadTest
     }
   }
 
-  virtual void StatsPktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void StatsPktHook(const vpx_codec_cx_pkt_t *pkt) override {
     const uint8_t *const pkt_buf =
         reinterpret_cast<uint8_t *>(pkt->data.twopass_stats.buf);
     const size_t pkt_size = pkt->data.twopass_stats.sz;
@@ -98,6 +98,7 @@ class VPxFirstPassEncoderThreadTest
 
     firstpass_stats_.buf =
         realloc(firstpass_stats_.buf, firstpass_stats_.sz + pkt_size);
+    ASSERT_NE(firstpass_stats_.buf, nullptr);
     memcpy((uint8_t *)firstpass_stats_.buf + firstpass_stats_.sz, pkt_buf,
            pkt_size);
     firstpass_stats_.sz += pkt_size;
@@ -184,7 +185,7 @@ TEST_P(VPxFirstPassEncoderThreadTest, FirstPassStatsTest) {
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 
   // Compare to check if using or not using row-mt generates close stats.
-  ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 1000.0));
+  ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 400.0));
 
   // Test single thread vs multiple threads
   row_mt_mode_ = 1;
@@ -198,7 +199,7 @@ TEST_P(VPxFirstPassEncoderThreadTest, FirstPassStatsTest) {
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 
   // Compare to check if single-thread and multi-thread stats are close enough.
-  ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 1000.0));
+  ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 400.0));
 
   // Bit exact test in row_mt mode.
   // When row_mt_mode_=1 and using >1 threads, the encoder generates bit exact
@@ -232,9 +233,9 @@ class VPxEncoderThreadTest
     psnr_ = 0.0;
     nframes_ = 0;
   }
-  virtual ~VPxEncoderThreadTest() {}
+  ~VPxEncoderThreadTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(encoding_mode_);
 
@@ -251,14 +252,14 @@ class VPxEncoderThreadTest
     cfg_.rc_min_quantizer = 0;
   }
 
-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
     encoder_initialized_ = false;
     psnr_ = 0.0;
     nframes_ = 0;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource * /*video*/,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource * /*video*/,
+                          ::libvpx_test::Encoder *encoder) override {
     if (!encoder_initialized_) {
       // Encode 4 column tiles.
       encoder->Control(VP9E_SET_TILE_COLUMNS, tiles_);
@@ -279,21 +280,21 @@ class VPxEncoderThreadTest
     }
   }
 
-  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override {
     psnr_ += pkt->data.psnr.psnr[0];
     nframes_++;
   }
 
-  virtual void DecompressedFrameHook(const vpx_image_t &img,
-                                     vpx_codec_pts_t /*pts*/) {
+  void DecompressedFrameHook(const vpx_image_t &img,
+                             vpx_codec_pts_t /*pts*/) override {
     ::libvpx_test::MD5 md5_res;
     md5_res.Add(&img);
     md5_.push_back(md5_res.Get());
   }
 
-  virtual bool HandleDecodeResult(const vpx_codec_err_t res,
-                                  const libvpx_test::VideoSource & /*video*/,
-                                  libvpx_test::Decoder * /*decoder*/) {
+  bool HandleDecodeResult(const vpx_codec_err_t res,
+                          const libvpx_test::VideoSource & /*video*/,
+                          libvpx_test::Decoder * /*decoder*/) override {
     if (res != VPX_CODEC_OK) {
       EXPECT_EQ(VPX_CODEC_OK, res);
       return false;
diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc
new file mode 100644
index 0000000000..33fa05c65c
--- /dev/null
+++ b/test/vp9_ext_ratectrl_test.cc
@@ -0,0 +1,979 @@
+/*
+ *  Copyright (c) 2020 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cstdint>
+#include <new>
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/yuv_video_source.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "vp9/simple_encode.h"
+#include "vpx/vpx_ext_ratectrl.h"
+#include "vpx/vpx_tpl.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+namespace {
+
+constexpr int kModelMagicNumber = 51396;
+constexpr uintptr_t PrivMagicNumber = 5566;
+constexpr int kFrameNum = 5;
+constexpr int kFrameNumGOP = 30;
+constexpr int kFrameNumGOPShort = 4;
+constexpr int kLosslessCodingIndex = 2;
+constexpr int kFixedGOPSize = 9;
+// The range check in vp9_cx_iface.c shows that the max
+// lag in buffer is MAX_LAG_BUFFERS (25):
+// RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS);
+constexpr int kMaxLagInFrames = 25;
+constexpr int kDefaultMinGfInterval = 4;
+constexpr int kDefaultMaxGfInterval = 16;
+// The active gf interval might change for each GOP
+// See function "get_active_gf_inverval_range".
+// The numbers below are from manual inspection.
+constexpr int kReadMinGfInterval = 5;
+constexpr int kReadMaxGfInterval = 13;
+const char kTestFileName[] = "bus_352x288_420_f20_b8.yuv";
+const double kPsnrThreshold = 30.4;
+
+struct ToyRateCtrl {
+  int magic_number;
+  int coding_index;
+
+  int gop_global_index;
+  int frames_since_key;
+  int show_index;
+};
+
+vpx_rc_status_t rc_create_model(void *priv,
+                                const vpx_rc_config_t *ratectrl_config,
+                                vpx_rc_model_t *rate_ctrl_model_ptr) {
+  ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl;
+  if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR;
+  toy_rate_ctrl->magic_number = kModelMagicNumber;
+  toy_rate_ctrl->coding_index = -1;
+  *rate_ctrl_model_ptr = toy_rate_ctrl;
+  EXPECT_EQ(priv, reinterpret_cast<void *>(PrivMagicNumber));
+  EXPECT_EQ(ratectrl_config->frame_width, 352);
+  EXPECT_EQ(ratectrl_config->frame_height, 288);
+  EXPECT_EQ(ratectrl_config->show_frame_count, kFrameNum);
+  EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 24000);
+  EXPECT_EQ(ratectrl_config->frame_rate_num, 30);
+  EXPECT_EQ(ratectrl_config->frame_rate_den, 1);
+  return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_create_model_gop(void *priv,
+                                    const vpx_rc_config_t *ratectrl_config,
+                                    vpx_rc_model_t *rate_ctrl_model_ptr) {
+  ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl;
+  if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR;
+  toy_rate_ctrl->magic_number = kModelMagicNumber;
+  toy_rate_ctrl->gop_global_index = 0;
+  toy_rate_ctrl->frames_since_key = 0;
+  toy_rate_ctrl->show_index = 0;
+  toy_rate_ctrl->coding_index = 0;
+  *rate_ctrl_model_ptr = toy_rate_ctrl;
+  EXPECT_EQ(priv, reinterpret_cast<void *>(PrivMagicNumber));
+  EXPECT_EQ(ratectrl_config->frame_width, 640);
+  EXPECT_EQ(ratectrl_config->frame_height, 360);
+  EXPECT_EQ(ratectrl_config->show_frame_count, kFrameNumGOP);
+  EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 4000);
+  EXPECT_EQ(ratectrl_config->frame_rate_num, 30);
+  EXPECT_EQ(ratectrl_config->frame_rate_den, 1);
+  return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_create_model_gop_short(
+    void *priv, const vpx_rc_config_t *ratectrl_config,
+    vpx_rc_model_t *rate_ctrl_model_ptr) {
+  ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl;
+  if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR;
+  toy_rate_ctrl->magic_number = kModelMagicNumber;
+  toy_rate_ctrl->gop_global_index = 0;
+  toy_rate_ctrl->frames_since_key = 0;
+  toy_rate_ctrl->show_index = 0;
+  toy_rate_ctrl->coding_index = 0;
+  *rate_ctrl_model_ptr = toy_rate_ctrl;
+  EXPECT_EQ(priv, reinterpret_cast<void *>(PrivMagicNumber));
+  EXPECT_EQ(ratectrl_config->frame_width, 352);
+  EXPECT_EQ(ratectrl_config->frame_height, 288);
+  EXPECT_EQ(ratectrl_config->show_frame_count, kFrameNumGOPShort);
+  EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 500);
+  EXPECT_EQ(ratectrl_config->frame_rate_num, 30);
+  EXPECT_EQ(ratectrl_config->frame_rate_den, 1);
+  return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_send_firstpass_stats(
+    vpx_rc_model_t rate_ctrl_model,
+    const vpx_rc_firstpass_stats_t *first_pass_stats) {
+  const ToyRateCtrl *toy_rate_ctrl =
+      static_cast<ToyRateCtrl *>(rate_ctrl_model);
+  EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+  EXPECT_EQ(first_pass_stats->num_frames, kFrameNum);
+  for (int i = 0; i < first_pass_stats->num_frames; ++i) {
+    EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i);
+  }
+  return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_send_firstpass_stats_gop(
+    vpx_rc_model_t rate_ctrl_model,
+    const vpx_rc_firstpass_stats_t *first_pass_stats) {
+  const ToyRateCtrl *toy_rate_ctrl =
+      static_cast<ToyRateCtrl *>(rate_ctrl_model);
+  EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+  EXPECT_EQ(first_pass_stats->num_frames, kFrameNumGOP);
+  for (int i = 0; i < first_pass_stats->num_frames; ++i) {
+    EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i);
+  }
+  return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_send_firstpass_stats_gop_short(
+    vpx_rc_model_t rate_ctrl_model,
+    const vpx_rc_firstpass_stats_t *first_pass_stats) {
+  const ToyRateCtrl *toy_rate_ctrl =
+      static_cast<ToyRateCtrl *>(rate_ctrl_model);
+  EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+  EXPECT_EQ(first_pass_stats->num_frames, kFrameNumGOPShort);
+  for (int i = 0; i < first_pass_stats->num_frames; ++i) {
+    EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i);
+  }
+  return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_send_tpl_gop_stats(vpx_rc_model_t rate_ctrl_model,
+                                      const VpxTplGopStats *tpl_gop_stats) {
+  const ToyRateCtrl *toy_rate_ctrl =
+      static_cast<ToyRateCtrl *>(rate_ctrl_model);
+  EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+  EXPECT_GT(tpl_gop_stats->size, 0);
+
+  for (int i = 0; i < tpl_gop_stats->size; ++i) {
+    EXPECT_GT(tpl_gop_stats->frame_stats_list[i].num_blocks, 0);
+  }
+  return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_get_encodeframe_decision(
+    vpx_rc_model_t rate_ctrl_model,
+    const vpx_rc_encodeframe_info_t *encode_frame_info,
+    vpx_rc_encodeframe_decision_t *frame_decision) {
+  ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
+  toy_rate_ctrl->coding_index += 1;
+
+  EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+
+  EXPECT_LT(encode_frame_info->show_index, kFrameNum);
+  EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
+
+  if (encode_frame_info->coding_index == 0) {
+    EXPECT_EQ(encode_frame_info->show_index, 0);
+    EXPECT_EQ(encode_frame_info->gop_index, 0);
+    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
+    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
+              0);  // kRefFrameTypeLast
+    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
+              0);  // kRefFrameTypePast
+    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
+              0);  // kRefFrameTypeFuture
+  } else if (encode_frame_info->coding_index == 1) {
+    EXPECT_EQ(encode_frame_info->show_index, 4);
+    EXPECT_EQ(encode_frame_info->gop_index, 1);
+    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef);
+    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
+              1);  // kRefFrameTypeLast
+    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
+              0);  // kRefFrameTypePast
+    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
+              0);  // kRefFrameTypeFuture
+    EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
+              0);  // kRefFrameTypeLast
+  } else if (encode_frame_info->coding_index >= 2 &&
+             encode_frame_info->coding_index < 5) {
+    // In the first group of pictures, coding_index and gop_index are equal.
+    EXPECT_EQ(encode_frame_info->gop_index, encode_frame_info->coding_index);
+    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
+  } else if (encode_frame_info->coding_index == 5) {
+    EXPECT_EQ(encode_frame_info->show_index, 4);
+    EXPECT_EQ(encode_frame_info->gop_index, 0);
+    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay);
+    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
+              1);  // kRefFrameTypeLast
+    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
+              1);  // kRefFrameTypePast
+    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
+              1);  // kRefFrameTypeFuture
+    EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
+              4);  // kRefFrameTypeLast
+    EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[1],
+              0);  // kRefFrameTypePast
+    EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[2],
+              1);  // kRefFrameTypeFuture
+  }
+  if (encode_frame_info->coding_index == kLosslessCodingIndex) {
+    // We should get sse == 0 at rc_update_encodeframe_result()
+    frame_decision->q_index = 0;
+  } else {
+    frame_decision->q_index = 100;
+  }
+  frame_decision->max_frame_size = 0;
+  return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_get_encodeframe_decision_gop(
+    vpx_rc_model_t rate_ctrl_model,
+    const vpx_rc_encodeframe_info_t *encode_frame_info,
+    vpx_rc_encodeframe_decision_t *frame_decision) {
+  ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
+  EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+  EXPECT_LT(encode_frame_info->show_index, kFrameNumGOP);
+  EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
+
+  if (encode_frame_info->coding_index == 0) {
+    EXPECT_EQ(encode_frame_info->show_index, 0);
+    EXPECT_EQ(encode_frame_info->gop_index, 0);
+    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
+    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
+              0);  // kRefFrameTypeLast
+    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
+              0);  // kRefFrameTypePast
+    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
+              0);  // kRefFrameTypeFuture
+  } else if (encode_frame_info->coding_index == 1) {
+    EXPECT_EQ(encode_frame_info->show_index, 1);
+    EXPECT_EQ(encode_frame_info->gop_index, 1);
+    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
+    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
+              1);  // kRefFrameTypeLast
+    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
+              0);  // kRefFrameTypePast
+    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
+              0);  // kRefFrameTypeFuture
+    EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
+              0);  // kRefFrameTypeLast
+  } else if (encode_frame_info->coding_index == 2) {
+    EXPECT_EQ(encode_frame_info->show_index, 2);
+    EXPECT_EQ(encode_frame_info->gop_index, 0);
+    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
+    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
+              0);  // kRefFrameTypeLast
+    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
+              0);  // kRefFrameTypePast
+    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
+              0);  // kRefFrameTypeFuture
+  } else if (encode_frame_info->coding_index == 3 ||
+             encode_frame_info->coding_index == 12 ||
+             encode_frame_info->coding_index == 21) {
+    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef);
+    EXPECT_EQ(encode_frame_info->gop_index, 1);
+  } else if (encode_frame_info->coding_index == 11 ||
+             encode_frame_info->coding_index == 20 ||
+             encode_frame_info->coding_index == 29) {
+    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay);
+    EXPECT_EQ(encode_frame_info->gop_index, 0);
+  } else if (encode_frame_info->coding_index >= 30) {
+    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
+  }
+
+  // When the model recommends an invalid q, valid range [0, 255],
+  // the encoder will ignore it and use the default q selected
+  // by libvpx rate control strategy.
+  frame_decision->q_index = VPX_DEFAULT_Q;
+  frame_decision->max_frame_size = 0;
+
+  toy_rate_ctrl->coding_index += 1;
+  return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_get_encodeframe_decision_gop_short(
+    vpx_rc_model_t rate_ctrl_model,
+    const vpx_rc_encodeframe_info_t *encode_frame_info,
+    vpx_rc_encodeframe_decision_t *frame_decision) {
+  ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
+  EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+  EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort);
+  EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
+
+  if (encode_frame_info->coding_index == 0) {
+    EXPECT_EQ(encode_frame_info->show_index, 0);
+    EXPECT_EQ(encode_frame_info->gop_index, 0);
+    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
+    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
+              0);  // kRefFrameTypeLast
+    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
+              0);  // kRefFrameTypePast
+    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
+              0);  // kRefFrameTypeFuture
+    EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
+  } else if (encode_frame_info->coding_index == 1) {
+    EXPECT_EQ(encode_frame_info->show_index, 1);
+    EXPECT_EQ(encode_frame_info->gop_index, 1);
+    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
+    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
+              1);  // kRefFrameTypeLast
+    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
+              0);  // kRefFrameTypePast
+    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
+              0);  // kRefFrameTypeFuture
+    EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
+              0);  // kRefFrameTypeLast
+    EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
+  } else if (encode_frame_info->coding_index == 2) {
+    EXPECT_EQ(encode_frame_info->show_index, 2);
+    EXPECT_EQ(encode_frame_info->gop_index, 2);
+    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
+    EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
+  } else if (encode_frame_info->coding_index == 3) {
+    EXPECT_EQ(encode_frame_info->show_index, 3);
+    EXPECT_EQ(encode_frame_info->gop_index, 0);
+    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeGolden);
+    EXPECT_EQ(toy_rate_ctrl->gop_global_index, 2);
+  }
+
+  // When the model recommends an invalid q, valid range [0, 255],
+  // the encoder will ignore it and use the default q selected
+  // by libvpx rate control strategy.
+  frame_decision->q_index = VPX_DEFAULT_Q;
+  frame_decision->max_frame_size = 0;
+
+  toy_rate_ctrl->coding_index += 1;
+  return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_get_encodeframe_decision_gop_short_overlay(
+    vpx_rc_model_t rate_ctrl_model,
+    const vpx_rc_encodeframe_info_t *encode_frame_info,
+    vpx_rc_encodeframe_decision_t *frame_decision) {
+  ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
+  EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+  EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort);
+  EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
+
+  if (encode_frame_info->coding_index == 0) {
+    EXPECT_EQ(encode_frame_info->show_index, 0);
+    EXPECT_EQ(encode_frame_info->gop_index, 0);
+    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
+    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
+              0);  // kRefFrameTypeLast
+    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
+              0);  // kRefFrameTypePast
+    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
+              0);  // kRefFrameTypeFuture
+    EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
+  } else if (encode_frame_info->coding_index == 1) {
+    EXPECT_EQ(encode_frame_info->show_index, 3);
+    EXPECT_EQ(encode_frame_info->gop_index, 1);
+    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef);
+    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
+              1);  // kRefFrameTypeLast
+    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
+              0);  // kRefFrameTypePast
+    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
+              0);  // kRefFrameTypeFuture
+    EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
+              0);  // kRefFrameTypeLast
+    EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
+  } else if (encode_frame_info->coding_index == 2) {
+    EXPECT_EQ(encode_frame_info->show_index, 1);
+    EXPECT_EQ(encode_frame_info->gop_index, 2);
+    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
+    EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
+  } else if (encode_frame_info->coding_index == 3) {
+    EXPECT_EQ(encode_frame_info->show_index, 2);
+    EXPECT_EQ(encode_frame_info->gop_index, 3);
+    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
+    EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
+  } else if (encode_frame_info->coding_index == 4) {
+    EXPECT_EQ(encode_frame_info->show_index, 3);
+    EXPECT_EQ(encode_frame_info->gop_index, 0);
+    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay);
+    EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
+  }
+
+  // When the model recommends an invalid q, valid range [0, 255],
+  // the encoder will ignore it and use the default q selected
+  // by libvpx rate control strategy.
+  frame_decision->q_index = VPX_DEFAULT_Q;
+  frame_decision->max_frame_size = 0;
+
+  toy_rate_ctrl->coding_index += 1;
+  return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_get_encodeframe_decision_gop_short_no_arf(
+    vpx_rc_model_t rate_ctrl_model,
+    const vpx_rc_encodeframe_info_t *encode_frame_info,
+    vpx_rc_encodeframe_decision_t *frame_decision) {
+  ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
+  EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+  EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort);
+  EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
+
+  if (encode_frame_info->coding_index == 0) {
+    EXPECT_EQ(encode_frame_info->show_index, 0);
+    EXPECT_EQ(encode_frame_info->gop_index, 0);
+    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
+    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
+              0);  // kRefFrameTypeLast
+    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
+              0);  // kRefFrameTypePast
+    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
+              0);  // kRefFrameTypeFuture
+    EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
+  } else if (encode_frame_info->coding_index == 1) {
+    EXPECT_EQ(encode_frame_info->show_index, 1);
+    EXPECT_EQ(encode_frame_info->gop_index, 1);
+    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
+    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
+              1);  // kRefFrameTypeLast
+    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
+              0);  // kRefFrameTypePast
+    EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
+              0);  // kRefFrameTypeFuture
+    EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
+              0);  // kRefFrameTypeLast
+    EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
+  } else if (encode_frame_info->coding_index == 2) {
+    EXPECT_EQ(encode_frame_info->show_index, 2);
+    EXPECT_EQ(encode_frame_info->gop_index, 2);
+    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
+    EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
+  } else if (encode_frame_info->coding_index == 3) {
+    EXPECT_EQ(encode_frame_info->show_index, 3);
+    EXPECT_EQ(encode_frame_info->gop_index, 3);
+    EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
+    EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
+  }
+
+  // When the model recommends an invalid q, valid range [0, 255],
+  // the encoder will ignore it and use the default q selected
+  // by libvpx rate control strategy.
+  frame_decision->q_index = VPX_DEFAULT_Q;
+  frame_decision->max_frame_size = 0;
+
+  toy_rate_ctrl->coding_index += 1;
+  return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_get_gop_decision(vpx_rc_model_t rate_ctrl_model,
+                                    const vpx_rc_gop_info_t *gop_info,
+                                    vpx_rc_gop_decision_t *gop_decision) {
+  ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
+  EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+  EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames);
+  EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval);
+  EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval);
+  EXPECT_EQ(gop_info->active_min_gf_interval, kReadMinGfInterval);
+  EXPECT_EQ(gop_info->active_max_gf_interval, kReadMaxGfInterval);
+  EXPECT_EQ(gop_info->allow_alt_ref, 1);
+  if (gop_info->is_key_frame) {
+    EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
+    EXPECT_EQ(gop_info->frames_since_key, 0);
+    EXPECT_EQ(gop_info->gop_global_index, 0);
+    toy_rate_ctrl->gop_global_index = 0;
+    toy_rate_ctrl->frames_since_key = 0;
+  } else {
+    EXPECT_EQ(gop_info->last_gop_use_alt_ref, 1);
+  }
+  EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index);
+  EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key);
+  EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index);
+  EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index);
+
+  gop_decision->gop_coding_frames =
+      VPXMIN(kFixedGOPSize, gop_info->frames_to_key);
+  gop_decision->use_alt_ref = gop_decision->gop_coding_frames == kFixedGOPSize;
+  toy_rate_ctrl->frames_since_key +=
+      gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
+  toy_rate_ctrl->show_index +=
+      gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
+  ++toy_rate_ctrl->gop_global_index;
+  return VPX_RC_OK;
+}
+
+// Test on a 4 frame video.
+// Test a setting of 2 GOPs.
+// The first GOP has 3 coding frames, no alt ref.
+// The second GOP has 1 coding frame, no alt ref.
+vpx_rc_status_t rc_get_gop_decision_short(vpx_rc_model_t rate_ctrl_model,
+                                          const vpx_rc_gop_info_t *gop_info,
+                                          vpx_rc_gop_decision_t *gop_decision) {
+  ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
+  EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+  EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1);
+  EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval);
+  EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval);
+  EXPECT_EQ(gop_info->allow_alt_ref, 1);
+  if (gop_info->is_key_frame) {
+    EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
+    EXPECT_EQ(gop_info->frames_since_key, 0);
+    EXPECT_EQ(gop_info->gop_global_index, 0);
+    toy_rate_ctrl->gop_global_index = 0;
+    toy_rate_ctrl->frames_since_key = 0;
+  } else {
+    EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
+  }
+  EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index);
+  EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key);
+  EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index);
+  EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index);
+
+  gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 3 : 1;
+  gop_decision->use_alt_ref = 0;
+  toy_rate_ctrl->frames_since_key +=
+      gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
+  toy_rate_ctrl->show_index +=
+      gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
+  ++toy_rate_ctrl->gop_global_index;
+  return VPX_RC_OK;
+}
+
+// Test on a 4 frame video.
+// Test a setting of 2 GOPs.
+// The first GOP has 4 coding frames. Use alt ref.
+// The second GOP only contains the overlay frame of the first GOP's alt ref
+// frame.
+vpx_rc_status_t rc_get_gop_decision_short_overlay(
+    vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info,
+    vpx_rc_gop_decision_t *gop_decision) {
+  ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
+  EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+  EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1);
+  EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval);
+  EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval);
+  EXPECT_EQ(gop_info->allow_alt_ref, 1);
+  if (gop_info->is_key_frame) {
+    EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
+    EXPECT_EQ(gop_info->frames_since_key, 0);
+    EXPECT_EQ(gop_info->gop_global_index, 0);
+    toy_rate_ctrl->gop_global_index = 0;
+    toy_rate_ctrl->frames_since_key = 0;
+  } else {
+    EXPECT_EQ(gop_info->last_gop_use_alt_ref, 1);
+  }
+  EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index);
+  EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key);
+  EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index);
+  EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index);
+
+  gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 4 : 1;
+  gop_decision->use_alt_ref = gop_info->is_key_frame ? 1 : 0;
+  toy_rate_ctrl->frames_since_key +=
+      gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
+  toy_rate_ctrl->show_index +=
+      gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
+  ++toy_rate_ctrl->gop_global_index;
+  return VPX_RC_OK;
+}
+
+// Test on a 4 frame video.
+// Test a setting of 1 GOP.
+// The GOP has 4 coding frames. Do not use alt ref.
+vpx_rc_status_t rc_get_gop_decision_short_no_arf(
+    vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info,
+    vpx_rc_gop_decision_t *gop_decision) {
+  ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
+  EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+  EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1);
+  EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval);
+  EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval);
+  EXPECT_EQ(gop_info->allow_alt_ref, 1);
+  if (gop_info->is_key_frame) {
+    EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
+    EXPECT_EQ(gop_info->frames_since_key, 0);
+    EXPECT_EQ(gop_info->gop_global_index, 0);
+    toy_rate_ctrl->gop_global_index = 0;
+    toy_rate_ctrl->frames_since_key = 0;
+  } else {
+    EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
+  }
+  EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index);
+  EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key);
+  EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index);
+  EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index);
+
+  gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 4 : 1;
+  gop_decision->use_alt_ref = 0;
+  toy_rate_ctrl->frames_since_key +=
+      gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
+  toy_rate_ctrl->show_index +=
+      gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
+  ++toy_rate_ctrl->gop_global_index;
+  return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_update_encodeframe_result(
+    vpx_rc_model_t rate_ctrl_model,
+    const vpx_rc_encodeframe_result_t *encode_frame_result) {
+  const ToyRateCtrl *toy_rate_ctrl =
+      static_cast<ToyRateCtrl *>(rate_ctrl_model);
+  EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+
+  const int64_t ref_pixel_count = 352 * 288 * 3 / 2;
+  EXPECT_EQ(encode_frame_result->pixel_count, ref_pixel_count);
+  if (toy_rate_ctrl->coding_index == kLosslessCodingIndex) {
+    EXPECT_EQ(encode_frame_result->sse, 0);
+  }
+  if (toy_rate_ctrl->coding_index == kLosslessCodingIndex) {
+    EXPECT_EQ(encode_frame_result->actual_encoding_qindex, 0);
+  } else {
+    EXPECT_EQ(encode_frame_result->actual_encoding_qindex, 100);
+  }
+  return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_update_encodeframe_result_gop(
+    vpx_rc_model_t rate_ctrl_model,
+    const vpx_rc_encodeframe_result_t *encode_frame_result) {
+  const ToyRateCtrl *toy_rate_ctrl =
+      static_cast<ToyRateCtrl *>(rate_ctrl_model);
+  EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+
+  const int64_t ref_pixel_count = 640 * 360 * 3 / 2;
+  EXPECT_EQ(encode_frame_result->pixel_count, ref_pixel_count);
+  return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_update_encodeframe_result_gop_short(
+    vpx_rc_model_t rate_ctrl_model,
+    const vpx_rc_encodeframe_result_t *encode_frame_result) {
+  const ToyRateCtrl *toy_rate_ctrl =
+      static_cast<ToyRateCtrl *>(rate_ctrl_model);
+  EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+
+  const int64_t ref_pixel_count = 352 * 288 * 3 / 2;
+  EXPECT_EQ(encode_frame_result->pixel_count, ref_pixel_count);
+  return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_get_default_frame_rdmult(
+    vpx_rc_model_t rate_ctrl_model,
+    const vpx_rc_encodeframe_info_t *encode_frame_info, int *rdmult) {
+  const ToyRateCtrl *toy_rate_ctrl =
+      static_cast<ToyRateCtrl *>(rate_ctrl_model);
+  EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+  EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort);
+  EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
+
+  *rdmult = VPX_DEFAULT_RDMULT;
+  return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_delete_model(vpx_rc_model_t rate_ctrl_model) {
+  ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
+  EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+  delete toy_rate_ctrl;
+  return VPX_RC_OK;
+}
+
+class ExtRateCtrlTest : public ::libvpx_test::EncoderTest,
+                        public ::testing::Test {
+ protected:
+  ExtRateCtrlTest() : EncoderTest(&::libvpx_test::kVP9) {}
+
+  ~ExtRateCtrlTest() override = default;
+
+  void SetUp() override {
+    InitializeConfig();
+    SetMode(::libvpx_test::kTwoPassGood);
+  }
+
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      vpx_rc_funcs_t rc_funcs = {};
+      rc_funcs.rc_type = VPX_RC_QP;
+      rc_funcs.create_model = rc_create_model;
+      rc_funcs.send_firstpass_stats = rc_send_firstpass_stats;
+      rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision;
+      rc_funcs.update_encodeframe_result = rc_update_encodeframe_result;
+      rc_funcs.delete_model = rc_delete_model;
+      rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
+      encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
+    }
+  }
+};
+
+TEST_F(ExtRateCtrlTest, EncodeTest) {
+  cfg_.rc_target_bitrate = 24000;
+
+  std::unique_ptr<libvpx_test::VideoSource> video;
+  video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
+      "bus_352x288_420_f20_b8.yuv", VPX_IMG_FMT_I420, 352, 288, 30, 1, 0,
+      kFrameNum));
+
+  ASSERT_NE(video, nullptr);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+}
+
+class ExtRateCtrlTestGOP : public ::libvpx_test::EncoderTest,
+                           public ::libvpx_test::CodecTestWithParam<int> {
+ protected:
+  ExtRateCtrlTestGOP() : EncoderTest(&::libvpx_test::kVP9) {}
+
+  ~ExtRateCtrlTestGOP() override = default;
+
+  void SetUp() override {
+    InitializeConfig();
+    SetMode(::libvpx_test::kTwoPassGood);
+  }
+
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval);
+      encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval);
+
+      vpx_rc_funcs_t rc_funcs = {};
+      rc_funcs.rc_type = VPX_RC_GOP_QP;
+      rc_funcs.create_model = rc_create_model_gop;
+      rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop;
+      rc_funcs.send_tpl_gop_stats = rc_send_tpl_gop_stats;
+      rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop;
+      rc_funcs.get_gop_decision = rc_get_gop_decision;
+      rc_funcs.update_encodeframe_result = rc_update_encodeframe_result_gop;
+      rc_funcs.delete_model = rc_delete_model;
+      rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
+      encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
+    }
+  }
+};
+
+TEST_F(ExtRateCtrlTestGOP, EncodeTest) {
+  cfg_.rc_target_bitrate = 4000;
+  cfg_.g_lag_in_frames = kMaxLagInFrames;
+  cfg_.rc_end_usage = VPX_VBR;
+
+  std::unique_ptr<libvpx_test::VideoSource> video;
+  video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
+      "noisy_clip_640_360.y4m", VPX_IMG_FMT_I420, 640, 360, 30, 1, 0,
+      kFrameNumGOP));
+
+  ASSERT_NE(video, nullptr);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+}
+
+class ExtRateCtrlTestGOPShort : public ::libvpx_test::EncoderTest,
+                                public ::libvpx_test::CodecTestWithParam<int> {
+ protected:
+  ExtRateCtrlTestGOPShort() : EncoderTest(&::libvpx_test::kVP9) {}
+
+  ~ExtRateCtrlTestGOPShort() override = default;
+
+  void SetUp() override {
+    InitializeConfig();
+    SetMode(::libvpx_test::kTwoPassGood);
+  }
+
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval);
+      encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval);
+      encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO);
+
+      vpx_rc_funcs_t rc_funcs = {};
+      rc_funcs.rc_type = VPX_RC_GOP_QP;
+      rc_funcs.create_model = rc_create_model_gop_short;
+      rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short;
+      rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop_short;
+      rc_funcs.get_gop_decision = rc_get_gop_decision_short;
+      rc_funcs.update_encodeframe_result =
+          rc_update_encodeframe_result_gop_short;
+      rc_funcs.delete_model = rc_delete_model;
+      rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
+      encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
+    }
+  }
+};
+
+TEST_F(ExtRateCtrlTestGOPShort, EncodeTest) {
+  cfg_.rc_target_bitrate = 500;
+  cfg_.g_lag_in_frames = kMaxLagInFrames - 1;
+  cfg_.rc_end_usage = VPX_VBR;
+
+  std::unique_ptr<libvpx_test::VideoSource> video;
+  video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
+      kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort));
+
+  ASSERT_NE(video, nullptr);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+}
+
+class ExtRateCtrlTestGOPShortOverlay
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWithParam<int> {
+ protected:
+  ExtRateCtrlTestGOPShortOverlay() : EncoderTest(&::libvpx_test::kVP9) {}
+
+  ~ExtRateCtrlTestGOPShortOverlay() override = default;
+
+  void SetUp() override {
+    InitializeConfig();
+    SetMode(::libvpx_test::kTwoPassGood);
+  }
+
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval);
+      encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval);
+      encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO);
+
+      vpx_rc_funcs_t rc_funcs = {};
+      rc_funcs.rc_type = VPX_RC_GOP_QP;
+      rc_funcs.create_model = rc_create_model_gop_short;
+      rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short;
+      rc_funcs.get_encodeframe_decision =
+          rc_get_encodeframe_decision_gop_short_overlay;
+      rc_funcs.get_gop_decision = rc_get_gop_decision_short_overlay;
+      rc_funcs.update_encodeframe_result =
+          rc_update_encodeframe_result_gop_short;
+      rc_funcs.delete_model = rc_delete_model;
+      rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
+      encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
+    }
+  }
+};
+
+TEST_F(ExtRateCtrlTestGOPShortOverlay, EncodeTest) {
+  cfg_.rc_target_bitrate = 500;
+  cfg_.g_lag_in_frames = kMaxLagInFrames - 1;
+  cfg_.rc_end_usage = VPX_VBR;
+
+  std::unique_ptr<libvpx_test::VideoSource> video;
+  video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
+      kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort));
+
+  ASSERT_NE(video, nullptr);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+}
+
+class ExtRateCtrlTestGOPShortNoARF
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWithParam<int> {
+ protected:
+  ExtRateCtrlTestGOPShortNoARF() : EncoderTest(&::libvpx_test::kVP9) {}
+
+  ~ExtRateCtrlTestGOPShortNoARF() override = default;
+
+  void SetUp() override {
+    InitializeConfig();
+    SetMode(::libvpx_test::kTwoPassGood);
+  }
+
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval);
+      encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval);
+      encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO);
+
+      vpx_rc_funcs_t rc_funcs = {};
+      rc_funcs.rc_type = VPX_RC_GOP_QP;
+      rc_funcs.create_model = rc_create_model_gop_short;
+      rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short;
+      rc_funcs.get_encodeframe_decision =
+          rc_get_encodeframe_decision_gop_short_no_arf;
+      rc_funcs.get_gop_decision = rc_get_gop_decision_short_no_arf;
+      rc_funcs.update_encodeframe_result =
+          rc_update_encodeframe_result_gop_short;
+      rc_funcs.delete_model = rc_delete_model;
+      rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
+      encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
+    }
+  }
+};
+
+TEST_F(ExtRateCtrlTestGOPShortNoARF, EncodeTest) {
+  cfg_.rc_target_bitrate = 500;
+  cfg_.g_lag_in_frames = kMaxLagInFrames - 1;
+  cfg_.rc_end_usage = VPX_VBR;
+
+  std::unique_ptr<libvpx_test::VideoSource> video;
+  video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
+      kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort));
+
+  ASSERT_NE(video, nullptr);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+}
+
+class ExtRateCtrlTestRdmult : public ::libvpx_test::EncoderTest,
+                              public ::testing::Test {
+ protected:
+  ExtRateCtrlTestRdmult() : EncoderTest(&::libvpx_test::kVP9) {}
+
+  ~ExtRateCtrlTestRdmult() override = default;
+
+  void SetUp() override {
+    InitializeConfig();
+    SetMode(::libvpx_test::kTwoPassGood);
+  }
+
+  void BeginPassHook(unsigned int) override {
+    psnr_ = 0.0;
+    nframes_ = 0;
+  }
+
+  void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override {
+    psnr_ += pkt->data.psnr.psnr[0];
+    nframes_++;
+  }
+
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      vpx_rc_funcs_t rc_funcs = {};
+      rc_funcs.rc_type = VPX_RC_GOP_QP_RDMULT;
+      rc_funcs.create_model = rc_create_model_gop_short;
+      rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short;
+      rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop_short;
+      rc_funcs.get_gop_decision = rc_get_gop_decision_short;
+      rc_funcs.update_encodeframe_result =
+          rc_update_encodeframe_result_gop_short;
+      rc_funcs.get_frame_rdmult = rc_get_default_frame_rdmult;
+      rc_funcs.delete_model = rc_delete_model;
+      rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
+      encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
+    }
+  }
+
+  double GetAveragePsnr() const {
+    if (nframes_) return psnr_ / nframes_;
+    return 0.0;
+  }
+
+ private:
+  double psnr_;
+  unsigned int nframes_;
+};
+
+TEST_F(ExtRateCtrlTestRdmult, DefaultRdmult) {
+  cfg_.rc_target_bitrate = 500;
+  cfg_.g_lag_in_frames = kMaxLagInFrames - 1;
+  cfg_.rc_end_usage = VPX_VBR;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+
+  std::unique_ptr<libvpx_test::VideoSource> video;
+  video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
+      kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort));
+
+  ASSERT_NE(video, nullptr);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+
+  const double psnr = GetAveragePsnr();
+  EXPECT_GT(psnr, kPsnrThreshold);
+}
+
+}  // namespace
diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc
index 29f689f07d..c69d43efbc 100644
--- a/test/vp9_intrapred_test.cc
+++ b/test/vp9_intrapred_test.cc
@@ -32,7 +32,7 @@ typedef void (*IntraPredFunc)(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left);
 
 struct IntraPredParam {
-  IntraPredParam(IntraPredFunc pred = NULL, IntraPredFunc ref = NULL,
+  IntraPredParam(IntraPredFunc pred = nullptr, IntraPredFunc ref = nullptr,
                  int block_size_value = 0, int bit_depth_value = 0)
       : pred_fn(pred), ref_fn(ref), block_size(block_size_value),
         bit_depth(bit_depth_value) {}
@@ -55,6 +55,21 @@ class IntraPredTest : public ::testing::TestWithParam<PredParam> {
     ref_dst_ = ref_dst;
     int error_count = 0;
     for (int i = 0; i < count_test_block; ++i) {
+      // TODO(webm:1797): Some of the optimised predictor implementations rely
+      // on the trailing half of the above_row_ being a copy of the final
+      // element, however relying on this in some cases can cause the MD5 tests
+      // to fail. We have fixed all of these cases for Neon, so fill the whole
+      // of above_row_ randomly.
+#if HAVE_NEON
+      // Fill edges with random data, try first with saturated values.
+      for (int x = -1; x < 2 * block_size; x++) {
+        if (i == 0) {
+          above_row_[x] = mask_;
+        } else {
+          above_row_[x] = rnd.Rand16() & mask_;
+        }
+      }
+#else
       // Fill edges with random data, try first with saturated values.
       for (int x = -1; x < block_size; x++) {
         if (i == 0) {
@@ -66,6 +81,7 @@ class IntraPredTest : public ::testing::TestWithParam<PredParam> {
       for (int x = block_size; x < 2 * block_size; x++) {
         above_row_[x] = above_row_[block_size - 1];
       }
+#endif
       for (int y = 0; y < block_size; y++) {
         if (i == 0) {
           left_col_[y] = mask_;
@@ -80,7 +96,7 @@ class IntraPredTest : public ::testing::TestWithParam<PredParam> {
   }
 
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     params_ = this->GetParam();
     stride_ = params_.block_size * 3;
     mask_ = (1 << params_.bit_depth) - 1;
@@ -243,6 +259,22 @@ INSTANTIATE_TEST_SUITE_P(
                        &vpx_d45_predictor_16x16_c, 16, 8),
         IntraPredParam(&vpx_d45_predictor_32x32_neon,
                        &vpx_d45_predictor_32x32_c, 32, 8),
+        IntraPredParam(&vpx_d63_predictor_4x4_neon, &vpx_d63_predictor_4x4_c, 4,
+                       8),
+        IntraPredParam(&vpx_d63_predictor_8x8_neon, &vpx_d63_predictor_8x8_c, 8,
+                       8),
+        IntraPredParam(&vpx_d63_predictor_16x16_neon,
+                       &vpx_d63_predictor_16x16_c, 16, 8),
+        IntraPredParam(&vpx_d63_predictor_32x32_neon,
+                       &vpx_d63_predictor_32x32_c, 32, 8),
+        IntraPredParam(&vpx_d117_predictor_4x4_neon, &vpx_d117_predictor_4x4_c,
+                       4, 8),
+        IntraPredParam(&vpx_d117_predictor_8x8_neon, &vpx_d117_predictor_8x8_c,
+                       8, 8),
+        IntraPredParam(&vpx_d117_predictor_16x16_neon,
+                       &vpx_d117_predictor_16x16_c, 16, 8),
+        IntraPredParam(&vpx_d117_predictor_32x32_neon,
+                       &vpx_d117_predictor_32x32_c, 32, 8),
         IntraPredParam(&vpx_d135_predictor_4x4_neon, &vpx_d135_predictor_4x4_c,
                        4, 8),
         IntraPredParam(&vpx_d135_predictor_8x8_neon, &vpx_d135_predictor_8x8_c,
@@ -251,6 +283,22 @@ INSTANTIATE_TEST_SUITE_P(
                        &vpx_d135_predictor_16x16_c, 16, 8),
         IntraPredParam(&vpx_d135_predictor_32x32_neon,
                        &vpx_d135_predictor_32x32_c, 32, 8),
+        IntraPredParam(&vpx_d153_predictor_4x4_neon, &vpx_d153_predictor_4x4_c,
+                       4, 8),
+        IntraPredParam(&vpx_d153_predictor_8x8_neon, &vpx_d153_predictor_8x8_c,
+                       8, 8),
+        IntraPredParam(&vpx_d153_predictor_16x16_neon,
+                       &vpx_d153_predictor_16x16_c, 16, 8),
+        IntraPredParam(&vpx_d153_predictor_32x32_neon,
+                       &vpx_d153_predictor_32x32_c, 32, 8),
+        IntraPredParam(&vpx_d207_predictor_4x4_neon, &vpx_d207_predictor_4x4_c,
+                       4, 8),
+        IntraPredParam(&vpx_d207_predictor_8x8_neon, &vpx_d207_predictor_8x8_c,
+                       8, 8),
+        IntraPredParam(&vpx_d207_predictor_16x16_neon,
+                       &vpx_d207_predictor_16x16_c, 16, 8),
+        IntraPredParam(&vpx_d207_predictor_32x32_neon,
+                       &vpx_d207_predictor_32x32_c, 32, 8),
         IntraPredParam(&vpx_dc_128_predictor_4x4_neon,
                        &vpx_dc_128_predictor_4x4_c, 4, 8),
         IntraPredParam(&vpx_dc_128_predictor_8x8_neon,
@@ -441,13 +489,23 @@ INSTANTIATE_TEST_SUITE_P(
                                      &vpx_v_predictor_32x32_c, 32, 8)));
 #endif  // HAVE_VSX
 
+#if HAVE_LSX
+INSTANTIATE_TEST_SUITE_P(
+    LSX, VP9IntraPredTest,
+    ::testing::Values(IntraPredParam(&vpx_dc_predictor_8x8_lsx,
+                                     &vpx_dc_predictor_8x8_c, 8, 8),
+                      IntraPredParam(&vpx_dc_predictor_16x16_lsx,
+                                     &vpx_dc_predictor_16x16_c, 16, 8)));
+#endif  // HAVE_LSX
+
 #if CONFIG_VP9_HIGHBITDEPTH
 typedef void (*HighbdIntraPred)(uint16_t *dst, ptrdiff_t stride,
                                 const uint16_t *above, const uint16_t *left,
                                 int bps);
 struct HighbdIntraPredParam {
-  HighbdIntraPredParam(HighbdIntraPred pred = NULL, HighbdIntraPred ref = NULL,
-                       int block_size_value = 0, int bit_depth_value = 0)
+  HighbdIntraPredParam(HighbdIntraPred pred = nullptr,
+                       HighbdIntraPred ref = nullptr, int block_size_value = 0,
+                       int bit_depth_value = 0)
       : pred_fn(pred), ref_fn(ref), block_size(block_size_value),
         bit_depth(bit_depth_value) {}
 
@@ -457,6 +515,7 @@ struct HighbdIntraPredParam {
   int bit_depth;
 };
 
+#if HAVE_SSSE3 || HAVE_NEON || HAVE_SSE2
 template <>
 void IntraPredTest<uint16_t, HighbdIntraPredParam>::Predict() {
   const int bit_depth = params_.bit_depth;
@@ -466,6 +525,7 @@ void IntraPredTest<uint16_t, HighbdIntraPredParam>::Predict() {
 }
 
 typedef IntraPredTest<uint16_t, HighbdIntraPredParam> VP9HighbdIntraPredTest;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(VP9HighbdIntraPredTest);
 
 TEST_P(VP9HighbdIntraPredTest, HighbdIntraPredTests) {
   // max block size is 32
@@ -475,6 +535,7 @@ TEST_P(VP9HighbdIntraPredTest, HighbdIntraPredTests) {
   DECLARE_ALIGNED(16, uint16_t, ref_dst[3 * 32 * 32]);
   RunTest(left_col, above_data, dst, ref_dst);
 }
+#endif
 
 #if HAVE_SSSE3
 INSTANTIATE_TEST_SUITE_P(
@@ -828,6 +889,22 @@ INSTANTIATE_TEST_SUITE_P(
                              &vpx_highbd_d45_predictor_16x16_c, 16, 8),
         HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon,
                              &vpx_highbd_d45_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_neon,
+                             &vpx_highbd_d63_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_neon,
+                             &vpx_highbd_d63_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_neon,
+                             &vpx_highbd_d63_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon,
+                             &vpx_highbd_d63_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_neon,
+                             &vpx_highbd_d117_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_neon,
+                             &vpx_highbd_d117_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_neon,
+                             &vpx_highbd_d117_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_neon,
+                             &vpx_highbd_d117_predictor_32x32_c, 32, 8),
         HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon,
                              &vpx_highbd_d135_predictor_4x4_c, 4, 8),
         HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon,
@@ -836,6 +913,22 @@ INSTANTIATE_TEST_SUITE_P(
                              &vpx_highbd_d135_predictor_16x16_c, 16, 8),
         HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon,
                              &vpx_highbd_d135_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_neon,
+                             &vpx_highbd_d153_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_neon,
+                             &vpx_highbd_d153_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_neon,
+                             &vpx_highbd_d153_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_neon,
+                             &vpx_highbd_d153_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_neon,
+                             &vpx_highbd_d207_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_neon,
+                             &vpx_highbd_d207_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_neon,
+                             &vpx_highbd_d207_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_neon,
+                             &vpx_highbd_d207_predictor_32x32_c, 32, 8),
         HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon,
                              &vpx_highbd_dc_128_predictor_4x4_c, 4, 8),
         HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon,
@@ -904,6 +997,22 @@ INSTANTIATE_TEST_SUITE_P(
                              &vpx_highbd_d45_predictor_16x16_c, 16, 10),
         HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon,
                              &vpx_highbd_d45_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_neon,
+                             &vpx_highbd_d63_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_neon,
+                             &vpx_highbd_d63_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_neon,
+                             &vpx_highbd_d63_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon,
+                             &vpx_highbd_d63_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_neon,
+                             &vpx_highbd_d117_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_neon,
+                             &vpx_highbd_d117_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_neon,
+                             &vpx_highbd_d117_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_neon,
+                             &vpx_highbd_d117_predictor_32x32_c, 32, 10),
         HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon,
                              &vpx_highbd_d135_predictor_4x4_c, 4, 10),
         HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon,
@@ -912,6 +1021,22 @@ INSTANTIATE_TEST_SUITE_P(
                              &vpx_highbd_d135_predictor_16x16_c, 16, 10),
         HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon,
                              &vpx_highbd_d135_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_neon,
+                             &vpx_highbd_d153_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_neon,
+                             &vpx_highbd_d153_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_neon,
+                             &vpx_highbd_d153_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_neon,
+                             &vpx_highbd_d153_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_neon,
+                             &vpx_highbd_d207_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_neon,
+                             &vpx_highbd_d207_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_neon,
+                             &vpx_highbd_d207_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_neon,
+                             &vpx_highbd_d207_predictor_32x32_c, 32, 10),
         HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon,
                              &vpx_highbd_dc_128_predictor_4x4_c, 4, 10),
         HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon,
@@ -980,6 +1105,22 @@ INSTANTIATE_TEST_SUITE_P(
                              &vpx_highbd_d45_predictor_16x16_c, 16, 12),
         HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon,
                              &vpx_highbd_d45_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_neon,
+                             &vpx_highbd_d63_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_neon,
+                             &vpx_highbd_d63_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_neon,
+                             &vpx_highbd_d63_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon,
+                             &vpx_highbd_d63_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_neon,
+                             &vpx_highbd_d117_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_neon,
+                             &vpx_highbd_d117_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_neon,
+                             &vpx_highbd_d117_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_neon,
+                             &vpx_highbd_d117_predictor_32x32_c, 32, 10),
         HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon,
                              &vpx_highbd_d135_predictor_4x4_c, 4, 12),
         HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon,
@@ -988,6 +1129,22 @@ INSTANTIATE_TEST_SUITE_P(
                              &vpx_highbd_d135_predictor_16x16_c, 16, 12),
         HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon,
                              &vpx_highbd_d135_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_neon,
+                             &vpx_highbd_d153_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_neon,
+                             &vpx_highbd_d153_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_neon,
+                             &vpx_highbd_d153_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_neon,
+                             &vpx_highbd_d153_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_neon,
+                             &vpx_highbd_d207_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_neon,
+                             &vpx_highbd_d207_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_neon,
+                             &vpx_highbd_d207_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_neon,
+                             &vpx_highbd_d207_predictor_32x32_c, 32, 12),
         HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon,
                              &vpx_highbd_dc_128_predictor_4x4_c, 4, 12),
         HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon,
diff --git a/test/vp9_lossless_test.cc b/test/vp9_lossless_test.cc
index 931ac30a36..fe3cd1aba4 100644
--- a/test/vp9_lossless_test.cc
+++ b/test/vp9_lossless_test.cc
@@ -29,15 +29,15 @@ class LosslessTest
       : EncoderTest(GET_PARAM(0)), psnr_(kMaxPsnr), nframes_(0),
         encoding_mode_(GET_PARAM(1)) {}
 
-  virtual ~LosslessTest() {}
+  ~LosslessTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(encoding_mode_);
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       // Only call Control if quantizer > 0 to verify that using quantizer
       // alone will activate lossless
@@ -47,12 +47,12 @@ class LosslessTest
     }
   }
 
-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
     psnr_ = kMaxPsnr;
     nframes_ = 0;
   }
 
-  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override {
     if (pkt->data.psnr.psnr[0] < psnr_) psnr_ = pkt->data.psnr.psnr[0];
   }
 
diff --git a/test/vp9_motion_vector_test.cc b/test/vp9_motion_vector_test.cc
index 68eecb8232..495ea11fce 100644
--- a/test/vp9_motion_vector_test.cc
+++ b/test/vp9_motion_vector_test.cc
@@ -42,9 +42,9 @@ class MotionVectorTestLarge
       : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
         cpu_used_(GET_PARAM(2)), mv_test_mode_(GET_PARAM(3)) {}
 
-  virtual ~MotionVectorTestLarge() {}
+  ~MotionVectorTestLarge() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(encoding_mode_);
     if (encoding_mode_ != ::libvpx_test::kRealTime) {
@@ -59,8 +59,8 @@ class MotionVectorTestLarge
     }
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
       encoder->Control(VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, mv_test_mode_);
@@ -88,7 +88,7 @@ TEST_P(MotionVectorTestLarge, OverallTest) {
       "niklas_640_480_30.yuv", VPX_IMG_FMT_I420, 3840, 2160,  // 2048, 1080,
       30, 1, 0, 5));
 
-  ASSERT_TRUE(video.get() != NULL);
+  ASSERT_NE(video.get(), nullptr);
   ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
 }
 
diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc
index 083a8844bf..e00ab4022c 100644
--- a/test/vp9_quantize_test.cc
+++ b/test/vp9_quantize_test.cc
@@ -26,6 +26,7 @@
 #include "test/util.h"
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/msvc.h"
@@ -38,36 +39,83 @@ namespace {
 const int number_of_iterations = 100;
 
 typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count,
-                             int skip_block, const int16_t *zbin,
-                             const int16_t *round, const int16_t *quant,
-                             const int16_t *quant_shift, tran_low_t *qcoeff,
-                             tran_low_t *dqcoeff, const int16_t *dequant,
-                             uint16_t *eob, const int16_t *scan,
-                             const int16_t *iscan);
+                             const macroblock_plane *mb_plane,
+                             tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                             const int16_t *dequant, uint16_t *eob,
+                             const struct ScanOrder *const scan_order);
 typedef std::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t,
                    int /*max_size*/, bool /*is_fp*/>
     QuantizeParam;
 
+// Wrapper for 32x32 version which does not use count
+typedef void (*Quantize32x32Func)(const tran_low_t *coeff,
+                                  const macroblock_plane *const mb_plane,
+                                  tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                                  const int16_t *dequant, uint16_t *eob,
+                                  const struct ScanOrder *const scan_order);
+
+template <Quantize32x32Func fn>
+void Quant32x32Wrapper(const tran_low_t *coeff, intptr_t count,
+                       const macroblock_plane *const mb_plane,
+                       tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                       const int16_t *dequant, uint16_t *eob,
+                       const struct ScanOrder *const scan_order) {
+  (void)count;
+  fn(coeff, mb_plane, qcoeff, dqcoeff, dequant, eob, scan_order);
+}
+
 // Wrapper for FP version which does not use zbin or quant_shift.
 typedef void (*QuantizeFPFunc)(const tran_low_t *coeff, intptr_t count,
-                               int skip_block, const int16_t *round,
-                               const int16_t *quant, tran_low_t *qcoeff,
-                               tran_low_t *dqcoeff, const int16_t *dequant,
-                               uint16_t *eob, const int16_t *scan,
-                               const int16_t *iscan);
+                               const macroblock_plane *const mb_plane,
+                               tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                               const int16_t *dequant, uint16_t *eob,
+                               const struct ScanOrder *const scan_order);
 
 template <QuantizeFPFunc fn>
-void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, int skip_block,
-                    const int16_t *zbin, const int16_t *round,
-                    const int16_t *quant, const int16_t *quant_shift,
-                    tran_low_t *qcoeff, tran_low_t *dqcoeff,
-                    const int16_t *dequant, uint16_t *eob, const int16_t *scan,
-                    const int16_t *iscan) {
-  (void)zbin;
-  (void)quant_shift;
-
-  fn(coeff, count, skip_block, round, quant, qcoeff, dqcoeff, dequant, eob,
-     scan, iscan);
+void QuantFPWrapper(const tran_low_t *coeff, intptr_t count,
+                    const macroblock_plane *const mb_plane, tran_low_t *qcoeff,
+                    tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob,
+                    const struct ScanOrder *const scan_order) {
+  fn(coeff, count, mb_plane, qcoeff, dqcoeff, dequant, eob, scan_order);
+}
+
+void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round,
+                          int16_t *quant, int16_t *quant_shift,
+                          int16_t *dequant, int16_t *round_fp,
+                          int16_t *quant_fp) {
+  // Max when q == 0. Otherwise, it is 48 for Y and 42 for U/V.
+  constexpr int kMaxQRoundingFactorFp = 64;
+
+  for (int j = 0; j < 2; j++) {
+    // The range is 4 to 1828 in the VP9 tables.
+    const int qlookup = rnd->RandRange(1825) + 4;
+    round_fp[j] = (kMaxQRoundingFactorFp * qlookup) >> 7;
+    quant_fp[j] = (1 << 16) / qlookup;
+
+    // Values determined by deconstructing vp9_init_quantizer().
+    // zbin may be up to 1143 for 8 and 10 bit Y values, or 1200 for 12 bit Y
+    // values or U/V values of any bit depth. This is because y_delta is not
+    // factored into the vp9_ac_quant() call.
+    zbin[j] = rnd->RandRange(1200);
+
+    // round may be up to 685 for Y values or 914 for U/V.
+    round[j] = rnd->RandRange(914);
+    // quant ranges from 1 to -32703
+    quant[j] = static_cast<int>(rnd->RandRange(32704)) - 32703;
+    // quant_shift goes up to 1 << 16.
+    quant_shift[j] = rnd->RandRange(16384);
+    // dequant maxes out at 1828 for all cases.
+    dequant[j] = rnd->RandRange(1828);
+  }
+  for (int j = 2; j < 8; j++) {
+    zbin[j] = zbin[1];
+    round_fp[j] = round_fp[1];
+    quant_fp[j] = quant_fp[1];
+    round[j] = round[1];
+    quant[j] = quant[1];
+    quant_shift[j] = quant_shift[1];
+    dequant[j] = dequant[1];
+  }
 }
 
 class VP9QuantizeBase : public AbstractBench {
@@ -83,17 +131,21 @@ class VP9QuantizeBase : public AbstractBench {
 #else
     max_value_ = (1 << bit_depth_) - 1;
 #endif
-    zbin_ptr_ =
+
+    mb_plane_ = reinterpret_cast<macroblock_plane *>(
+        vpx_memalign(16, sizeof(macroblock_plane)));
+
+    zbin_ptr_ = mb_plane_->zbin =
         reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*zbin_ptr_)));
-    round_fp_ptr_ = reinterpret_cast<int16_t *>(
+    round_fp_ptr_ = mb_plane_->round_fp = reinterpret_cast<int16_t *>(
         vpx_memalign(16, 8 * sizeof(*round_fp_ptr_)));
-    quant_fp_ptr_ = reinterpret_cast<int16_t *>(
+    quant_fp_ptr_ = mb_plane_->quant_fp = reinterpret_cast<int16_t *>(
         vpx_memalign(16, 8 * sizeof(*quant_fp_ptr_)));
-    round_ptr_ =
+    round_ptr_ = mb_plane_->round =
         reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*round_ptr_)));
-    quant_ptr_ =
+    quant_ptr_ = mb_plane_->quant =
         reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*quant_ptr_)));
-    quant_shift_ptr_ = reinterpret_cast<int16_t *>(
+    quant_shift_ptr_ = mb_plane_->quant_shift = reinterpret_cast<int16_t *>(
         vpx_memalign(16, 8 * sizeof(*quant_shift_ptr_)));
     dequant_ptr_ = reinterpret_cast<int16_t *>(
         vpx_memalign(16, 8 * sizeof(*dequant_ptr_)));
@@ -102,7 +154,8 @@ class VP9QuantizeBase : public AbstractBench {
     q_ptr_ = (is_fp_) ? quant_fp_ptr_ : quant_ptr_;
   }
 
-  ~VP9QuantizeBase() {
+  ~VP9QuantizeBase() override {
+    vpx_free(mb_plane_);
     vpx_free(zbin_ptr_);
     vpx_free(round_fp_ptr_);
     vpx_free(quant_fp_ptr_);
@@ -110,20 +163,22 @@ class VP9QuantizeBase : public AbstractBench {
     vpx_free(quant_ptr_);
     vpx_free(quant_shift_ptr_);
     vpx_free(dequant_ptr_);
-    zbin_ptr_ = NULL;
-    round_fp_ptr_ = NULL;
-    quant_fp_ptr_ = NULL;
-    round_ptr_ = NULL;
-    quant_ptr_ = NULL;
-    quant_shift_ptr_ = NULL;
-    dequant_ptr_ = NULL;
+    mb_plane_ = nullptr;
+    zbin_ptr_ = nullptr;
+    round_fp_ptr_ = nullptr;
+    quant_fp_ptr_ = nullptr;
+    round_ptr_ = nullptr;
+    quant_ptr_ = nullptr;
+    quant_shift_ptr_ = nullptr;
+    dequant_ptr_ = nullptr;
     libvpx_test::ClearSystemState();
   }
 
  protected:
+  macroblock_plane *mb_plane_;
   int16_t *zbin_ptr_;
-  int16_t *round_fp_ptr_;
   int16_t *quant_fp_ptr_;
+  int16_t *round_fp_ptr_;
   int16_t *round_ptr_;
   int16_t *quant_ptr_;
   int16_t *quant_shift_ptr_;
@@ -138,8 +193,7 @@ class VP9QuantizeBase : public AbstractBench {
   int16_t *r_ptr_;
   int16_t *q_ptr_;
   int count_;
-  int skip_block_;
-  const scan_order *scan_;
+  const ScanOrder *scan_;
   uint16_t eob_;
 };
 
@@ -151,32 +205,125 @@ class VP9QuantizeTest : public VP9QuantizeBase,
         quantize_op_(GET_PARAM(0)), ref_quantize_op_(GET_PARAM(1)) {}
 
  protected:
-  virtual void Run();
+  void Run() override;
+  void Speed(bool is_median);
   const QuantizeFunc quantize_op_;
   const QuantizeFunc ref_quantize_op_;
 };
 
 void VP9QuantizeTest::Run() {
-  quantize_op_(coeff_.TopLeftPixel(), count_, skip_block_, zbin_ptr_, r_ptr_,
-               q_ptr_, quant_shift_ptr_, qcoeff_.TopLeftPixel(),
-               dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_->scan,
-               scan_->iscan);
+  quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, qcoeff_.TopLeftPixel(),
+               dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_);
+}
+
+void VP9QuantizeTest::Speed(bool is_median) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  ASSERT_TRUE(coeff_.Init());
+  ASSERT_TRUE(qcoeff_.Init());
+  ASSERT_TRUE(dqcoeff_.Init());
+  TX_SIZE starting_sz, ending_sz;
+
+  if (max_size_ == 16) {
+    starting_sz = TX_4X4;
+    ending_sz = TX_16X16;
+  } else {
+    starting_sz = TX_32X32;
+    ending_sz = TX_32X32;
+  }
+
+  for (TX_SIZE sz = starting_sz; sz <= ending_sz; ++sz) {
+    // zbin > coeff, zbin < coeff.
+    for (int i = 0; i < 2; ++i) {
+      // TX_TYPE defines the scan order. That is not relevant to the speed test.
+      // Pick the first one.
+      const TX_TYPE tx_type = DCT_DCT;
+      count_ = (4 << sz) * (4 << sz);
+      scan_ = &vp9_scan_orders[sz][tx_type];
+
+      GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
+                           quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
+                           quant_fp_ptr_);
+
+      if (i == 0) {
+        // When |coeff values| are less than zbin the results are 0.
+        int threshold = 100;
+        if (max_size_ == 32) {
+          // For 32x32, the threshold is halved. Double it to keep the values
+          // from clearing it.
+          threshold = 200;
+        }
+        for (int j = 0; j < 8; ++j) zbin_ptr_[j] = threshold;
+        coeff_.Set(&rnd, -99, 99);
+      } else if (i == 1) {
+        for (int j = 0; j < 8; ++j) zbin_ptr_[j] = 50;
+        coeff_.Set(&rnd, -500, 500);
+      }
+
+      const char *type =
+          (i == 0) ? "Bypass calculations " : "Full calculations ";
+      char block_size[16];
+      snprintf(block_size, sizeof(block_size), "%dx%d", 4 << sz, 4 << sz);
+      char title[100];
+      snprintf(title, sizeof(title), "%25s %8s ", type, block_size);
+
+      if (is_median) {
+        RunNTimes(10000000 / count_);
+        PrintMedian(title);
+      } else {
+        Buffer<tran_low_t> ref_qcoeff =
+            Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
+        ASSERT_TRUE(ref_qcoeff.Init());
+        Buffer<tran_low_t> ref_dqcoeff =
+            Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
+        ASSERT_TRUE(ref_dqcoeff.Init());
+        uint16_t ref_eob = 0;
+
+        const int kNumTests = 5000000;
+        vpx_usec_timer timer, simd_timer;
+
+        vpx_usec_timer_start(&timer);
+        for (int n = 0; n < kNumTests; ++n) {
+          ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_,
+                           ref_qcoeff.TopLeftPixel(),
+                           ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob,
+                           scan_);
+        }
+        vpx_usec_timer_mark(&timer);
+
+        vpx_usec_timer_start(&simd_timer);
+        for (int n = 0; n < kNumTests; ++n) {
+          quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_,
+                       qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(),
+                       dequant_ptr_, &eob_, scan_);
+        }
+        vpx_usec_timer_mark(&simd_timer);
+
+        const int elapsed_time =
+            static_cast<int>(vpx_usec_timer_elapsed(&timer));
+        const int simd_elapsed_time =
+            static_cast<int>(vpx_usec_timer_elapsed(&simd_timer));
+        printf("%s c_time = %d \t simd_time = %d \t Gain = %f \n", title,
+               elapsed_time, simd_elapsed_time,
+               ((float)elapsed_time / simd_elapsed_time));
+      }
+    }
+  }
 }
 
 // This quantizer compares the AC coefficients to the quantization step size to
 // determine if further multiplication operations are needed.
 // Based on vp9_quantize_fp_sse2().
 inline void quant_fp_nz(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                        int skip_block, const int16_t *round_ptr,
-                        const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
-                        tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                        uint16_t *eob_ptr, const int16_t *scan,
-                        const int16_t *iscan, int is_32x32) {
+                        const struct macroblock_plane *const mb_plane,
+                        tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                        const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                        const struct ScanOrder *const scan_order,
+                        int is_32x32) {
   int i, eob = -1;
   const int thr = dequant_ptr[1] >> (1 + is_32x32);
-  (void)iscan;
-  (void)skip_block;
-  assert(!skip_block);
+  const int16_t *round_ptr = mb_plane->round_fp;
+  const int16_t *quant_ptr = mb_plane->quant_fp;
+  const int16_t *scan = scan_order->scan;
 
   // Quantization pass: All coefficients with index >= zero_flag are
   // skippable. Note: zero_flag can be zero.
@@ -243,62 +390,21 @@ inline void quant_fp_nz(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
 }
 
 void quantize_fp_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                      int skip_block, const int16_t *round_ptr,
-                      const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
-                      tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                      uint16_t *eob_ptr, const int16_t *scan,
-                      const int16_t *iscan) {
-  quant_fp_nz(coeff_ptr, n_coeffs, skip_block, round_ptr, quant_ptr, qcoeff_ptr,
-              dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 0);
+                      const struct macroblock_plane *mb_plane,
+                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                      const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                      const struct ScanOrder *const scan_order) {
+  quant_fp_nz(coeff_ptr, n_coeffs, mb_plane, qcoeff_ptr, dqcoeff_ptr,
+              dequant_ptr, eob_ptr, scan_order, 0);
 }
 
 void quantize_fp_32x32_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                            int skip_block, const int16_t *round_ptr,
-                            const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
-                            tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                            uint16_t *eob_ptr, const int16_t *scan,
-                            const int16_t *iscan) {
-  quant_fp_nz(coeff_ptr, n_coeffs, skip_block, round_ptr, quant_ptr, qcoeff_ptr,
-              dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 1);
-}
-
-void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round,
-                          int16_t *quant, int16_t *quant_shift,
-                          int16_t *dequant, int16_t *round_fp,
-                          int16_t *quant_fp) {
-  // Max when q == 0.  Otherwise, it is 48 for Y and 42 for U/V.
-  const int max_qrounding_factor_fp = 64;
-
-  for (int j = 0; j < 2; j++) {
-    // The range is 4 to 1828 in the VP9 tables.
-    const int qlookup = rnd->RandRange(1825) + 4;
-    round_fp[j] = (max_qrounding_factor_fp * qlookup) >> 7;
-    quant_fp[j] = (1 << 16) / qlookup;
-
-    // Values determined by deconstructing vp9_init_quantizer().
-    // zbin may be up to 1143 for 8 and 10 bit Y values, or 1200 for 12 bit Y
-    // values or U/V values of any bit depth. This is because y_delta is not
-    // factored into the vp9_ac_quant() call.
-    zbin[j] = rnd->RandRange(1200);
-
-    // round may be up to 685 for Y values or 914 for U/V.
-    round[j] = rnd->RandRange(914);
-    // quant ranges from 1 to -32703
-    quant[j] = static_cast<int>(rnd->RandRange(32704)) - 32703;
-    // quant_shift goes up to 1 << 16.
-    quant_shift[j] = rnd->RandRange(16384);
-    // dequant maxes out at 1828 for all cases.
-    dequant[j] = rnd->RandRange(1828);
-  }
-  for (int j = 2; j < 8; j++) {
-    zbin[j] = zbin[1];
-    round_fp[j] = round_fp[1];
-    quant_fp[j] = quant_fp[1];
-    round[j] = round[1];
-    quant[j] = quant[1];
-    quant_shift[j] = quant_shift[1];
-    dequant[j] = dequant[1];
-  }
+                            const struct macroblock_plane *mb_plane,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                            const struct ScanOrder *const scan_order) {
+  quant_fp_nz(coeff_ptr, n_coeffs, mb_plane, qcoeff_ptr, dqcoeff_ptr,
+              dequant_ptr, eob_ptr, scan_order, 1);
 }
 
 TEST_P(VP9QuantizeTest, OperationCheck) {
@@ -316,9 +422,6 @@ TEST_P(VP9QuantizeTest, OperationCheck) {
   eob_ = 0;
 
   for (int i = 0; i < number_of_iterations; ++i) {
-    // Test skip block for the first three iterations to catch all the different
-    // sizes.
-    const int skip_block = 0;
     TX_SIZE sz;
     if (max_size_ == 16) {
       sz = static_cast<TX_SIZE>(i % 3);  // TX_4X4, TX_8X8 TX_16X16
@@ -332,15 +435,13 @@ TEST_P(VP9QuantizeTest, OperationCheck) {
     GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
                          quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
                          quant_fp_ptr_);
-    ref_quantize_op_(coeff_.TopLeftPixel(), count_, skip_block, zbin_ptr_,
-                     r_ptr_, q_ptr_, quant_shift_ptr_,
+    ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_,
                      ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(),
-                     dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan);
+                     dequant_ptr_, &ref_eob, scan_);
 
     ASM_REGISTER_STATE_CHECK(quantize_op_(
-        coeff_.TopLeftPixel(), count_, skip_block, zbin_ptr_, r_ptr_, q_ptr_,
-        quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(),
-        dequant_ptr_, &eob_, scan_->scan, scan_->iscan));
+        coeff_.TopLeftPixel(), count_, mb_plane_, qcoeff_.TopLeftPixel(),
+        dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_));
 
     EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff));
     EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff));
@@ -372,7 +473,6 @@ TEST_P(VP9QuantizeTest, EOBCheck) {
   const uint32_t max_index = max_size_ * max_size_ - 1;
 
   for (int i = 0; i < number_of_iterations; ++i) {
-    skip_block_ = 0;
     TX_SIZE sz;
     if (max_size_ == 16) {
       sz = static_cast<TX_SIZE>(i % 3);  // TX_4X4, TX_8X8 TX_16X16
@@ -391,15 +491,13 @@ TEST_P(VP9QuantizeTest, EOBCheck) {
     GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
                          quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
                          quant_fp_ptr_);
-    ref_quantize_op_(coeff_.TopLeftPixel(), count_, skip_block_, zbin_ptr_,
-                     r_ptr_, q_ptr_, quant_shift_ptr_,
+    ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_,
                      ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(),
-                     dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan);
+                     dequant_ptr_, &ref_eob, scan_);
 
     ASM_REGISTER_STATE_CHECK(quantize_op_(
-        coeff_.TopLeftPixel(), count_, skip_block_, zbin_ptr_, r_ptr_, q_ptr_,
-        quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(),
-        dequant_ptr_, &eob_, scan_->scan, scan_->iscan));
+        coeff_.TopLeftPixel(), count_, mb_plane_, qcoeff_.TopLeftPixel(),
+        dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_));
 
     EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff));
     EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff));
@@ -415,61 +513,9 @@ TEST_P(VP9QuantizeTest, EOBCheck) {
   }
 }
 
-TEST_P(VP9QuantizeTest, DISABLED_Speed) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  ASSERT_TRUE(coeff_.Init());
-  ASSERT_TRUE(qcoeff_.Init());
-  ASSERT_TRUE(dqcoeff_.Init());
-  TX_SIZE starting_sz, ending_sz;
+TEST_P(VP9QuantizeTest, DISABLED_Speed) { Speed(false); }
 
-  if (max_size_ == 16) {
-    starting_sz = TX_4X4;
-    ending_sz = TX_16X16;
-  } else {
-    starting_sz = TX_32X32;
-    ending_sz = TX_32X32;
-  }
-
-  for (TX_SIZE sz = starting_sz; sz <= ending_sz; ++sz) {
-    // zbin > coeff, zbin < coeff.
-    for (int i = 0; i < 2; ++i) {
-      skip_block_ = 0;
-      // TX_TYPE defines the scan order. That is not relevant to the speed test.
-      // Pick the first one.
-      const TX_TYPE tx_type = DCT_DCT;
-      count_ = (4 << sz) * (4 << sz);
-      scan_ = &vp9_scan_orders[sz][tx_type];
-
-      GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
-                           quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
-                           quant_fp_ptr_);
-
-      if (i == 0) {
-        // When |coeff values| are less than zbin the results are 0.
-        int threshold = 100;
-        if (max_size_ == 32) {
-          // For 32x32, the threshold is halved. Double it to keep the values
-          // from clearing it.
-          threshold = 200;
-        }
-        for (int j = 0; j < 8; ++j) zbin_ptr_[j] = threshold;
-        coeff_.Set(&rnd, -99, 99);
-      } else if (i == 1) {
-        for (int j = 0; j < 8; ++j) zbin_ptr_[j] = 50;
-        coeff_.Set(&rnd, -500, 500);
-      }
-
-      RunNTimes(10000000 / count_);
-      const char *type =
-          (i == 0) ? "Bypass calculations " : "Full calculations ";
-      char block_size[16];
-      snprintf(block_size, sizeof(block_size), "%dx%d", 4 << sz, 4 << sz);
-      char title[100];
-      snprintf(title, sizeof(title), "%25s %8s ", type, block_size);
-      PrintMedian(title);
-    }
-  }
-}
+TEST_P(VP9QuantizeTest, DISABLED_SpeedMedian) { Speed(true); }
 
 using std::make_tuple;
 
@@ -478,25 +524,30 @@ using std::make_tuple;
 INSTANTIATE_TEST_SUITE_P(
     SSE2, VP9QuantizeTest,
     ::testing::Values(
-        make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8, 16,
+        make_tuple(vpx_quantize_b_sse2, vpx_quantize_b_c, VPX_BITS_8, 16,
                    false),
-        make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
+        make_tuple(&QuantFPWrapper<vp9_quantize_fp_sse2>,
+                   &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, 16, true),
+        make_tuple(vpx_highbd_quantize_b_sse2, vpx_highbd_quantize_b_c,
                    VPX_BITS_8, 16, false),
-        make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
+        make_tuple(vpx_highbd_quantize_b_sse2, vpx_highbd_quantize_b_c,
                    VPX_BITS_10, 16, false),
-        make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
+        make_tuple(vpx_highbd_quantize_b_sse2, vpx_highbd_quantize_b_c,
                    VPX_BITS_12, 16, false),
-        make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
-                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false),
-        make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
-                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false),
-        make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
-                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false)));
+        make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_sse2>,
+                   &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>,
+                   VPX_BITS_8, 32, false),
+        make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_sse2>,
+                   &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>,
+                   VPX_BITS_10, 32, false),
+        make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_sse2>,
+                   &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>,
+                   VPX_BITS_12, 32, false)));
 
 #else
 INSTANTIATE_TEST_SUITE_P(
     SSE2, VP9QuantizeTest,
-    ::testing::Values(make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c,
+    ::testing::Values(make_tuple(vpx_quantize_b_sse2, vpx_quantize_b_c,
                                  VPX_BITS_8, 16, false),
                       make_tuple(&QuantFPWrapper<vp9_quantize_fp_sse2>,
                                  &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8,
@@ -505,64 +556,126 @@ INSTANTIATE_TEST_SUITE_P(
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
-#if VPX_ARCH_X86_64
 INSTANTIATE_TEST_SUITE_P(
     SSSE3, VP9QuantizeTest,
-    ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c,
+    ::testing::Values(make_tuple(vpx_quantize_b_ssse3, vpx_quantize_b_c,
                                  VPX_BITS_8, 16, false),
-                      make_tuple(&vpx_quantize_b_32x32_ssse3,
-                                 &vpx_quantize_b_32x32_c, VPX_BITS_8, 32,
-                                 false),
+                      make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_ssse3>,
+                                 &Quant32x32Wrapper<vpx_quantize_b_32x32_c>,
+                                 VPX_BITS_8, 32, false),
                       make_tuple(&QuantFPWrapper<vp9_quantize_fp_ssse3>,
                                  &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8,
                                  16, true),
                       make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_ssse3>,
                                  &QuantFPWrapper<quantize_fp_32x32_nz_c>,
                                  VPX_BITS_8, 32, true)));
-#else
-INSTANTIATE_TEST_SUITE_P(
-    SSSE3, VP9QuantizeTest,
-    ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c,
-                                 VPX_BITS_8, 16, false),
-                      make_tuple(&vpx_quantize_b_32x32_ssse3,
-                                 &vpx_quantize_b_32x32_c, VPX_BITS_8, 32,
-                                 false)));
-
-#endif  // VPX_ARCH_X86_64
 #endif  // HAVE_SSSE3
 
 #if HAVE_AVX
-INSTANTIATE_TEST_SUITE_P(AVX, VP9QuantizeTest,
-                         ::testing::Values(make_tuple(&vpx_quantize_b_avx,
-                                                      &vpx_quantize_b_c,
-                                                      VPX_BITS_8, 16, false),
-                                           make_tuple(&vpx_quantize_b_32x32_avx,
-                                                      &vpx_quantize_b_32x32_c,
-                                                      VPX_BITS_8, 32, false)));
+INSTANTIATE_TEST_SUITE_P(
+    AVX, VP9QuantizeTest,
+    ::testing::Values(make_tuple(vpx_quantize_b_avx, vpx_quantize_b_c,
+                                 VPX_BITS_8, 16, false),
+                      make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_avx>,
+                                 &Quant32x32Wrapper<vpx_quantize_b_32x32_c>,
+                                 VPX_BITS_8, 32, false)));
 #endif  // HAVE_AVX
 
 #if VPX_ARCH_X86_64 && HAVE_AVX2
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, VP9QuantizeTest,
+    ::testing::Values(
+        make_tuple(&QuantFPWrapper<vp9_quantize_fp_avx2>,
+                   &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, 16, true),
+        make_tuple(&QuantFPWrapper<vp9_highbd_quantize_fp_avx2>,
+                   &QuantFPWrapper<vp9_highbd_quantize_fp_c>, VPX_BITS_12, 16,
+                   true),
+        make_tuple(&QuantFPWrapper<vp9_highbd_quantize_fp_32x32_avx2>,
+                   &QuantFPWrapper<vp9_highbd_quantize_fp_32x32_c>, VPX_BITS_12,
+                   32, true),
+        make_tuple(vpx_quantize_b_avx2, vpx_quantize_b_c, VPX_BITS_8, 16,
+                   false),
+        make_tuple(vpx_highbd_quantize_b_avx2, vpx_highbd_quantize_b_c,
+                   VPX_BITS_8, 16, false),
+        make_tuple(vpx_highbd_quantize_b_avx2, vpx_highbd_quantize_b_c,
+                   VPX_BITS_10, 16, false),
+        make_tuple(vpx_highbd_quantize_b_avx2, vpx_highbd_quantize_b_c,
+                   VPX_BITS_12, 16, false),
+        make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_avx2>,
+                   &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, VPX_BITS_8, 32,
+                   false),
+        make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_avx2>,
+                   &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>,
+                   VPX_BITS_8, 32, false),
+        make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_avx2>,
+                   &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>,
+                   VPX_BITS_10, 32, false),
+        make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_avx2>,
+                   &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>,
+                   VPX_BITS_12, 32, false)));
+#else
 INSTANTIATE_TEST_SUITE_P(
     AVX2, VP9QuantizeTest,
     ::testing::Values(make_tuple(&QuantFPWrapper<vp9_quantize_fp_avx2>,
                                  &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8,
-                                 16, true)));
+                                 16, true),
+                      make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_avx2>,
+                                 &QuantFPWrapper<quantize_fp_32x32_nz_c>,
+                                 VPX_BITS_8, 32, true),
+                      make_tuple(vpx_quantize_b_avx2, vpx_quantize_b_c,
+                                 VPX_BITS_8, 16, false),
+                      make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_avx2>,
+                                 &Quant32x32Wrapper<vpx_quantize_b_32x32_c>,
+                                 VPX_BITS_8, 32, false)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_AVX2
 
 #if HAVE_NEON
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+    NEON, VP9QuantizeTest,
+    ::testing::Values(
+        make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_8, 16,
+                   false),
+        make_tuple(vpx_highbd_quantize_b_neon, vpx_highbd_quantize_b_c,
+                   VPX_BITS_8, 16, false),
+        make_tuple(vpx_highbd_quantize_b_neon, vpx_highbd_quantize_b_c,
+                   VPX_BITS_10, 16, false),
+        make_tuple(vpx_highbd_quantize_b_neon, vpx_highbd_quantize_b_c,
+                   VPX_BITS_12, 16, false),
+        make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_neon>,
+                   &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, VPX_BITS_8, 32,
+                   false),
+        make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_neon>,
+                   &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>,
+                   VPX_BITS_8, 32, false),
+        make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_neon>,
+                   &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>,
+                   VPX_BITS_10, 32, false),
+        make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_neon>,
+                   &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>,
+                   VPX_BITS_12, 32, false),
+        make_tuple(&QuantFPWrapper<vp9_quantize_fp_neon>,
+                   &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16, true),
+        make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_neon>,
+                   &QuantFPWrapper<vp9_quantize_fp_32x32_c>, VPX_BITS_8, 32,
+                   true)));
+#else
 INSTANTIATE_TEST_SUITE_P(
     NEON, VP9QuantizeTest,
     ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c,
                                  VPX_BITS_8, 16, false),
-                      make_tuple(&vpx_quantize_b_32x32_neon,
-                                 &vpx_quantize_b_32x32_c, VPX_BITS_8, 32,
-                                 false),
+                      make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_neon>,
+                                 &Quant32x32Wrapper<vpx_quantize_b_32x32_c>,
+                                 VPX_BITS_8, 32, false),
                       make_tuple(&QuantFPWrapper<vp9_quantize_fp_neon>,
                                  &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8,
                                  16, true),
                       make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_neon>,
                                  &QuantFPWrapper<vp9_quantize_fp_32x32_c>,
                                  VPX_BITS_8, 32, true)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_NEON
 
 #if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH
@@ -581,13 +694,24 @@ INSTANTIATE_TEST_SUITE_P(
                                  VPX_BITS_8, 32, true)));
 #endif  // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH
 
+#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+    LSX, VP9QuantizeTest,
+    ::testing::Values(make_tuple(&vpx_quantize_b_lsx, &vpx_quantize_b_c,
+                                 VPX_BITS_8, 16, false),
+                      make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_lsx>,
+                                 &Quant32x32Wrapper<vpx_quantize_b_32x32_c>,
+                                 VPX_BITS_8, 32, false)));
+#endif  // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH
+
 // Only useful to compare "Speed" test results.
 INSTANTIATE_TEST_SUITE_P(
     DISABLED_C, VP9QuantizeTest,
     ::testing::Values(
         make_tuple(&vpx_quantize_b_c, &vpx_quantize_b_c, VPX_BITS_8, 16, false),
-        make_tuple(&vpx_quantize_b_32x32_c, &vpx_quantize_b_32x32_c, VPX_BITS_8,
-                   32, false),
+        make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_c>,
+                   &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, VPX_BITS_8, 32,
+                   false),
         make_tuple(&QuantFPWrapper<vp9_quantize_fp_c>,
                    &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16, true),
         make_tuple(&QuantFPWrapper<quantize_fp_nz_c>,
diff --git a/test/vp9_ratectrl_rtc_test.cc b/test/vp9_ratectrl_rtc_test.cc
new file mode 100644
index 0000000000..f7be47542c
--- /dev/null
+++ b/test/vp9_ratectrl_rtc_test.cc
@@ -0,0 +1,672 @@
+/*
+ *  Copyright (c) 2020 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "vp9/ratectrl_rtc.h"
+
+#include <fstream>  // NOLINT
+#include <string>
+
+#include "./vpx_config.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/video_source.h"
+#include "vpx/vpx_codec.h"
+#include "vpx_ports/bitops.h"
+
+namespace {
+
+const size_t kNumFrames = 300;
+
+const int kTemporalId3Layer[4] = { 0, 2, 1, 2 };
+const int kTemporalId2Layer[2] = { 0, 1 };
+const int kTemporalRateAllocation3Layer[3] = { 50, 70, 100 };
+const int kTemporalRateAllocation2Layer[2] = { 60, 100 };
+const int kSpatialLayerBitrate[3] = { 200, 400, 1000 };
+const int kSpatialLayerBitrateLow[3] = { 50, 100, 400 };
+
+class RcInterfaceTest
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<int, vpx_rc_mode> {
+ public:
+  RcInterfaceTest()
+      : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)), key_interval_(3000),
+        encoder_exit_(false), frame_drop_thresh_(0), num_drops_(0) {}
+
+  ~RcInterfaceTest() override = default;
+
+ protected:
+  void SetUp() override {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+  }
+
+  void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                          libvpx_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      encoder->Control(VP8E_SET_CPUUSED, 7);
+      encoder->Control(VP9E_SET_AQ_MODE, aq_mode_);
+      if (rc_cfg_.is_screen) {
+        encoder->Control(VP9E_SET_TUNE_CONTENT, VP9E_CONTENT_SCREEN);
+      } else {
+        encoder->Control(VP9E_SET_TUNE_CONTENT, VP9E_CONTENT_DEFAULT);
+      }
+      encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000);
+      encoder->Control(VP9E_SET_RTC_EXTERNAL_RATECTRL, 1);
+    }
+    frame_params_.frame_type = video->frame() % key_interval_ == 0
+                                   ? libvpx::RcFrameType::kKeyFrame
+                                   : libvpx::RcFrameType::kInterFrame;
+    if (rc_cfg_.rc_mode == VPX_CBR &&
+        frame_params_.frame_type == libvpx::RcFrameType::kInterFrame) {
+      // Disable golden frame update.
+      frame_flags_ |= VP8_EFLAG_NO_UPD_GF;
+      frame_flags_ |= VP8_EFLAG_NO_UPD_ARF;
+    }
+    encoder_exit_ = video->frame() == kNumFrames;
+  }
+
+  void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override {
+    if (encoder_exit_) {
+      return;
+    }
+    int loopfilter_level, qp;
+    encoder->Control(VP9E_GET_LOOPFILTER_LEVEL, &loopfilter_level);
+    encoder->Control(VP8E_GET_LAST_QUANTIZER, &qp);
+    if (rc_api_->ComputeQP(frame_params_) == libvpx::FrameDropDecision::kOk) {
+      ASSERT_EQ(rc_api_->GetQP(), qp);
+      ASSERT_EQ(rc_api_->GetLoopfilterLevel(), loopfilter_level);
+    } else {
+      num_drops_++;
+    }
+  }
+
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
+    rc_api_->PostEncodeUpdate(pkt->data.frame.sz, frame_params_);
+  }
+
+  void RunOneLayer() {
+    SetConfig(GET_PARAM(2));
+    rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_);
+    frame_params_.spatial_layer_id = 0;
+    frame_params_.temporal_layer_id = 0;
+
+    ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv",
+                                         1280, 720, 30, 1, 0, kNumFrames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  void RunOneLayerScreen() {
+    SetConfig(GET_PARAM(2));
+    rc_cfg_.is_screen = true;
+    rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_);
+    frame_params_.spatial_layer_id = 0;
+    frame_params_.temporal_layer_id = 0;
+
+    ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv",
+                                         1280, 720, 30, 1, 0, kNumFrames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  void RunOneLayerDropFramesCBR() {
+    if (GET_PARAM(2) != VPX_CBR) {
+      GTEST_SKIP() << "Frame dropping is only for CBR mode.";
+    }
+    frame_drop_thresh_ = 30;
+    SetConfig(GET_PARAM(2));
+    // Use lower bitrate, lower max-q, and enable frame dropper.
+    rc_cfg_.target_bandwidth = 200;
+    cfg_.rc_target_bitrate = 200;
+    rc_cfg_.max_quantizer = 50;
+    cfg_.rc_max_quantizer = 50;
+    rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_);
+    frame_params_.spatial_layer_id = 0;
+    frame_params_.temporal_layer_id = 0;
+
+    ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv",
+                                         1280, 720, 30, 1, 0, kNumFrames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    // Check that some frames were dropped, otherwise test has no value.
+    ASSERT_GE(num_drops_, 1);
+  }
+
+  void RunOneLayerVBRPeriodicKey() {
+    if (GET_PARAM(2) != VPX_VBR) return;
+    key_interval_ = 100;
+    SetConfig(VPX_VBR);
+    rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_);
+    frame_params_.spatial_layer_id = 0;
+    frame_params_.temporal_layer_id = 0;
+
+    ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv",
+                                         1280, 720, 30, 1, 0, kNumFrames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+ private:
+  void SetConfig(vpx_rc_mode rc_mode) {
+    rc_cfg_.width = 1280;
+    rc_cfg_.height = 720;
+    rc_cfg_.max_quantizer = 52;
+    rc_cfg_.min_quantizer = 2;
+    rc_cfg_.target_bandwidth = 1000;
+    rc_cfg_.buf_initial_sz = 600;
+    rc_cfg_.buf_optimal_sz = 600;
+    rc_cfg_.buf_sz = 1000;
+    rc_cfg_.undershoot_pct = 50;
+    rc_cfg_.overshoot_pct = 50;
+    rc_cfg_.max_intra_bitrate_pct = 1000;
+    rc_cfg_.framerate = 30.0;
+    rc_cfg_.ss_number_layers = 1;
+    rc_cfg_.ts_number_layers = 1;
+    rc_cfg_.scaling_factor_num[0] = 1;
+    rc_cfg_.scaling_factor_den[0] = 1;
+    rc_cfg_.layer_target_bitrate[0] = 1000;
+    rc_cfg_.max_quantizers[0] = 52;
+    rc_cfg_.min_quantizers[0] = 2;
+    rc_cfg_.rc_mode = rc_mode;
+    rc_cfg_.aq_mode = aq_mode_;
+    rc_cfg_.frame_drop_thresh = frame_drop_thresh_;
+
+    // Encoder settings for ground truth.
+    cfg_.g_w = 1280;
+    cfg_.g_h = 720;
+    cfg_.rc_undershoot_pct = 50;
+    cfg_.rc_overshoot_pct = 50;
+    cfg_.rc_buf_initial_sz = 600;
+    cfg_.rc_buf_optimal_sz = 600;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 2;
+    cfg_.rc_max_quantizer = 52;
+    cfg_.rc_end_usage = rc_mode;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 0;
+    cfg_.rc_target_bitrate = 1000;
+    cfg_.kf_min_dist = key_interval_;
+    cfg_.kf_max_dist = key_interval_;
+    cfg_.rc_dropframe_thresh = frame_drop_thresh_;
+  }
+
+  std::unique_ptr<libvpx::VP9RateControlRTC> rc_api_;
+  libvpx::VP9RateControlRtcConfig rc_cfg_;
+  int aq_mode_;
+  int key_interval_;
+  libvpx::VP9FrameParamsQpRTC frame_params_;
+  bool encoder_exit_;
+  int frame_drop_thresh_;
+  int num_drops_;
+};
+
+class RcInterfaceSvcTest
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<int, bool> {
+ public:
+  RcInterfaceSvcTest()
+      : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)), key_interval_(3000),
+        dynamic_spatial_layers_(0), inter_layer_pred_off_(GET_PARAM(2)),
+        parallel_spatial_layers_(false), frame_drop_thresh_(0),
+        max_consec_drop_(INT_MAX), num_drops_(0) {}
+  ~RcInterfaceSvcTest() override = default;
+
+ protected:
+  void SetUp() override {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+  }
+
+  void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      current_superframe_ = 0;
+      encoder->Control(VP8E_SET_CPUUSED, 7);
+      encoder->Control(VP9E_SET_AQ_MODE, aq_mode_);
+      encoder->Control(VP9E_SET_TUNE_CONTENT, 0);
+      encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 900);
+      encoder->Control(VP9E_SET_RTC_EXTERNAL_RATECTRL, 1);
+      encoder->Control(VP9E_SET_SVC, 1);
+      encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_);
+      if (inter_layer_pred_off_) {
+        encoder->Control(VP9E_SET_SVC_INTER_LAYER_PRED,
+                         INTER_LAYER_PRED_OFF_NONKEY);
+      }
+      if (frame_drop_thresh_ > 0) {
+        vpx_svc_frame_drop_t svc_drop_frame;
+        svc_drop_frame.framedrop_mode = FULL_SUPERFRAME_DROP;
+        for (int sl = 0; sl < rc_cfg_.ss_number_layers; ++sl)
+          svc_drop_frame.framedrop_thresh[sl] = frame_drop_thresh_;
+        svc_drop_frame.max_consec_drop = max_consec_drop_;
+        encoder->Control(VP9E_SET_SVC_FRAME_DROP_LAYER, &svc_drop_frame);
+      }
+    }
+    frame_params_.frame_type = video->frame() % key_interval_ == 0
+                                   ? libvpx::RcFrameType::kKeyFrame
+                                   : libvpx::RcFrameType::kInterFrame;
+    encoder_exit_ = video->frame() == kNumFrames;
+    if (dynamic_spatial_layers_ == 1) {
+      if (video->frame() == 100) {
+        // Go down to 2 spatial layers: set top SL to 0 bitrate.
+        // Update the encoder config.
+        cfg_.rc_target_bitrate -= cfg_.layer_target_bitrate[8];
+        cfg_.layer_target_bitrate[6] = 0;
+        cfg_.layer_target_bitrate[7] = 0;
+        cfg_.layer_target_bitrate[8] = 0;
+        encoder->Config(&cfg_);
+        // Update the RC config.
+        rc_cfg_.target_bandwidth -= rc_cfg_.layer_target_bitrate[8];
+        rc_cfg_.layer_target_bitrate[6] = 0;
+        rc_cfg_.layer_target_bitrate[7] = 0;
+        rc_cfg_.layer_target_bitrate[8] = 0;
+        ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_));
+      } else if (video->frame() == 200) {
+        // Go down to 1 spatial layer.
+        // Update the encoder config.
+        cfg_.rc_target_bitrate -= cfg_.layer_target_bitrate[5];
+        cfg_.layer_target_bitrate[3] = 0;
+        cfg_.layer_target_bitrate[4] = 0;
+        cfg_.layer_target_bitrate[5] = 0;
+        encoder->Config(&cfg_);
+        // Update the RC config.
+        rc_cfg_.target_bandwidth -= rc_cfg_.layer_target_bitrate[5];
+        rc_cfg_.layer_target_bitrate[3] = 0;
+        rc_cfg_.layer_target_bitrate[4] = 0;
+        rc_cfg_.layer_target_bitrate[5] = 0;
+        ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_));
+      } else if (/*DISABLES CODE*/ (false) && video->frame() == 280) {
+        // TODO(marpan): Re-enable this going back up when issue is fixed.
+        // Go back up to 3 spatial layers.
+        // Update the encoder config: use the original bitrates.
+        SetEncoderConfigSvc(3, 3);
+        encoder->Config(&cfg_);
+        // Update the RC config.
+        SetRCConfigSvc(3, 3);
+        ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_));
+      }
+    }
+  }
+
+  virtual void SetFrameParamsSvc(int sl) {
+    frame_params_.spatial_layer_id = sl;
+    if (rc_cfg_.ts_number_layers == 3)
+      frame_params_.temporal_layer_id =
+          kTemporalId3Layer[current_superframe_ % 4];
+    else if (rc_cfg_.ts_number_layers == 2)
+      frame_params_.temporal_layer_id =
+          kTemporalId2Layer[current_superframe_ % 2];
+    else
+      frame_params_.temporal_layer_id = 0;
+    frame_params_.frame_type =
+        current_superframe_ % key_interval_ == 0 && sl == 0
+            ? libvpx::RcFrameType::kKeyFrame
+            : libvpx::RcFrameType::kInterFrame;
+  }
+
+  void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override {
+    if (encoder_exit_) {
+      return;
+    }
+    int superframe_is_dropped = false;
+    ::libvpx_test::CxDataIterator iter = encoder->GetCxData();
+    for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) sizes_[sl] = 0;
+    std::vector<int> rc_qp;
+    // For FULL_SUPERFRAME_DROP: the full superframe drop decision is
+    // determined on the base spatial layer.
+    SetFrameParamsSvc(0);
+    if (rc_api_->ComputeQP(frame_params_) == libvpx::FrameDropDecision::kDrop) {
+      superframe_is_dropped = true;
+      num_drops_++;
+    }
+    while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) {
+      ASSERT_EQ(superframe_is_dropped, false);
+      ParseSuperframeSizes(static_cast<const uint8_t *>(pkt->data.frame.buf),
+                           pkt->data.frame.sz);
+      if (!parallel_spatial_layers_ || current_superframe_ == 0) {
+        for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) {
+          if (sizes_[sl] > 0) {
+            SetFrameParamsSvc(sl);
+            // For sl=0 ComputeQP() is already called above (line 310).
+            if (sl > 0) rc_api_->ComputeQP(frame_params_);
+            rc_api_->PostEncodeUpdate(sizes_[sl], frame_params_);
+            rc_qp.push_back(rc_api_->GetQP());
+          }
+        }
+      } else {
+        for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) {
+          // For sl=0 ComputeQP() is already called above (line 310).
+          if (sizes_[sl] > 0 && sl > 0) {
+            SetFrameParamsSvc(sl);
+            rc_api_->ComputeQP(frame_params_);
+          }
+        }
+        for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) {
+          if (sizes_[sl] > 0) {
+            SetFrameParamsSvc(sl);
+            rc_api_->PostEncodeUpdate(sizes_[sl], frame_params_);
+            rc_qp.push_back(rc_api_->GetQP());
+          }
+        }
+      }
+    }
+    if (!superframe_is_dropped) {
+      int loopfilter_level;
+      std::vector<int> encoder_qp(VPX_SS_MAX_LAYERS, 0);
+      encoder->Control(VP9E_GET_LOOPFILTER_LEVEL, &loopfilter_level);
+      encoder->Control(VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, encoder_qp.data());
+      encoder_qp.resize(rc_qp.size());
+      ASSERT_EQ(rc_qp, encoder_qp);
+      ASSERT_EQ(rc_api_->GetLoopfilterLevel(), loopfilter_level);
+      current_superframe_++;
+    }
+  }
+  // This method needs to be overridden because non-reference frames are
+  // expected to be mismatched frames as the encoder will avoid loopfilter on
+  // these frames.
+  void MismatchHook(const vpx_image_t * /*img1*/,
+                    const vpx_image_t * /*img2*/) override {}
+
+  void RunSvc() {
+    SetRCConfigSvc(3, 3);
+    rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_);
+    SetEncoderConfigSvc(3, 3);
+
+    ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv",
+                                         1280, 720, 30, 1, 0, kNumFrames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  void RunSvcDropFramesCBR() {
+    max_consec_drop_ = 10;
+    frame_drop_thresh_ = 30;
+    SetRCConfigSvc(3, 3);
+    rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_);
+    SetEncoderConfigSvc(3, 3);
+
+    ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv",
+                                         1280, 720, 30, 1, 0, kNumFrames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    // Check that some frames were dropped, otherwise test has no value.
+    ASSERT_GE(num_drops_, 1);
+  }
+
+  void RunSvcPeriodicKey() {
+    SetRCConfigSvc(3, 3);
+    key_interval_ = 100;
+    rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_);
+    SetEncoderConfigSvc(3, 3);
+
+    ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv",
+                                         1280, 720, 30, 1, 0, kNumFrames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  void RunSvcDynamicSpatial() {
+    dynamic_spatial_layers_ = 1;
+    SetRCConfigSvc(3, 3);
+    rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_);
+    SetEncoderConfigSvc(3, 3);
+
+    ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv",
+                                         1280, 720, 30, 1, 0, kNumFrames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  void RunSvcParallelSpatialLayers() {
+    if (!inter_layer_pred_off_) return;
+    parallel_spatial_layers_ = true;
+    SetRCConfigSvc(3, 3);
+    rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_);
+    SetEncoderConfigSvc(3, 3);
+
+    ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv",
+                                         1280, 720, 30, 1, 0, kNumFrames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+ private:
+  vpx_codec_err_t ParseSuperframeSizes(const uint8_t *data, size_t data_sz) {
+    uint8_t marker = *(data + data_sz - 1);
+    if ((marker & 0xe0) == 0xc0) {
+      const uint32_t frames = (marker & 0x7) + 1;
+      const uint32_t mag = ((marker >> 3) & 0x3) + 1;
+      const size_t index_sz = 2 + mag * frames;
+      // This chunk is marked as having a superframe index but doesn't have
+      // enough data for it, thus it's an invalid superframe index.
+      if (data_sz < index_sz) return VPX_CODEC_CORRUPT_FRAME;
+      {
+        const uint8_t marker2 = *(data + data_sz - index_sz);
+        // This chunk is marked as having a superframe index but doesn't have
+        // the matching marker byte at the front of the index therefore it's an
+        // invalid chunk.
+        if (marker != marker2) return VPX_CODEC_CORRUPT_FRAME;
+      }
+      const uint8_t *x = &data[data_sz - index_sz + 1];
+      for (uint32_t i = 0; i < frames; ++i) {
+        uint32_t this_sz = 0;
+
+        for (uint32_t j = 0; j < mag; ++j) this_sz |= (*x++) << (j * 8);
+        sizes_[i] = this_sz;
+      }
+    }
+    return VPX_CODEC_OK;
+  }
+
+  void SetEncoderConfigSvc(int number_spatial_layers,
+                           int number_temporal_layers) {
+    cfg_.g_w = 1280;
+    cfg_.g_h = 720;
+    cfg_.ss_number_layers = number_spatial_layers;
+    cfg_.ts_number_layers = number_temporal_layers;
+    cfg_.g_timebase.num = 1;
+    cfg_.g_timebase.den = 30;
+    if (number_spatial_layers == 3) {
+      svc_params_.scaling_factor_num[0] = 1;
+      svc_params_.scaling_factor_den[0] = 4;
+      svc_params_.scaling_factor_num[1] = 2;
+      svc_params_.scaling_factor_den[1] = 4;
+      svc_params_.scaling_factor_num[2] = 4;
+      svc_params_.scaling_factor_den[2] = 4;
+    } else if (number_spatial_layers == 2) {
+      svc_params_.scaling_factor_num[0] = 1;
+      svc_params_.scaling_factor_den[0] = 2;
+      svc_params_.scaling_factor_num[1] = 2;
+      svc_params_.scaling_factor_den[1] = 2;
+    } else if (number_spatial_layers == 1) {
+      svc_params_.scaling_factor_num[0] = 1;
+      svc_params_.scaling_factor_den[0] = 1;
+    }
+
+    for (int i = 0; i < VPX_MAX_LAYERS; ++i) {
+      svc_params_.max_quantizers[i] = 56;
+      svc_params_.min_quantizers[i] = 2;
+      svc_params_.speed_per_layer[i] = 7;
+      svc_params_.loopfilter_ctrl[i] = LOOPFILTER_ALL;
+    }
+    cfg_.rc_end_usage = VPX_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 0;
+
+    if (number_temporal_layers == 3) {
+      cfg_.ts_rate_decimator[0] = 4;
+      cfg_.ts_rate_decimator[1] = 2;
+      cfg_.ts_rate_decimator[2] = 1;
+      cfg_.temporal_layering_mode = 3;
+    } else if (number_temporal_layers == 2) {
+      cfg_.ts_rate_decimator[0] = 2;
+      cfg_.ts_rate_decimator[1] = 1;
+      cfg_.temporal_layering_mode = 2;
+    } else if (number_temporal_layers == 1) {
+      cfg_.ts_rate_decimator[0] = 1;
+      cfg_.temporal_layering_mode = 0;
+    }
+
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 600;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_min_quantizer = 2;
+    cfg_.rc_max_quantizer = 56;
+    cfg_.g_threads = 1;
+    cfg_.kf_max_dist = 9999;
+    cfg_.rc_overshoot_pct = 50;
+    cfg_.rc_undershoot_pct = 50;
+    cfg_.rc_dropframe_thresh = frame_drop_thresh_;
+
+    cfg_.rc_target_bitrate = 0;
+    for (int sl = 0; sl < number_spatial_layers; sl++) {
+      int spatial_bitrate = 0;
+      if (number_spatial_layers <= 3)
+        spatial_bitrate = frame_drop_thresh_ > 0 ? kSpatialLayerBitrateLow[sl]
+                                                 : kSpatialLayerBitrate[sl];
+      for (int tl = 0; tl < number_temporal_layers; tl++) {
+        int layer = sl * number_temporal_layers + tl;
+        if (number_temporal_layers == 3)
+          cfg_.layer_target_bitrate[layer] =
+              kTemporalRateAllocation3Layer[tl] * spatial_bitrate / 100;
+        else if (number_temporal_layers == 2)
+          cfg_.layer_target_bitrate[layer] =
+              kTemporalRateAllocation2Layer[tl] * spatial_bitrate / 100;
+        else if (number_temporal_layers == 1)
+          cfg_.layer_target_bitrate[layer] = spatial_bitrate;
+      }
+      cfg_.rc_target_bitrate += spatial_bitrate;
+    }
+
+    cfg_.kf_min_dist = key_interval_;
+    cfg_.kf_max_dist = key_interval_;
+  }
+
+  void SetRCConfigSvc(int number_spatial_layers, int number_temporal_layers) {
+    rc_cfg_.width = 1280;
+    rc_cfg_.height = 720;
+    rc_cfg_.ss_number_layers = number_spatial_layers;
+    rc_cfg_.ts_number_layers = number_temporal_layers;
+    rc_cfg_.max_quantizer = 56;
+    rc_cfg_.min_quantizer = 2;
+    rc_cfg_.buf_initial_sz = 500;
+    rc_cfg_.buf_optimal_sz = 600;
+    rc_cfg_.buf_sz = 1000;
+    rc_cfg_.undershoot_pct = 50;
+    rc_cfg_.overshoot_pct = 50;
+    rc_cfg_.max_intra_bitrate_pct = 900;
+    rc_cfg_.framerate = 30.0;
+    rc_cfg_.rc_mode = VPX_CBR;
+    rc_cfg_.aq_mode = aq_mode_;
+    rc_cfg_.frame_drop_thresh = frame_drop_thresh_;
+    rc_cfg_.max_consec_drop = max_consec_drop_;
+
+    if (number_spatial_layers == 3) {
+      rc_cfg_.scaling_factor_num[0] = 1;
+      rc_cfg_.scaling_factor_den[0] = 4;
+      rc_cfg_.scaling_factor_num[1] = 2;
+      rc_cfg_.scaling_factor_den[1] = 4;
+      rc_cfg_.scaling_factor_num[2] = 4;
+      rc_cfg_.scaling_factor_den[2] = 4;
+    } else if (number_spatial_layers == 2) {
+      rc_cfg_.scaling_factor_num[0] = 1;
+      rc_cfg_.scaling_factor_den[0] = 2;
+      rc_cfg_.scaling_factor_num[1] = 2;
+      rc_cfg_.scaling_factor_den[1] = 2;
+    } else if (number_spatial_layers == 1) {
+      rc_cfg_.scaling_factor_num[0] = 1;
+      rc_cfg_.scaling_factor_den[0] = 1;
+    }
+
+    if (number_temporal_layers == 3) {
+      rc_cfg_.ts_rate_decimator[0] = 4;
+      rc_cfg_.ts_rate_decimator[1] = 2;
+      rc_cfg_.ts_rate_decimator[2] = 1;
+    } else if (number_temporal_layers == 2) {
+      rc_cfg_.ts_rate_decimator[0] = 2;
+      rc_cfg_.ts_rate_decimator[1] = 1;
+    } else if (number_temporal_layers == 1) {
+      rc_cfg_.ts_rate_decimator[0] = 1;
+    }
+
+    rc_cfg_.target_bandwidth = 0;
+    for (int sl = 0; sl < number_spatial_layers; sl++) {
+      int spatial_bitrate = 0;
+      if (number_spatial_layers <= 3)
+        spatial_bitrate = frame_drop_thresh_ > 0 ? kSpatialLayerBitrateLow[sl]
+                                                 : kSpatialLayerBitrate[sl];
+      for (int tl = 0; tl < number_temporal_layers; tl++) {
+        int layer = sl * number_temporal_layers + tl;
+        if (number_temporal_layers == 3)
+          rc_cfg_.layer_target_bitrate[layer] =
+              kTemporalRateAllocation3Layer[tl] * spatial_bitrate / 100;
+        else if (number_temporal_layers == 2)
+          rc_cfg_.layer_target_bitrate[layer] =
+              kTemporalRateAllocation2Layer[tl] * spatial_bitrate / 100;
+        else if (number_temporal_layers == 1)
+          rc_cfg_.layer_target_bitrate[layer] = spatial_bitrate;
+      }
+      rc_cfg_.target_bandwidth += spatial_bitrate;
+    }
+
+    for (int sl = 0; sl < rc_cfg_.ss_number_layers; ++sl) {
+      for (int tl = 0; tl < rc_cfg_.ts_number_layers; ++tl) {
+        const int i = sl * rc_cfg_.ts_number_layers + tl;
+        rc_cfg_.max_quantizers[i] = 56;
+        rc_cfg_.min_quantizers[i] = 2;
+      }
+    }
+  }
+
+  int aq_mode_;
+  std::unique_ptr<libvpx::VP9RateControlRTC> rc_api_;
+  libvpx::VP9RateControlRtcConfig rc_cfg_;
+  vpx_svc_extra_cfg_t svc_params_;
+  libvpx::VP9FrameParamsQpRTC frame_params_;
+  bool encoder_exit_;
+  int current_superframe_;
+  uint32_t sizes_[8];
+  int key_interval_;
+  int dynamic_spatial_layers_;
+  bool inter_layer_pred_off_;
+  // ComputeQP() and PostEncodeUpdate() don't need to be sequential for KSVC.
+  bool parallel_spatial_layers_;
+  int frame_drop_thresh_;
+  int max_consec_drop_;
+  int num_drops_;
+};
+
+TEST_P(RcInterfaceTest, OneLayer) { RunOneLayer(); }
+
+TEST_P(RcInterfaceTest, OneLayerDropFramesCBR) { RunOneLayerDropFramesCBR(); }
+
+TEST_P(RcInterfaceTest, OneLayerScreen) { RunOneLayerScreen(); }
+
+TEST_P(RcInterfaceTest, OneLayerVBRPeriodicKey) { RunOneLayerVBRPeriodicKey(); }
+
+TEST_P(RcInterfaceSvcTest, Svc) { RunSvc(); }
+
+TEST_P(RcInterfaceSvcTest, SvcDropFramesCBR) { RunSvcDropFramesCBR(); }
+
+TEST_P(RcInterfaceSvcTest, SvcParallelSpatialLayers) {
+  RunSvcParallelSpatialLayers();
+}
+
+TEST_P(RcInterfaceSvcTest, SvcPeriodicKey) { RunSvcPeriodicKey(); }
+
+TEST_P(RcInterfaceSvcTest, SvcDynamicSpatial) { RunSvcDynamicSpatial(); }
+
+VP9_INSTANTIATE_TEST_SUITE(RcInterfaceTest, ::testing::Values(0, 3),
+                           ::testing::Values(VPX_CBR, VPX_VBR));
+VP9_INSTANTIATE_TEST_SUITE(RcInterfaceSvcTest, ::testing::Values(0, 3),
+                           ::testing::Values(true, false));
+}  // namespace
diff --git a/test/vp9_roi_test.cc b/test/vp9_roi_test.cc
new file mode 100644
index 0000000000..a9347fb365
--- /dev/null
+++ b/test/vp9_roi_test.cc
@@ -0,0 +1,148 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/video_source.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+#include "vpx/vp8cx.h"
+#include "vpx/vpx_encoder.h"
+
+#define MASK_WIDTH 40
+#define MASK_HEIGHT 30
+#define MASK_SIZE MASK_WIDTH *MASK_HEIGHT
+
+namespace {
+
+const int mask[MASK_SIZE] = {
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
+  1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0
+};
+
+class RoiMaskBackgroundSkip : public ::libvpx_test::EncoderTest,
+                              public ::testing::Test {
+ protected:
+  RoiMaskBackgroundSkip() : EncoderTest(&::libvpx_test::kVP9) {}
+  ~RoiMaskBackgroundSkip() override { free(roi_.roi_map); }
+
+  void SetUp() override {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    SetRoi();
+  }
+
+  void SetRoi() {
+    const int block_size = 8;
+    unsigned int i, j;
+    roi_.rows = (cfg_.g_h + block_size - 1) / block_size;
+    roi_.cols = (cfg_.g_w + block_size - 1) / block_size;
+    memset(&roi_.skip, 0, sizeof(roi_.skip));
+    memset(&roi_.delta_q, 0, sizeof(roi_.delta_q));
+    memset(&roi_.delta_lf, 0, sizeof(roi_.delta_lf));
+    memset(roi_.ref_frame, -1, sizeof(roi_.ref_frame));
+    roi_.ref_frame[1] = 1;
+    // Use segment 3 for skip.
+    roi_.skip[3] = 1;
+    roi_.roi_map =
+        (uint8_t *)calloc(roi_.rows * roi_.cols, sizeof(*roi_.roi_map));
+    for (i = 0; i < roi_.rows; ++i) {
+      for (j = 0; j < roi_.cols; ++j) {
+        const int idx = i * roi_.cols + j;
+        if (mask[idx] == 1) roi_.roi_map[idx] = 3;
+      }
+    }
+  }
+
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      encoder->Control(VP8E_SET_CPUUSED, 7);
+      encoder->Control(VP9E_SET_AQ_MODE, 3);
+    }
+    encoder->Control(VP9E_SET_ROI_MAP, &roi_);
+  }
+
+ private:
+  vpx_roi_map_t roi_;
+};
+
+TEST_F(RoiMaskBackgroundSkip, RoiMaskNoMismatch) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_undershoot_pct = 20;
+  cfg_.rc_undershoot_pct = 20;
+  cfg_.rc_dropframe_thresh = 10;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 50;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.rc_target_bitrate = 200;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.kf_max_dist = 9999;
+
+  ::libvpx_test::I420VideoSource video("desktopqvga.320_240.yuv", 320, 240, 30,
+                                       1, 0, 150);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+}  // namespace
diff --git a/test/vp9_scale_test.cc b/test/vp9_scale_test.cc
index 2d1203fb89..049a10a617 100644
--- a/test/vp9_scale_test.cc
+++ b/test/vp9_scale_test.cc
@@ -33,10 +33,10 @@ typedef void (*ScaleFrameFunc)(const YV12_BUFFER_CONFIG *src,
 class ScaleTest : public VpxScaleBase,
                   public ::testing::TestWithParam<ScaleFrameFunc> {
  public:
-  virtual ~ScaleTest() {}
+  ~ScaleTest() override = default;
 
  protected:
-  virtual void SetUp() { scale_fn_ = GetParam(); }
+  void SetUp() override { scale_fn_ = GetParam(); }
 
   void ReferenceScaleFrame(INTERP_FILTER filter_type, int phase_scaler) {
     vp9_scale_and_extend_frame_c(&img_, &ref_img_, filter_type, phase_scaler);
diff --git a/test/vp9_skip_loopfilter_test.cc b/test/vp9_skip_loopfilter_test.cc
index 1fb7e48485..c080a2caae 100644
--- a/test/vp9_skip_loopfilter_test.cc
+++ b/test/vp9_skip_loopfilter_test.cc
@@ -24,10 +24,11 @@ const char kVp9Md5File[] = "vp90-2-08-tile_1x8_frame_parallel.webm.md5";
 // Class for testing shutting off the loop filter.
 class SkipLoopFilterTest {
  public:
-  SkipLoopFilterTest() : video_(NULL), decoder_(NULL), md5_file_(NULL) {}
+  SkipLoopFilterTest()
+      : video_(nullptr), decoder_(nullptr), md5_file_(nullptr) {}
 
   ~SkipLoopFilterTest() {
-    if (md5_file_ != NULL) fclose(md5_file_);
+    if (md5_file_ != nullptr) fclose(md5_file_);
     delete decoder_;
     delete video_;
   }
@@ -37,8 +38,8 @@ class SkipLoopFilterTest {
     expected_md5_[0] = '\0';
     junk_[0] = '\0';
     video_ = new libvpx_test::WebMVideoSource(kVp9TestFile);
-    if (video_ == NULL) {
-      EXPECT_TRUE(video_ != NULL);
+    if (video_ == nullptr) {
+      EXPECT_NE(video_, nullptr);
       return false;
     }
     video_->Init();
@@ -47,8 +48,8 @@ class SkipLoopFilterTest {
     vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
     if (num_threads > 0) cfg.threads = num_threads;
     decoder_ = new libvpx_test::VP9Decoder(cfg, 0);
-    if (decoder_ == NULL) {
-      EXPECT_TRUE(decoder_ != NULL);
+    if (decoder_ == nullptr) {
+      EXPECT_NE(decoder_, nullptr);
       return false;
     }
 
@@ -73,7 +74,7 @@ class SkipLoopFilterTest {
   }
 
   vpx_codec_err_t DecodeRemainingFrames() {
-    for (; video_->cxdata() != NULL; video_->Next()) {
+    for (; video_->cxdata() != nullptr; video_->Next()) {
       const vpx_codec_err_t res =
           decoder_->DecodeFrame(video_->cxdata(), video_->frame_size());
       if (res != VPX_CODEC_OK) return res;
@@ -93,13 +94,13 @@ class SkipLoopFilterTest {
   // TODO(fgalligan): Move the MD5 testing code into another class.
   void OpenMd5File(const std::string &md5_file_name) {
     md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name);
-    ASSERT_TRUE(md5_file_ != NULL)
+    ASSERT_NE(md5_file_, nullptr)
         << "MD5 file open failed. Filename: " << md5_file_name;
   }
 
   // Reads the next line of the MD5 file.
   void ReadMd5() {
-    ASSERT_TRUE(md5_file_ != NULL);
+    ASSERT_NE(md5_file_, nullptr);
     const int res = fscanf(md5_file_, "%s  %s", expected_md5_, junk_);
     ASSERT_NE(EOF, res) << "Read md5 data failed";
     expected_md5_[32] = '\0';
diff --git a/test/vp9_subtract_test.cc b/test/vp9_subtract_test.cc
index ef8cc207d6..78deb51909 100644
--- a/test/vp9_subtract_test.cc
+++ b/test/vp9_subtract_test.cc
@@ -7,6 +7,7 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include <tuple>
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
@@ -17,9 +18,11 @@
 #include "test/bench.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
+#include "test/util.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vpx_ports/msvc.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/vpx_timer.h"
 
 typedef void (*SubtractFunc)(int rows, int cols, int16_t *diff_ptr,
                              ptrdiff_t diff_stride, const uint8_t *src_ptr,
@@ -31,10 +34,10 @@ namespace vp9 {
 class VP9SubtractBlockTest : public AbstractBench,
                              public ::testing::TestWithParam<SubtractFunc> {
  public:
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
-  virtual void Run() {
+  void Run() override {
     GetParam()(block_height_, block_width_, diff_, block_width_, src_,
                block_width_, pred_, block_width_);
   }
@@ -133,6 +136,10 @@ INSTANTIATE_TEST_SUITE_P(C, VP9SubtractBlockTest,
 INSTANTIATE_TEST_SUITE_P(SSE2, VP9SubtractBlockTest,
                          ::testing::Values(vpx_subtract_block_sse2));
 #endif
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, VP9SubtractBlockTest,
+                         ::testing::Values(vpx_subtract_block_avx2));
+#endif
 #if HAVE_NEON
 INSTANTIATE_TEST_SUITE_P(NEON, VP9SubtractBlockTest,
                          ::testing::Values(vpx_subtract_block_neon));
@@ -152,4 +159,163 @@ INSTANTIATE_TEST_SUITE_P(VSX, VP9SubtractBlockTest,
                          ::testing::Values(vpx_subtract_block_vsx));
 #endif
 
+#if HAVE_LSX
+INSTANTIATE_TEST_SUITE_P(LSX, VP9SubtractBlockTest,
+                         ::testing::Values(vpx_subtract_block_lsx));
+#endif
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+typedef void (*HBDSubtractFunc)(int rows, int cols, int16_t *diff_ptr,
+                                ptrdiff_t diff_stride, const uint8_t *src_ptr,
+                                ptrdiff_t src_stride, const uint8_t *pred_ptr,
+                                ptrdiff_t pred_stride, int bd);
+
+// <BLOCK_SIZE, bit_depth, optimized subtract func, reference subtract func>
+using Params = std::tuple<BLOCK_SIZE, int, HBDSubtractFunc, HBDSubtractFunc>;
+
+class VPXHBDSubtractBlockTest : public ::testing::TestWithParam<Params> {
+ public:
+  void SetUp() override {
+    block_width_ = 4 * num_4x4_blocks_wide_lookup[GET_PARAM(0)];
+    block_height_ = 4 * num_4x4_blocks_high_lookup[GET_PARAM(0)];
+    bit_depth_ = static_cast<vpx_bit_depth_t>(GET_PARAM(1));
+    func_ = GET_PARAM(2);
+    ref_func_ = GET_PARAM(3);
+
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+
+    constexpr size_t kMaxWidth = 128;
+    constexpr size_t kMaxBlockSize = kMaxWidth * kMaxWidth;
+    src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
+        vpx_memalign(16, kMaxBlockSize * sizeof(uint16_t))));
+    ASSERT_NE(src_, nullptr);
+    pred_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
+        vpx_memalign(16, kMaxBlockSize * sizeof(uint16_t))));
+    ASSERT_NE(pred_, nullptr);
+    diff_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, kMaxBlockSize * sizeof(int16_t)));
+    ASSERT_NE(diff_, nullptr);
+  }
+
+  void TearDown() override {
+    vpx_free(CONVERT_TO_SHORTPTR(src_));
+    vpx_free(CONVERT_TO_SHORTPTR(pred_));
+    vpx_free(diff_);
+  }
+
+ protected:
+  void CheckResult();
+  void RunForSpeed();
+
+ private:
+  ACMRandom rnd_;
+  int block_height_;
+  int block_width_;
+  vpx_bit_depth_t bit_depth_;
+  HBDSubtractFunc func_;
+  HBDSubtractFunc ref_func_;
+  uint8_t *src_;
+  uint8_t *pred_;
+  int16_t *diff_;
+};
+
+void VPXHBDSubtractBlockTest::CheckResult() {
+  constexpr int kTestNum = 100;
+  constexpr int kMaxWidth = 128;
+  constexpr int kMaxBlockSize = kMaxWidth * kMaxWidth;
+  const int mask = (1 << bit_depth_) - 1;
+  for (int i = 0; i < kTestNum; ++i) {
+    for (int j = 0; j < kMaxBlockSize; ++j) {
+      CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask;
+      CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask;
+    }
+
+    func_(block_height_, block_width_, diff_, block_width_, src_, block_width_,
+          pred_, block_width_, bit_depth_);
+
+    for (int r = 0; r < block_height_; ++r) {
+      for (int c = 0; c < block_width_; ++c) {
+        EXPECT_EQ(diff_[r * block_width_ + c],
+                  (CONVERT_TO_SHORTPTR(src_)[r * block_width_ + c] -
+                   CONVERT_TO_SHORTPTR(pred_)[r * block_width_ + c]))
+            << "r = " << r << ", c = " << c << ", test: " << i;
+      }
+    }
+  }
+}
+
+TEST_P(VPXHBDSubtractBlockTest, CheckResult) { CheckResult(); }
+
+void VPXHBDSubtractBlockTest::RunForSpeed() {
+  constexpr int kTestNum = 200000;
+  constexpr int kMaxWidth = 128;
+  constexpr int kMaxBlockSize = kMaxWidth * kMaxWidth;
+  const int mask = (1 << bit_depth_) - 1;
+
+  if (ref_func_ == func_) GTEST_SKIP();
+
+  for (int j = 0; j < kMaxBlockSize; ++j) {
+    CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask;
+    CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask;
+  }
+
+  vpx_usec_timer ref_timer;
+  vpx_usec_timer_start(&ref_timer);
+  for (int i = 0; i < kTestNum; ++i) {
+    ref_func_(block_height_, block_width_, diff_, block_width_, src_,
+              block_width_, pred_, block_width_, bit_depth_);
+  }
+  vpx_usec_timer_mark(&ref_timer);
+  const int64_t ref_elapsed_time = vpx_usec_timer_elapsed(&ref_timer);
+
+  for (int j = 0; j < kMaxBlockSize; ++j) {
+    CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask;
+    CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask;
+  }
+
+  vpx_usec_timer timer;
+  vpx_usec_timer_start(&timer);
+  for (int i = 0; i < kTestNum; ++i) {
+    func_(block_height_, block_width_, diff_, block_width_, src_, block_width_,
+          pred_, block_width_, bit_depth_);
+  }
+  vpx_usec_timer_mark(&timer);
+  const int64_t elapsed_time = vpx_usec_timer_elapsed(&timer);
+
+  printf(
+      "[%dx%d]: "
+      "ref_time=%6" PRId64 " \t simd_time=%6" PRId64
+      " \t "
+      "gain=%f \n",
+      block_width_, block_height_, ref_elapsed_time, elapsed_time,
+      static_cast<double>(ref_elapsed_time) /
+          static_cast<double>(elapsed_time));
+}
+
+TEST_P(VPXHBDSubtractBlockTest, DISABLED_Speed) { RunForSpeed(); }
+
+const BLOCK_SIZE kValidBlockSize[] = { BLOCK_4X4,   BLOCK_4X8,   BLOCK_8X4,
+                                       BLOCK_8X8,   BLOCK_8X16,  BLOCK_16X8,
+                                       BLOCK_16X16, BLOCK_16X32, BLOCK_32X16,
+                                       BLOCK_32X32, BLOCK_32X64, BLOCK_64X32,
+                                       BLOCK_64X64 };
+
+INSTANTIATE_TEST_SUITE_P(
+    C, VPXHBDSubtractBlockTest,
+    ::testing::Combine(::testing::ValuesIn(kValidBlockSize),
+                       ::testing::Values(12),
+                       ::testing::Values(&vpx_highbd_subtract_block_c),
+                       ::testing::Values(&vpx_highbd_subtract_block_c)));
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, VPXHBDSubtractBlockTest,
+    ::testing::Combine(::testing::ValuesIn(kValidBlockSize),
+                       ::testing::Values(12),
+                       ::testing::Values(&vpx_highbd_subtract_block_avx2),
+                       ::testing::Values(&vpx_highbd_subtract_block_c)));
+#endif  // HAVE_AVX2
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 }  // namespace vp9
diff --git a/test/vp9_thread_test.cc b/test/vp9_thread_test.cc
index 67f2bec800..c0cea681d7 100644
--- a/test/vp9_thread_test.cc
+++ b/test/vp9_thread_test.cc
@@ -26,10 +26,10 @@ using std::string;
 
 class VPxWorkerThreadTest : public ::testing::TestWithParam<bool> {
  protected:
-  virtual ~VPxWorkerThreadTest() {}
-  virtual void SetUp() { vpx_get_worker_interface()->init(&worker_); }
+  ~VPxWorkerThreadTest() override = default;
+  void SetUp() override { vpx_get_worker_interface()->init(&worker_); }
 
-  virtual void TearDown() { vpx_get_worker_interface()->end(&worker_); }
+  void TearDown() override { vpx_get_worker_interface()->end(&worker_); }
 
   void Run(VPxWorker *worker) {
     const bool synchronous = GetParam();
@@ -128,18 +128,18 @@ TEST_P(VPxWorkerThreadTest, EndWithoutSync) {
 }
 
 TEST(VPxWorkerThreadTest, TestInterfaceAPI) {
-  EXPECT_EQ(0, vpx_set_worker_interface(NULL));
-  EXPECT_TRUE(vpx_get_worker_interface() != NULL);
+  EXPECT_EQ(0, vpx_set_worker_interface(nullptr));
+  EXPECT_NE(vpx_get_worker_interface(), nullptr);
   for (int i = 0; i < 6; ++i) {
     VPxWorkerInterface winterface = *vpx_get_worker_interface();
     switch (i) {
       default:
-      case 0: winterface.init = NULL; break;
-      case 1: winterface.reset = NULL; break;
-      case 2: winterface.sync = NULL; break;
-      case 3: winterface.launch = NULL; break;
-      case 4: winterface.execute = NULL; break;
-      case 5: winterface.end = NULL; break;
+      case 0: winterface.init = nullptr; break;
+      case 1: winterface.reset = nullptr; break;
+      case 2: winterface.sync = nullptr; break;
+      case 3: winterface.launch = nullptr; break;
+      case 4: winterface.execute = nullptr; break;
+      case 5: winterface.end = nullptr; break;
     }
     EXPECT_EQ(0, vpx_set_worker_interface(&winterface));
   }
@@ -148,11 +148,6 @@ TEST(VPxWorkerThreadTest, TestInterfaceAPI) {
 // -----------------------------------------------------------------------------
 // Multi-threaded decode tests
 #if CONFIG_WEBM_IO
-struct FileList {
-  const char *name;
-  const char *expected_md5;
-};
-
 // Decodes |filename| with |num_threads|. Returns the md5 of the decoded frames.
 string DecodeFile(const string &filename, int num_threads) {
   libvpx_test::WebMVideoSource video(filename);
@@ -172,7 +167,7 @@ string DecodeFile(const string &filename, int num_threads) {
     }
 
     libvpx_test::DxDataIterator dec_iter = decoder.GetDxData();
-    const vpx_image_t *img = NULL;
+    const vpx_image_t *img = nullptr;
 
     // Get decompressed data
     while ((img = dec_iter.Next())) {
@@ -182,16 +177,6 @@ string DecodeFile(const string &filename, int num_threads) {
   return string(md5.Get());
 }
 
-void DecodeFiles(const FileList files[]) {
-  for (const FileList *iter = files; iter->name != NULL; ++iter) {
-    SCOPED_TRACE(iter->name);
-    for (int t = 1; t <= 8; ++t) {
-      EXPECT_EQ(iter->expected_md5, DecodeFile(iter->name, t))
-          << "threads = " << t;
-    }
-  }
-}
-
 // Trivial serialized thread worker interface implementation.
 // Note any worker that requires synchronization between other workers will
 // hang.
@@ -216,10 +201,6 @@ TEST(VPxWorkerThreadTest, TestSerialInterface) {
   static const VPxWorkerInterface serial_interface = {
     impl::Init, impl::Reset, impl::Sync, impl::Launch, impl::Execute, impl::End
   };
-  // TODO(jzern): Avoid using a file that will use the row-based thread
-  // loopfilter, with the simple serialized implementation it will hang. This is
-  // due to its expectation that rows will be run in parallel as they wait on
-  // progress in the row above before proceeding.
   static const char expected_md5[] = "b35a1b707b28e82be025d960aba039bc";
   static const char filename[] = "vp90-2-03-size-226x226.webm";
   VPxWorkerInterface default_interface = *vpx_get_worker_interface();
@@ -232,88 +213,81 @@ TEST(VPxWorkerThreadTest, TestSerialInterface) {
   EXPECT_EQ(expected_md5, DecodeFile(filename, 2));
 }
 
-TEST(VP9DecodeMultiThreadedTest, NoTilesNonFrameParallel) {
-  // no tiles or frame parallel; this exercises loop filter threading.
-  EXPECT_EQ("b35a1b707b28e82be025d960aba039bc",
-            DecodeFile("vp90-2-03-size-226x226.webm", 2));
-}
+struct FileParam {
+  const char *name;
+  const char *expected_md5;
+  friend std::ostream &operator<<(std::ostream &os, const FileParam &param) {
+    return os << "file name: " << param.name
+              << " digest: " << param.expected_md5;
+  }
+};
 
-TEST(VP9DecodeMultiThreadedTest, FrameParallel) {
-  static const FileList files[] = { { "vp90-2-08-tile_1x2_frame_parallel.webm",
-                                      "68ede6abd66bae0a2edf2eb9232241b6" },
-                                    { "vp90-2-08-tile_1x4_frame_parallel.webm",
-                                      "368ebc6ebf3a5e478d85b2c3149b2848" },
-                                    { "vp90-2-08-tile_1x8_frame_parallel.webm",
-                                      "17e439da2388aff3a0f69cb22579c6c1" },
-                                    { NULL, NULL } };
+class VP9DecodeMultiThreadedTest : public ::testing::TestWithParam<FileParam> {
+};
 
-  DecodeFiles(files);
+TEST_P(VP9DecodeMultiThreadedTest, Decode) {
+  for (int t = 1; t <= 8; ++t) {
+    EXPECT_EQ(GetParam().expected_md5, DecodeFile(GetParam().name, t))
+        << "threads = " << t;
+  }
 }
 
-TEST(VP9DecodeMultiThreadedTest, FrameParallelResize) {
-  static const FileList files[] = {
-    { "vp90-2-14-resize-fp-tiles-1-16.webm",
-      "0cd5e632c326297e975f38949c31ea94" },
-    { "vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm",
-      "5c78a96a42e7f4a4f6b2edcdb791e44c" },
-    { "vp90-2-14-resize-fp-tiles-1-2.webm",
-      "e030450ae85c3277be2a418769df98e2" },
-    { "vp90-2-14-resize-fp-tiles-1-4.webm",
-      "312eed4e2b64eb7a4e7f18916606a430" },
-    { "vp90-2-14-resize-fp-tiles-16-1.webm",
-      "1755c16d8af16a9cb3fe7338d90abe52" },
-    { "vp90-2-14-resize-fp-tiles-16-2.webm",
-      "500300592d3fcb6f12fab25e48aaf4df" },
-    { "vp90-2-14-resize-fp-tiles-16-4.webm",
-      "47c48379fa6331215d91c67648e1af6e" },
-    { "vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm",
-      "eecf17290739bc708506fa4827665989" },
-    { "vp90-2-14-resize-fp-tiles-16-8.webm",
-      "29b6bb54e4c26b5ca85d5de5fed94e76" },
-    { "vp90-2-14-resize-fp-tiles-1-8.webm",
-      "1b6f175e08cd82cf84bb800ac6d1caa3" },
-    { "vp90-2-14-resize-fp-tiles-2-16.webm",
-      "ca3b03e4197995d8d5444ede7a6c0804" },
-    { "vp90-2-14-resize-fp-tiles-2-1.webm",
-      "99aec065369d70bbb78ccdff65afed3f" },
-    { "vp90-2-14-resize-fp-tiles-2-4.webm",
-      "22d0ebdb49b87d2920a85aea32e1afd5" },
-    { "vp90-2-14-resize-fp-tiles-2-8.webm",
-      "c2115cf051c62e0f7db1d4a783831541" },
-    { "vp90-2-14-resize-fp-tiles-4-16.webm",
-      "c690d7e1719b31367564cac0af0939cb" },
-    { "vp90-2-14-resize-fp-tiles-4-1.webm",
-      "a926020b2cc3e15ad4cc271853a0ff26" },
-    { "vp90-2-14-resize-fp-tiles-4-2.webm",
-      "42699063d9e581f1993d0cf890c2be78" },
-    { "vp90-2-14-resize-fp-tiles-4-8.webm",
-      "7f76d96036382f45121e3d5aa6f8ec52" },
-    { "vp90-2-14-resize-fp-tiles-8-16.webm",
-      "76a43fcdd7e658542913ea43216ec55d" },
-    { "vp90-2-14-resize-fp-tiles-8-1.webm",
-      "8e3fbe89486ca60a59299dea9da91378" },
-    { "vp90-2-14-resize-fp-tiles-8-2.webm",
-      "ae96f21f21b6370cc0125621b441fc52" },
-    { "vp90-2-14-resize-fp-tiles-8-4.webm",
-      "3eb4f24f10640d42218f7fd7b9fd30d4" },
-    { NULL, NULL }
-  };
+const FileParam kNoTilesNonFrameParallelFiles[] = {
+  { "vp90-2-03-size-226x226.webm", "b35a1b707b28e82be025d960aba039bc" }
+};
 
-  DecodeFiles(files);
-}
+const FileParam kFrameParallelFiles[] = {
+  { "vp90-2-08-tile_1x2_frame_parallel.webm",
+    "68ede6abd66bae0a2edf2eb9232241b6" },
+  { "vp90-2-08-tile_1x4_frame_parallel.webm",
+    "368ebc6ebf3a5e478d85b2c3149b2848" },
+  { "vp90-2-08-tile_1x8_frame_parallel.webm",
+    "17e439da2388aff3a0f69cb22579c6c1" },
+};
 
-TEST(VP9DecodeMultiThreadedTest, NonFrameParallel) {
-  static const FileList files[] = {
-    { "vp90-2-08-tile_1x2.webm", "570b4a5d5a70d58b5359671668328a16" },
-    { "vp90-2-08-tile_1x4.webm", "988d86049e884c66909d2d163a09841a" },
-    { "vp90-2-08-tile_1x8.webm", "0941902a52e9092cb010905eab16364c" },
-    { "vp90-2-08-tile-4x1.webm", "06505aade6647c583c8e00a2f582266f" },
-    { "vp90-2-08-tile-4x4.webm", "85c2299892460d76e2c600502d52bfe2" },
-    { NULL, NULL }
-  };
+const FileParam kFrameParallelResizeFiles[] = {
+  { "vp90-2-14-resize-fp-tiles-1-16.webm", "0cd5e632c326297e975f38949c31ea94" },
+  { "vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm",
+    "5c78a96a42e7f4a4f6b2edcdb791e44c" },
+  { "vp90-2-14-resize-fp-tiles-1-2.webm", "e030450ae85c3277be2a418769df98e2" },
+  { "vp90-2-14-resize-fp-tiles-1-4.webm", "312eed4e2b64eb7a4e7f18916606a430" },
+  { "vp90-2-14-resize-fp-tiles-16-1.webm", "1755c16d8af16a9cb3fe7338d90abe52" },
+  { "vp90-2-14-resize-fp-tiles-16-2.webm", "500300592d3fcb6f12fab25e48aaf4df" },
+  { "vp90-2-14-resize-fp-tiles-16-4.webm", "47c48379fa6331215d91c67648e1af6e" },
+  { "vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm",
+    "eecf17290739bc708506fa4827665989" },
+  { "vp90-2-14-resize-fp-tiles-16-8.webm", "29b6bb54e4c26b5ca85d5de5fed94e76" },
+  { "vp90-2-14-resize-fp-tiles-1-8.webm", "1b6f175e08cd82cf84bb800ac6d1caa3" },
+  { "vp90-2-14-resize-fp-tiles-2-16.webm", "ca3b03e4197995d8d5444ede7a6c0804" },
+  { "vp90-2-14-resize-fp-tiles-2-1.webm", "99aec065369d70bbb78ccdff65afed3f" },
+  { "vp90-2-14-resize-fp-tiles-2-4.webm", "22d0ebdb49b87d2920a85aea32e1afd5" },
+  { "vp90-2-14-resize-fp-tiles-2-8.webm", "c2115cf051c62e0f7db1d4a783831541" },
+  { "vp90-2-14-resize-fp-tiles-4-16.webm", "c690d7e1719b31367564cac0af0939cb" },
+  { "vp90-2-14-resize-fp-tiles-4-1.webm", "a926020b2cc3e15ad4cc271853a0ff26" },
+  { "vp90-2-14-resize-fp-tiles-4-2.webm", "42699063d9e581f1993d0cf890c2be78" },
+  { "vp90-2-14-resize-fp-tiles-4-8.webm", "7f76d96036382f45121e3d5aa6f8ec52" },
+  { "vp90-2-14-resize-fp-tiles-8-16.webm", "76a43fcdd7e658542913ea43216ec55d" },
+  { "vp90-2-14-resize-fp-tiles-8-1.webm", "8e3fbe89486ca60a59299dea9da91378" },
+  { "vp90-2-14-resize-fp-tiles-8-2.webm", "ae96f21f21b6370cc0125621b441fc52" },
+  { "vp90-2-14-resize-fp-tiles-8-4.webm", "3eb4f24f10640d42218f7fd7b9fd30d4" },
+};
 
-  DecodeFiles(files);
-}
+const FileParam kNonFrameParallelFiles[] = {
+  { "vp90-2-08-tile_1x2.webm", "570b4a5d5a70d58b5359671668328a16" },
+  { "vp90-2-08-tile_1x4.webm", "988d86049e884c66909d2d163a09841a" },
+  { "vp90-2-08-tile_1x8.webm", "0941902a52e9092cb010905eab16364c" },
+  { "vp90-2-08-tile-4x1.webm", "06505aade6647c583c8e00a2f582266f" },
+  { "vp90-2-08-tile-4x4.webm", "85c2299892460d76e2c600502d52bfe2" },
+};
+
+INSTANTIATE_TEST_SUITE_P(NoTilesNonFrameParallel, VP9DecodeMultiThreadedTest,
+                         ::testing::ValuesIn(kNoTilesNonFrameParallelFiles));
+INSTANTIATE_TEST_SUITE_P(FrameParallel, VP9DecodeMultiThreadedTest,
+                         ::testing::ValuesIn(kFrameParallelFiles));
+INSTANTIATE_TEST_SUITE_P(FrameParallelResize, VP9DecodeMultiThreadedTest,
+                         ::testing::ValuesIn(kFrameParallelResizeFiles));
+INSTANTIATE_TEST_SUITE_P(NonFrameParallel, VP9DecodeMultiThreadedTest,
+                         ::testing::ValuesIn(kNonFrameParallelFiles));
 #endif  // CONFIG_WEBM_IO
 
 INSTANTIATE_TEST_SUITE_P(Synchronous, VPxWorkerThreadTest, ::testing::Bool());
diff --git a/test/vpx_scale_test.cc b/test/vpx_scale_test.cc
index 7eea437fc8..3897a6088d 100644
--- a/test/vpx_scale_test.cc
+++ b/test/vpx_scale_test.cc
@@ -38,10 +38,10 @@ class ExtendBorderTest
     : public VpxScaleBase,
       public ::testing::TestWithParam<ExtendFrameBorderFunc> {
  public:
-  virtual ~ExtendBorderTest() {}
+  ~ExtendBorderTest() override = default;
 
  protected:
-  virtual void SetUp() { extend_fn_ = GetParam(); }
+  void SetUp() override { extend_fn_ = GetParam(); }
 
   void ExtendBorder() { ASM_REGISTER_STATE_CHECK(extend_fn_(&img_)); }
 
@@ -68,10 +68,10 @@ INSTANTIATE_TEST_SUITE_P(C, ExtendBorderTest,
 class CopyFrameTest : public VpxScaleBase,
                       public ::testing::TestWithParam<CopyFrameFunc> {
  public:
-  virtual ~CopyFrameTest() {}
+  ~CopyFrameTest() override = default;
 
  protected:
-  virtual void SetUp() { copy_frame_fn_ = GetParam(); }
+  void SetUp() override { copy_frame_fn_ = GetParam(); }
 
   void CopyFrame() {
     ASM_REGISTER_STATE_CHECK(copy_frame_fn_(&img_, &dst_img_));
diff --git a/test/webm_video_source.h b/test/webm_video_source.h
index 6f55f7db7c..6ab50c849f 100644
--- a/test/webm_video_source.h
+++ b/test/webm_video_source.h
@@ -26,21 +26,21 @@ class WebMVideoSource : public CompressedVideoSource {
  public:
   explicit WebMVideoSource(const std::string &file_name)
       : file_name_(file_name), vpx_ctx_(new VpxInputContext()),
-        webm_ctx_(new WebmInputContext()), buf_(NULL), buf_sz_(0), frame_(0),
+        webm_ctx_(new WebmInputContext()), buf_(nullptr), buf_sz_(0), frame_(0),
         end_of_file_(false) {}
 
-  virtual ~WebMVideoSource() {
-    if (vpx_ctx_->file != NULL) fclose(vpx_ctx_->file);
+  ~WebMVideoSource() override {
+    if (vpx_ctx_->file != nullptr) fclose(vpx_ctx_->file);
     webm_free(webm_ctx_);
     delete vpx_ctx_;
     delete webm_ctx_;
   }
 
-  virtual void Init() {}
+  void Init() override {}
 
-  virtual void Begin() {
+  void Begin() override {
     vpx_ctx_->file = OpenTestDataFile(file_name_);
-    ASSERT_TRUE(vpx_ctx_->file != NULL)
+    ASSERT_NE(vpx_ctx_->file, nullptr)
         << "Input file open failed. Filename: " << file_name_;
 
     ASSERT_EQ(file_is_webm(webm_ctx_, vpx_ctx_), 1) << "file is not WebM";
@@ -48,13 +48,13 @@ class WebMVideoSource : public CompressedVideoSource {
     FillFrame();
   }
 
-  virtual void Next() {
+  void Next() override {
     ++frame_;
     FillFrame();
   }
 
   void FillFrame() {
-    ASSERT_TRUE(vpx_ctx_->file != NULL);
+    ASSERT_NE(vpx_ctx_->file, nullptr);
     const int status = webm_read_frame(webm_ctx_, &buf_, &buf_sz_);
     ASSERT_GE(status, 0) << "webm_read_frame failed";
     if (status == 1) {
@@ -63,7 +63,7 @@ class WebMVideoSource : public CompressedVideoSource {
   }
 
   void SeekToNextKeyFrame() {
-    ASSERT_TRUE(vpx_ctx_->file != NULL);
+    ASSERT_NE(vpx_ctx_->file, nullptr);
     do {
       const int status = webm_read_frame(webm_ctx_, &buf_, &buf_sz_);
       ASSERT_GE(status, 0) << "webm_read_frame failed";
@@ -74,9 +74,11 @@ class WebMVideoSource : public CompressedVideoSource {
     } while (!webm_ctx_->is_key_frame && !end_of_file_);
   }
 
-  virtual const uint8_t *cxdata() const { return end_of_file_ ? NULL : buf_; }
-  virtual size_t frame_size() const { return buf_sz_; }
-  virtual unsigned int frame_number() const { return frame_; }
+  const uint8_t *cxdata() const override {
+    return end_of_file_ ? nullptr : buf_;
+  }
+  size_t frame_size() const override { return buf_sz_; }
+  unsigned int frame_number() const override { return frame_; }
 
  protected:
   std::string file_name_;
diff --git a/test/y4m_test.cc b/test/y4m_test.cc
index e7b86ac500..78a944fd08 100644
--- a/test/y4m_test.cc
+++ b/test/y4m_test.cc
@@ -78,7 +78,7 @@ class Y4mVideoSourceTest : public ::testing::TestWithParam<Y4mTestParam>,
  protected:
   Y4mVideoSourceTest() : Y4mVideoSource("", 0, 0) {}
 
-  virtual ~Y4mVideoSourceTest() { CloseSource(); }
+  ~Y4mVideoSourceTest() override { CloseSource(); }
 
   virtual void Init(const std::string &file_name, int limit) {
     file_name_ = file_name;
@@ -90,7 +90,7 @@ class Y4mVideoSourceTest : public ::testing::TestWithParam<Y4mTestParam>,
 
   // Checks y4m header information
   void HeaderChecks(unsigned int bit_depth, vpx_img_fmt_t fmt) {
-    ASSERT_TRUE(input_file_ != NULL);
+    ASSERT_NE(input_file_, nullptr);
     ASSERT_EQ(y4m_.pic_w, (int)kWidth);
     ASSERT_EQ(y4m_.pic_h, (int)kHeight);
     ASSERT_EQ(img()->d_w, kWidth);
@@ -116,7 +116,7 @@ class Y4mVideoSourceTest : public ::testing::TestWithParam<Y4mTestParam>,
 
   // Checks MD5 of the raw frame data
   void Md5Check(const string &expected_md5) {
-    ASSERT_TRUE(input_file_ != NULL);
+    ASSERT_NE(input_file_, nullptr);
     libvpx_test::MD5 md5;
     for (unsigned int i = start_; i < limit_; i++) {
       md5.Add(img());
@@ -138,11 +138,11 @@ INSTANTIATE_TEST_SUITE_P(C, Y4mVideoSourceTest,
 
 class Y4mVideoWriteTest : public Y4mVideoSourceTest {
  protected:
-  Y4mVideoWriteTest() : tmpfile_(NULL) {}
+  Y4mVideoWriteTest() : tmpfile_(nullptr) {}
 
-  virtual ~Y4mVideoWriteTest() {
+  ~Y4mVideoWriteTest() override {
     delete tmpfile_;
-    input_file_ = NULL;
+    input_file_ = nullptr;
   }
 
   void ReplaceInputFile(FILE *input_file) {
@@ -155,11 +155,11 @@ class Y4mVideoWriteTest : public Y4mVideoSourceTest {
 
   // Writes out a y4m file and then reads it back
   void WriteY4mAndReadBack() {
-    ASSERT_TRUE(input_file_ != NULL);
+    ASSERT_NE(input_file_, nullptr);
     char buf[Y4M_BUFFER_SIZE] = { 0 };
     const struct VpxRational framerate = { y4m_.fps_n, y4m_.fps_d };
     tmpfile_ = new libvpx_test::TempOutFile;
-    ASSERT_TRUE(tmpfile_->file() != NULL);
+    ASSERT_NE(tmpfile_->file(), nullptr);
     y4m_write_file_header(buf, sizeof(buf), kWidth, kHeight, &framerate,
                           y4m_.vpx_fmt, y4m_.bit_depth);
     fputs(buf, tmpfile_->file());
@@ -172,7 +172,7 @@ class Y4mVideoWriteTest : public Y4mVideoSourceTest {
     ReplaceInputFile(tmpfile_->file());
   }
 
-  virtual void Init(const std::string &file_name, int limit) {
+  void Init(const std::string &file_name, int limit) override {
     Y4mVideoSourceTest::Init(file_name, limit);
     WriteY4mAndReadBack();
   }
@@ -188,4 +188,57 @@ TEST_P(Y4mVideoWriteTest, WriteTest) {
 
 INSTANTIATE_TEST_SUITE_P(C, Y4mVideoWriteTest,
                          ::testing::ValuesIn(kY4mTestVectors));
+
+static const char kY4MRegularHeader[] =
+    "YUV4MPEG2 W4 H4 F30:1 Ip A0:0 C420jpeg XYSCSS=420JPEG\n"
+    "FRAME\n"
+    "012345678912345601230123";
+
+TEST(Y4MHeaderTest, RegularHeader) {
+  libvpx_test::TempOutFile f;
+  ASSERT_NE(f.file(), nullptr);
+  fwrite(kY4MRegularHeader, 1, sizeof(kY4MRegularHeader), f.file());
+  fflush(f.file());
+  EXPECT_EQ(0, fseek(f.file(), 0, 0));
+
+  y4m_input y4m;
+  EXPECT_EQ(y4m_input_open(&y4m, f.file(), /*skip_buffer=*/nullptr,
+                           /*num_skip=*/0, /*only_420=*/0),
+            0);
+  EXPECT_EQ(y4m.pic_w, 4);
+  EXPECT_EQ(y4m.pic_h, 4);
+  EXPECT_EQ(y4m.fps_n, 30);
+  EXPECT_EQ(y4m.fps_d, 1);
+  EXPECT_EQ(y4m.interlace, 'p');
+  EXPECT_EQ(strcmp("420jpeg", y4m.chroma_type), 0);
+  y4m_input_close(&y4m);
+}
+
+// Testing that headers over 100 characters can be parsed.
+static const char kY4MLongHeader[] =
+    "YUV4MPEG2 W4 H4 F30:1 Ip A0:0 C420jpeg XYSCSS=420JPEG "
+    "XCOLORRANGE=LIMITED XSOME_UNKNOWN_METADATA XOTHER_UNKNOWN_METADATA\n"
+    "FRAME\n"
+    "012345678912345601230123";
+
+TEST(Y4MHeaderTest, LongHeader) {
+  libvpx_test::TempOutFile f;
+  ASSERT_NE(f.file(), nullptr);
+  fwrite(kY4MLongHeader, 1, sizeof(kY4MLongHeader), f.file());
+  fflush(f.file());
+  EXPECT_EQ(fseek(f.file(), 0, 0), 0);
+
+  y4m_input y4m;
+  EXPECT_EQ(y4m_input_open(&y4m, f.file(), /*skip_buffer=*/nullptr,
+                           /*num_skip=*/0, /*only_420=*/0),
+            0);
+  EXPECT_EQ(y4m.pic_w, 4);
+  EXPECT_EQ(y4m.pic_h, 4);
+  EXPECT_EQ(y4m.fps_n, 30);
+  EXPECT_EQ(y4m.fps_d, 1);
+  EXPECT_EQ(y4m.interlace, 'p');
+  EXPECT_EQ(strcmp("420jpeg", y4m.chroma_type), 0);
+  y4m_input_close(&y4m);
+}
+
 }  // namespace
diff --git a/test/y4m_video_source.h b/test/y4m_video_source.h
index 89aa2a44fc..e43e37d9e4 100644
--- a/test/y4m_video_source.h
+++ b/test/y4m_video_source.h
@@ -23,11 +23,11 @@ namespace libvpx_test {
 class Y4mVideoSource : public VideoSource {
  public:
   Y4mVideoSource(const std::string &file_name, unsigned int start, int limit)
-      : file_name_(file_name), input_file_(NULL), img_(new vpx_image_t()),
+      : file_name_(file_name), input_file_(nullptr), img_(new vpx_image_t()),
         start_(start), limit_(limit), frame_(0), framerate_numerator_(0),
         framerate_denominator_(0), y4m_() {}
 
-  virtual ~Y4mVideoSource() {
+  ~Y4mVideoSource() override {
     vpx_img_free(img_.get());
     CloseSource();
   }
@@ -35,13 +35,13 @@ class Y4mVideoSource : public VideoSource {
   virtual void OpenSource() {
     CloseSource();
     input_file_ = OpenTestDataFile(file_name_);
-    ASSERT_TRUE(input_file_ != NULL)
+    ASSERT_NE(input_file_, nullptr)
         << "Input file open failed. Filename: " << file_name_;
   }
 
   virtual void ReadSourceToStart() {
-    ASSERT_TRUE(input_file_ != NULL);
-    ASSERT_FALSE(y4m_input_open(&y4m_, input_file_, NULL, 0, 0));
+    ASSERT_NE(input_file_, nullptr);
+    ASSERT_FALSE(y4m_input_open(&y4m_, input_file_, nullptr, 0, 0));
     framerate_numerator_ = y4m_.fps_n;
     framerate_denominator_ = y4m_.fps_d;
     frame_ = 0;
@@ -51,36 +51,36 @@ class Y4mVideoSource : public VideoSource {
     FillFrame();
   }
 
-  virtual void Begin() {
+  void Begin() override {
     OpenSource();
     ReadSourceToStart();
   }
 
-  virtual void Next() {
+  void Next() override {
     ++frame_;
     FillFrame();
   }
 
-  virtual vpx_image_t *img() const {
-    return (frame_ < limit_) ? img_.get() : NULL;
+  vpx_image_t *img() const override {
+    return (frame_ < limit_) ? img_.get() : nullptr;
   }
 
   // Models a stream where Timebase = 1/FPS, so pts == frame.
-  virtual vpx_codec_pts_t pts() const { return frame_; }
+  vpx_codec_pts_t pts() const override { return frame_; }
 
-  virtual unsigned long duration() const { return 1; }
+  unsigned long duration() const override { return 1; }
 
-  virtual vpx_rational_t timebase() const {
+  vpx_rational_t timebase() const override {
     const vpx_rational_t t = { framerate_denominator_, framerate_numerator_ };
     return t;
   }
 
-  virtual unsigned int frame() const { return frame_; }
+  unsigned int frame() const override { return frame_; }
 
-  virtual unsigned int limit() const { return limit_; }
+  unsigned int limit() const override { return limit_; }
 
   virtual void FillFrame() {
-    ASSERT_TRUE(input_file_ != NULL);
+    ASSERT_NE(input_file_, nullptr);
     // Read a frame from input_file.
     y4m_input_fetch_frame(&y4m_, input_file_, img_.get());
   }
@@ -101,9 +101,9 @@ class Y4mVideoSource : public VideoSource {
   void CloseSource() {
     y4m_input_close(&y4m_);
     y4m_ = y4m_input();
-    if (input_file_ != NULL) {
+    if (input_file_ != nullptr) {
       fclose(input_file_);
-      input_file_ = NULL;
+      input_file_ = nullptr;
     }
   }
 
diff --git a/test/yuv_temporal_filter_test.cc b/test/yuv_temporal_filter_test.cc
index cfdc88d896..0677d55688 100644
--- a/test/yuv_temporal_filter_test.cc
+++ b/test/yuv_temporal_filter_test.cc
@@ -290,7 +290,7 @@ void ApplyReferenceFilter(
 class YUVTemporalFilterTest
     : public ::testing::TestWithParam<TemporalFilterWithBd> {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     filter_func_ = GetParam().temporal_filter;
     bd_ = GetParam().bd;
     use_highbd_ = (bd_ != 8);
@@ -674,8 +674,8 @@ TEST_P(YUVTemporalFilterTest, DISABLED_Speed) {
          v_count);                                                            \
   }
 
-WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_c, 10);
-WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_c, 12);
+WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_c, 10)
+WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_c, 12)
 
 INSTANTIATE_TEST_SUITE_P(
     C, YUVTemporalFilterTest,
@@ -683,8 +683,8 @@ INSTANTIATE_TEST_SUITE_P(
         TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_c_10, 10),
         TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_c_12, 12)));
 #if HAVE_SSE4_1
-WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_sse4_1, 10);
-WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_sse4_1, 12);
+WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_sse4_1, 10)
+WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_sse4_1, 12)
 
 INSTANTIATE_TEST_SUITE_P(
     SSE4_1, YUVTemporalFilterTest,
@@ -694,6 +694,18 @@ INSTANTIATE_TEST_SUITE_P(
         TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_sse4_1_12,
                              12)));
 #endif  // HAVE_SSE4_1
+#if HAVE_NEON
+WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_neon, 10)
+WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_neon, 12)
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON, YUVTemporalFilterTest,
+    ::testing::Values(
+        TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_neon_10,
+                             10),
+        TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_neon_12,
+                             12)));
+#endif  // HAVE_NEON
 #else
 INSTANTIATE_TEST_SUITE_P(
     C, YUVTemporalFilterTest,
@@ -704,5 +716,11 @@ INSTANTIATE_TEST_SUITE_P(SSE4_1, YUVTemporalFilterTest,
                          ::testing::Values(TemporalFilterWithBd(
                              &vp9_apply_temporal_filter_sse4_1, 8)));
 #endif  // HAVE_SSE4_1
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, YUVTemporalFilterTest,
+                         ::testing::Values(TemporalFilterWithBd(
+                             &vp9_apply_temporal_filter_neon, 8)));
+#endif  // HAVE_NEON
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+
 }  // namespace
diff --git a/test/yuv_video_source.h b/test/yuv_video_source.h
index 383ab8f1b1..bb5eec5bb8 100644
--- a/test/yuv_video_source.h
+++ b/test/yuv_video_source.h
@@ -27,23 +27,23 @@ class YUVVideoSource : public VideoSource {
   YUVVideoSource(const std::string &file_name, vpx_img_fmt format,
                  unsigned int width, unsigned int height, int rate_numerator,
                  int rate_denominator, unsigned int start, int limit)
-      : file_name_(file_name), input_file_(NULL), img_(NULL), start_(start),
-        limit_(limit), frame_(0), width_(0), height_(0),
+      : file_name_(file_name), input_file_(nullptr), img_(nullptr),
+        start_(start), limit_(limit), frame_(0), width_(0), height_(0),
         format_(VPX_IMG_FMT_NONE), framerate_numerator_(rate_numerator),
         framerate_denominator_(rate_denominator) {
     // This initializes format_, raw_size_, width_, height_ and allocates img.
     SetSize(width, height, format);
   }
 
-  virtual ~YUVVideoSource() {
+  ~YUVVideoSource() override {
     vpx_img_free(img_);
     if (input_file_) fclose(input_file_);
   }
 
-  virtual void Begin() {
+  void Begin() override {
     if (input_file_) fclose(input_file_);
     input_file_ = OpenTestDataFile(file_name_);
-    ASSERT_TRUE(input_file_ != NULL)
+    ASSERT_NE(input_file_, nullptr)
         << "Input file open failed. Filename: " << file_name_;
     if (start_) {
       fseek(input_file_, static_cast<unsigned>(raw_size_) * start_, SEEK_SET);
@@ -53,33 +53,35 @@ class YUVVideoSource : public VideoSource {
     FillFrame();
   }
 
-  virtual void Next() {
+  void Next() override {
     ++frame_;
     FillFrame();
   }
 
-  virtual vpx_image_t *img() const { return (frame_ < limit_) ? img_ : NULL; }
+  vpx_image_t *img() const override {
+    return (frame_ < limit_) ? img_ : nullptr;
+  }
 
   // Models a stream where Timebase = 1/FPS, so pts == frame.
-  virtual vpx_codec_pts_t pts() const { return frame_; }
+  vpx_codec_pts_t pts() const override { return frame_; }
 
-  virtual unsigned long duration() const { return 1; }
+  unsigned long duration() const override { return 1; }
 
-  virtual vpx_rational_t timebase() const {
+  vpx_rational_t timebase() const override {
     const vpx_rational_t t = { framerate_denominator_, framerate_numerator_ };
     return t;
   }
 
-  virtual unsigned int frame() const { return frame_; }
+  unsigned int frame() const override { return frame_; }
 
-  virtual unsigned int limit() const { return limit_; }
+  unsigned int limit() const override { return limit_; }
 
   virtual void SetSize(unsigned int width, unsigned int height,
                        vpx_img_fmt format) {
     if (width != width_ || height != height_ || format != format_) {
       vpx_img_free(img_);
-      img_ = vpx_img_alloc(NULL, format, width, height, 1);
-      ASSERT_TRUE(img_ != NULL);
+      img_ = vpx_img_alloc(nullptr, format, width, height, 1);
+      ASSERT_NE(img_, nullptr);
       width_ = width;
       height_ = height;
       format_ = format;
@@ -99,7 +101,7 @@ class YUVVideoSource : public VideoSource {
   }
 
   virtual void FillFrame() {
-    ASSERT_TRUE(input_file_ != NULL);
+    ASSERT_NE(input_file_, nullptr);
     // Read a frame from input_file.
     if (fread(img_->img_data, raw_size_, 1, input_file_) == 0) {
       limit_ = frame_;
diff --git a/third_party/googletest/README.libvpx b/third_party/googletest/README.libvpx
index 111390a12e..5f6b01b0ec 100644
--- a/third_party/googletest/README.libvpx
+++ b/third_party/googletest/README.libvpx
@@ -1,5 +1,5 @@
 URL: https://github.com/google/googletest.git
-Version: release-1.10.0
+Version: release-1.12.1
 License: BSD
 License File: LICENSE
 
@@ -13,10 +13,17 @@ generation.
 
 Local Modifications:
 - Remove everything but:
-  googletest-release-1.10.0/googletest/
-   CHANGES
-   CONTRIBUTORS
+  .clang-format
+  CONTRIBUTORS
+  googletest/
    include
-   LICENSE
    README.md
    src
+  LICENSE
+- Move .clang-format, CONTRIBUTORS, and LICENSE into googletest/
+- In googletest/include/gtest/internal/custom/gtest-port.h, define
+  GTEST_HAS_NOTIFICATION_ as 1 and use a stub Notification class to fix
+  the mingw32 g++ compilation errors caused by the lack of std::mutex
+  and std::condition_variable in the <mutex> and <condition_variable>
+  headers if mingw32 is configured with the win32 threads option. See
+  https://stackoverflow.com/questions/17242516/mingw-w64-threads-posix-vs-win32
diff --git a/third_party/googletest/src/.clang-format b/third_party/googletest/src/.clang-format
new file mode 100644
index 0000000000..5b9bfe6d22
--- /dev/null
+++ b/third_party/googletest/src/.clang-format
@@ -0,0 +1,4 @@
+# Run manually to reformat a file:
+# clang-format -i --style=file <file>
+Language:        Cpp
+BasedOnStyle:  Google
diff --git a/third_party/googletest/src/CHANGES b/third_party/googletest/src/CHANGES
deleted file mode 100644
index 0552132421..0000000000
--- a/third_party/googletest/src/CHANGES
+++ /dev/null
@@ -1,157 +0,0 @@
-Changes for 1.7.0:
-
-* New feature: death tests are supported on OpenBSD and in iOS
-  simulator now.
-* New feature: Google Test now implements a protocol to allow
-  a test runner to detect that a test program has exited
-  prematurely and report it as a failure (before it would be
-  falsely reported as a success if the exit code is 0).
-* New feature: Test::RecordProperty() can now be used outside of the
-  lifespan of a test method, in which case it will be attributed to
-  the current test case or the test program in the XML report.
-* New feature (potentially breaking): --gtest_list_tests now prints
-  the type parameters and value parameters for each test.
-* Improvement: char pointers and char arrays are now escaped properly
-  in failure messages.
-* Improvement: failure summary in XML reports now includes file and
-  line information.
-* Improvement: the <testsuites> XML element now has a timestamp attribute.
-* Improvement: When --gtest_filter is specified, XML report now doesn't
-  contain information about tests that are filtered out.
-* Fixed the bug where long --gtest_filter flag values are truncated in
-  death tests.
-* Potentially breaking change: RUN_ALL_TESTS() is now implemented as a
-  function instead of a macro in order to work better with Clang.
-* Compatibility fixes with C++ 11 and various platforms.
-* Bug/warning fixes.
-
-Changes for 1.6.0:
-
-* New feature: ADD_FAILURE_AT() for reporting a test failure at the
-  given source location -- useful for writing testing utilities.
-* New feature: the universal value printer is moved from Google Mock
-  to Google Test.
-* New feature: type parameters and value parameters are reported in
-  the XML report now.
-* A gtest_disable_pthreads CMake option.
-* Colored output works in GNU Screen sessions now.
-* Parameters of value-parameterized tests are now printed in the
-  textual output.
-* Failures from ad hoc test assertions run before RUN_ALL_TESTS() are
-  now correctly reported.
-* Arguments of ASSERT_XY and EXPECT_XY no longer need to support << to
-  ostream.
-* More complete handling of exceptions.
-* GTEST_ASSERT_XY can be used instead of ASSERT_XY in case the latter
-  name is already used by another library.
-* --gtest_catch_exceptions is now true by default, allowing a test
-  program to continue after an exception is thrown.
-* Value-parameterized test fixtures can now derive from Test and
-  WithParamInterface<T> separately, easing conversion of legacy tests.
-* Death test messages are clearly marked to make them more
-  distinguishable from other messages.
-* Compatibility fixes for Android, Google Native Client, MinGW, HP UX,
-  PowerPC, Lucid autotools, libCStd, Sun C++, Borland C++ Builder (Code Gear),
-  IBM XL C++ (Visual Age C++), and C++0x.
-* Bug fixes and implementation clean-ups.
-* Potentially incompatible changes: disables the harmful 'make install'
-  command in autotools.
-
-Changes for 1.5.0:
-
- * New feature: assertions can be safely called in multiple threads
-   where the pthreads library is available.
- * New feature: predicates used inside EXPECT_TRUE() and friends
-   can now generate custom failure messages.
- * New feature: Google Test can now be compiled as a DLL.
- * New feature: fused source files are included.
- * New feature: prints help when encountering unrecognized Google Test flags.
- * Experimental feature: CMake build script (requires CMake 2.6.4+).
- * Experimental feature: the Pump script for meta programming.
- * double values streamed to an assertion are printed with enough precision
-   to differentiate any two different values.
- * Google Test now works on Solaris and AIX.
- * Build and test script improvements.
- * Bug fixes and implementation clean-ups.
-
- Potentially breaking changes:
-
- * Stopped supporting VC++ 7.1 with exceptions disabled.
- * Dropped support for 'make install'.
-
-Changes for 1.4.0:
-
- * New feature: the event listener API
- * New feature: test shuffling
- * New feature: the XML report format is closer to junitreport and can
-   be parsed by Hudson now.
- * New feature: when a test runs under Visual Studio, its failures are
-   integrated in the IDE.
- * New feature: /MD(d) versions of VC++ projects.
- * New feature: elapsed time for the tests is printed by default.
- * New feature: comes with a TR1 tuple implementation such that Boost
-   is no longer needed for Combine().
- * New feature: EXPECT_DEATH_IF_SUPPORTED macro and friends.
- * New feature: the Xcode project can now produce static gtest
-   libraries in addition to a framework.
- * Compatibility fixes for Solaris, Cygwin, minGW, Windows Mobile,
-   Symbian, gcc, and C++Builder.
- * Bug fixes and implementation clean-ups.
-
-Changes for 1.3.0:
-
- * New feature: death tests on Windows, Cygwin, and Mac.
- * New feature: ability to use Google Test assertions in other testing
-   frameworks.
- * New feature: ability to run disabled test via
-   --gtest_also_run_disabled_tests.
- * New feature: the --help flag for printing the usage.
- * New feature: access to Google Test flag values in user code.
- * New feature: a script that packs Google Test into one .h and one
-   .cc file for easy deployment.
- * New feature: support for distributing test functions to multiple
-   machines (requires support from the test runner).
- * Bug fixes and implementation clean-ups.
-
-Changes for 1.2.1:
-
- * Compatibility fixes for Linux IA-64 and IBM z/OS.
- * Added support for using Boost and other TR1 implementations.
- * Changes to the build scripts to support upcoming release of Google C++
-   Mocking Framework.
- * Added Makefile to the distribution package.
- * Improved build instructions in README.
-
-Changes for 1.2.0:
-
- * New feature: value-parameterized tests.
- * New feature: the ASSERT/EXPECT_(NON)FATAL_FAILURE(_ON_ALL_THREADS)
-   macros.
- * Changed the XML report format to match JUnit/Ant's.
- * Added tests to the Xcode project.
- * Added scons/SConscript for building with SCons.
- * Added src/gtest-all.cc for building Google Test from a single file.
- * Fixed compatibility with Solaris and z/OS.
- * Enabled running Python tests on systems with python 2.3 installed,
-   e.g. Mac OS X 10.4.
- * Bug fixes.
-
-Changes for 1.1.0:
-
- * New feature: type-parameterized tests.
- * New feature: exception assertions.
- * New feature: printing elapsed time of tests.
- * Improved the robustness of death tests.
- * Added an Xcode project and samples.
- * Adjusted the output format on Windows to be understandable by Visual Studio.
- * Minor bug fixes.
-
-Changes for 1.0.1:
-
- * Added project files for Visual Studio 7.1.
- * Fixed issues with compiling on Mac OS X.
- * Fixed issues with compiling on Cygwin.
-
-Changes for 1.0.0:
-
- * Initial Open Source release of Google Test
diff --git a/third_party/googletest/src/CONTRIBUTORS b/third_party/googletest/src/CONTRIBUTORS
index feae2fc044..77397a5b53 100644
--- a/third_party/googletest/src/CONTRIBUTORS
+++ b/third_party/googletest/src/CONTRIBUTORS
@@ -5,33 +5,61 @@
 
 Ajay Joshi <jaj@google.com>
 Balázs Dán <balazs.dan@gmail.com>
+Benoit Sigoure <tsuna@google.com>
 Bharat Mediratta <bharat@menalto.com>
+Bogdan Piloca <boo@google.com>
 Chandler Carruth <chandlerc@google.com>
 Chris Prince <cprince@google.com>
 Chris Taylor <taylorc@google.com>
 Dan Egnor <egnor@google.com>
+Dave MacLachlan <dmaclach@gmail.com>
+David Anderson <danderson@google.com>
+Dean Sturtevant
 Eric Roman <eroman@chromium.org>
+Gene Volovich <gv@cite.com>
 Hady Zalek <hady.zalek@gmail.com>
+Hal Burch <gmock@hburch.com>
 Jeffrey Yasskin <jyasskin@google.com>
+Jim Keller <jimkeller@google.com>
+Joe Walnes <joe@truemesh.com>
+Jon Wray <jwray@google.com>
 Jói Sigurðsson <joi@google.com>
 Keir Mierle <mierle@gmail.com>
 Keith Ray <keith.ray@gmail.com>
 Kenton Varda <kenton@google.com>
+Kostya Serebryany <kcc@google.com>
+Krystian Kuzniarek <krystian.kuzniarek@gmail.com>
+Lev Makhlis
 Manuel Klimek <klimek@google.com>
+Mario Tanev <radix@google.com>
+Mark Paskin
 Markus Heule <markus.heule@gmail.com>
+Martijn Vels <mvels@google.com>
+Matthew Simmons <simmonmt@acm.org>
 Mika Raento <mikie@iki.fi>
+Mike Bland <mbland@google.com>
 Miklós Fazekas <mfazekas@szemafor.com>
+Neal Norwitz <nnorwitz@gmail.com>
+Nermin Ozkiranartli <nermin@google.com>
+Owen Carlsen <ocarlsen@google.com>
+Paneendra Ba <paneendra@google.com>
 Pasi Valminen <pasi.valminen@gmail.com>
 Patrick Hanna <phanna@google.com>
 Patrick Riley <pfr@google.com>
+Paul Menage <menage@google.com>
 Peter Kaminski <piotrk@google.com>
+Piotr Kaminski <piotrk@google.com>
 Preston Jackson <preston.a.jackson@gmail.com>
 Rainer Klaffenboeck <rainer.klaffenboeck@dynatrace.com>
 Russ Cox <rsc@google.com>
 Russ Rufer <russ@pentad.com>
 Sean Mcafee <eefacm@gmail.com>
 Sigurður Ásgeirsson <siggi@google.com>
+Sverre Sundsdal <sundsdal@gmail.com>
+Szymon Sobik <sobik.szymon@gmail.com>
+Takeshi Yoshino <tyoshino@google.com>
 Tracy Bialik <tracy@pentad.com>
 Vadim Berman <vadimb@google.com>
 Vlad Losev <vladl@google.com>
+Wolfgang Klier <wklier@google.com>
 Zhanyong Wan <wan@google.com>
diff --git a/third_party/googletest/src/README.md b/third_party/googletest/src/README.md
index 766ddc1e07..d26b309ed0 100644
--- a/third_party/googletest/src/README.md
+++ b/third_party/googletest/src/README.md
@@ -2,39 +2,51 @@
 
 #### Setup
 
-To build Google Test and your tests that use it, you need to tell your build
+To build GoogleTest and your tests that use it, you need to tell your build
 system where to find its headers and source files. The exact way to do it
 depends on which build system you use, and is usually straightforward.
 
 ### Build with CMake
 
-Google Test comes with a CMake build script (
-[CMakeLists.txt](https://github.com/google/googletest/blob/master/CMakeLists.txt))
+GoogleTest comes with a CMake build script
+([CMakeLists.txt](https://github.com/google/googletest/blob/master/CMakeLists.txt))
 that can be used on a wide range of platforms ("C" stands for cross-platform.).
 If you don't have CMake installed already, you can download it for free from
 <http://www.cmake.org/>.
 
 CMake works by generating native makefiles or build projects that can be used in
-the compiler environment of your choice. You can either build Google Test as a
+the compiler environment of your choice. You can either build GoogleTest as a
 standalone project or it can be incorporated into an existing CMake build for
 another project.
 
 #### Standalone CMake Project
 
-When building Google Test as a standalone project, the typical workflow starts
-with:
+When building GoogleTest as a standalone project, the typical workflow starts
+with
 
-    mkdir mybuild       # Create a directory to hold the build output.
-    cd mybuild
-    cmake ${GTEST_DIR}  # Generate native build scripts.
+```
+git clone https://github.com/google/googletest.git -b release-1.11.0
+cd googletest        # Main directory of the cloned repository.
+mkdir build          # Create a directory to hold the build output.
+cd build
+cmake ..             # Generate native build scripts for GoogleTest.
+```
 
-If you want to build Google Test's samples, you should replace the last command
-with
+The above command also includes GoogleMock by default. And so, if you want to
+build only GoogleTest, you should replace the last command with
 
-    cmake -Dgtest_build_samples=ON ${GTEST_DIR}
+```
+cmake .. -DBUILD_GMOCK=OFF
+```
 
 If you are on a \*nix system, you should now see a Makefile in the current
-directory. Just type 'make' to build gtest.
+directory. Just type `make` to build GoogleTest. And then you can simply install
+GoogleTest if you are a system administrator.
+
+```
+make
+sudo make install    # Install in /usr/local/ by default
+```
 
 If you use Windows and have Visual Studio installed, a `gtest.sln` file and
 several `.vcproj` files will be created. You can then build them using Visual
@@ -44,13 +56,19 @@ On Mac OS X with Xcode installed, a `.xcodeproj` file will be generated.
 
 #### Incorporating Into An Existing CMake Project
 
-If you want to use gtest in a project which already uses CMake, then a more
-robust and flexible approach is to build gtest as part of that project directly.
-This is done by making the GoogleTest source code available to the main build
-and adding it using CMake's `add_subdirectory()` command. This has the
-significant advantage that the same compiler and linker settings are used
-between gtest and the rest of your project, so issues associated with using
-incompatible libraries (eg debug/release), etc. are avoided. This is
+If you want to use GoogleTest in a project which already uses CMake, the easiest
+way is to get installed libraries and headers.
+
+*   Import GoogleTest by using `find_package` (or `pkg_check_modules`). For
+    example, if `find_package(GTest CONFIG REQUIRED)` succeeds, you can use the
+    libraries as `GTest::gtest`, `GTest::gmock`.
+
+And a more robust and flexible approach is to build GoogleTest as part of that
+project directly. This is done by making the GoogleTest source code available to
+the main build and adding it using CMake's `add_subdirectory()` command. This
+has the significant advantage that the same compiler and linker settings are
+used between GoogleTest and the rest of your project, so issues associated with
+using incompatible libraries (eg debug/release), etc. are avoided. This is
 particularly useful on Windows. Making GoogleTest's source code available to the
 main build can be done a few different ways:
 
@@ -64,68 +82,23 @@ main build can be done a few different ways:
     possible or appropriate. Git submodules, for example, have their own set of
     advantages and drawbacks.
 *   Use CMake to download GoogleTest as part of the build's configure step. This
-    is just a little more complex, but doesn't have the limitations of the other
-    methods.
+    approach doesn't have the limitations of the other methods.
 
-The last of the above methods is implemented with a small piece of CMake code in
-a separate file (e.g. `CMakeLists.txt.in`) which is copied to the build area and
-then invoked as a sub-build _during the CMake stage_. That directory is then
-pulled into the main build with `add_subdirectory()`. For example:
+The last of the above methods is implemented with a small piece of CMake code
+that downloads and pulls the GoogleTest code into the main build.
 
-New file `CMakeLists.txt.in`:
+Just add to your `CMakeLists.txt`:
 
 ```cmake
-cmake_minimum_required(VERSION 2.8.2)
-
-project(googletest-download NONE)
-
-include(ExternalProject)
-ExternalProject_Add(googletest
-  GIT_REPOSITORY    https://github.com/google/googletest.git
-  GIT_TAG           master
-  SOURCE_DIR        "${CMAKE_CURRENT_BINARY_DIR}/googletest-src"
-  BINARY_DIR        "${CMAKE_CURRENT_BINARY_DIR}/googletest-build"
-  CONFIGURE_COMMAND ""
-  BUILD_COMMAND     ""
-  INSTALL_COMMAND   ""
-  TEST_COMMAND      ""
+include(FetchContent)
+FetchContent_Declare(
+  googletest
+  # Specify the commit you depend on and update it regularly.
+  URL https://github.com/google/googletest/archive/e2239ee6043f73722e7aa812a459f54a28552929.zip
 )
-```
-
-Existing build's `CMakeLists.txt`:
-
-```cmake
-# Download and unpack googletest at configure time
-configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt)
-execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
-  RESULT_VARIABLE result
-  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download )
-if(result)
-  message(FATAL_ERROR "CMake step for googletest failed: ${result}")
-endif()
-execute_process(COMMAND ${CMAKE_COMMAND} --build .
-  RESULT_VARIABLE result
-  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download )
-if(result)
-  message(FATAL_ERROR "Build step for googletest failed: ${result}")
-endif()
-
-# Prevent overriding the parent project's compiler/linker
-# settings on Windows
+# For Windows: Prevent overriding the parent project's compiler/linker settings
 set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
-
-# Add googletest directly to our build. This defines
-# the gtest and gtest_main targets.
-add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/googletest-src
-                 ${CMAKE_CURRENT_BINARY_DIR}/googletest-build
-                 EXCLUDE_FROM_ALL)
-
-# The gtest/gtest_main targets carry header search path
-# dependencies automatically when using CMake 2.8.11 or
-# later. Otherwise we have to add them here ourselves.
-if (CMAKE_VERSION VERSION_LESS 2.8.11)
-  include_directories("${gtest_SOURCE_DIR}/include")
-endif()
+FetchContent_MakeAvailable(googletest)
 
 # Now simply link against gtest or gtest_main as needed. Eg
 add_executable(example example.cpp)
@@ -133,20 +106,18 @@ target_link_libraries(example gtest_main)
 add_test(NAME example_test COMMAND example)
 ```
 
-Note that this approach requires CMake 2.8.2 or later due to its use of the
-`ExternalProject_Add()` command. The above technique is discussed in more detail
-in [this separate article](http://crascit.com/2015/07/25/cmake-gtest/) which
-also contains a link to a fully generalized implementation of the technique.
+Note that this approach requires CMake 3.14 or later due to its use of the
+`FetchContent_MakeAvailable()` command.
 
 ##### Visual Studio Dynamic vs Static Runtimes
 
 By default, new Visual Studio projects link the C runtimes dynamically but
-Google Test links them statically. This will generate an error that looks
+GoogleTest links them statically. This will generate an error that looks
 something like the following: gtest.lib(gtest-all.obj) : error LNK2038: mismatch
 detected for 'RuntimeLibrary': value 'MTd_StaticDebug' doesn't match value
 'MDd_DynamicDebug' in main.obj
 
-Google Test already has a CMake option for this: `gtest_force_shared_crt`
+GoogleTest already has a CMake option for this: `gtest_force_shared_crt`
 
 Enabling this option will make gtest link the runtimes dynamically too, and
 match the project in which it is included.
@@ -154,17 +125,17 @@ match the project in which it is included.
 #### C++ Standard Version
 
 An environment that supports C++11 is required in order to successfully build
-Google Test. One way to ensure this is to specify the standard in the top-level
+GoogleTest. One way to ensure this is to specify the standard in the top-level
 project, for example by using the `set(CMAKE_CXX_STANDARD 11)` command. If this
-is not feasible, for example in a C project using Google Test for validation,
+is not feasible, for example in a C project using GoogleTest for validation,
 then it can be specified by adding it to the options for cmake via the
 `DCMAKE_CXX_FLAGS` option.
 
-### Tweaking Google Test
+### Tweaking GoogleTest
 
-Google Test can be used in diverse environments. The default configuration may
+GoogleTest can be used in diverse environments. The default configuration may
 not work (or may not work well) out of the box in some environments. However,
-you can easily tweak Google Test by defining control macros on the compiler
+you can easily tweak GoogleTest by defining control macros on the compiler
 command line. Generally, these macros are named like `GTEST_XYZ` and you define
 them to either 1 or 0 to enable or disable a certain feature.
 
@@ -173,12 +144,12 @@ We list the most frequently used macros below. For a complete list, see file
 
 ### Multi-threaded Tests
 
-Google Test is thread-safe where the pthread library is available. After
+GoogleTest is thread-safe where the pthread library is available. After
 `#include "gtest/gtest.h"`, you can check the
 `GTEST_IS_THREADSAFE` macro to see whether this is the case (yes if the macro is
 `#defined` to 1, no if it's undefined.).
 
-If Google Test doesn't correctly detect whether pthread is available in your
+If GoogleTest doesn't correctly detect whether pthread is available in your
 environment, you can force it with
 
     -DGTEST_HAS_PTHREAD=1
@@ -187,16 +158,16 @@ or
 
     -DGTEST_HAS_PTHREAD=0
 
-When Google Test uses pthread, you may need to add flags to your compiler and/or
+When GoogleTest uses pthread, you may need to add flags to your compiler and/or
 linker to select the pthread library, or you'll get link errors. If you use the
-CMake script or the deprecated Autotools script, this is taken care of for you.
-If you use your own build script, you'll need to read your compiler and linker's
-manual to figure out what flags to add.
+CMake script, this is taken care of for you. If you use your own build script,
+you'll need to read your compiler and linker's manual to figure out what flags
+to add.
 
 ### As a Shared Library (DLL)
 
-Google Test is compact, so most users can build and link it as a static library
-for the simplicity. You can choose to use Google Test as a shared library (known
+GoogleTest is compact, so most users can build and link it as a static library
+for the simplicity. You can choose to use GoogleTest as a shared library (known
 as a DLL on Windows) if you prefer.
 
 To compile *gtest* as a shared library, add
@@ -216,23 +187,25 @@ Note: while the above steps aren't technically necessary today when using some
 compilers (e.g. GCC), they may become necessary in the future, if we decide to
 improve the speed of loading the library (see
 <http://gcc.gnu.org/wiki/Visibility> for details). Therefore you are recommended
-to always add the above flags when using Google Test as a shared library.
-Otherwise a future release of Google Test may break your build script.
+to always add the above flags when using GoogleTest as a shared library.
+Otherwise a future release of GoogleTest may break your build script.
 
 ### Avoiding Macro Name Clashes
 
 In C++, macros don't obey namespaces. Therefore two libraries that both define a
 macro of the same name will clash if you `#include` both definitions. In case a
-Google Test macro clashes with another library, you can force Google Test to
+GoogleTest macro clashes with another library, you can force GoogleTest to
 rename its macro to avoid the conflict.
 
-Specifically, if both Google Test and some other code define macro FOO, you can
+Specifically, if both GoogleTest and some other code define macro FOO, you can
 add
 
     -DGTEST_DONT_DEFINE_FOO=1
 
-to the compiler flags to tell Google Test to change the macro's name from `FOO`
-to `GTEST_FOO`. Currently `FOO` can be `FAIL`, `SUCCEED`, or `TEST`. For
+to the compiler flags to tell GoogleTest to change the macro's name from `FOO`
+to `GTEST_FOO`. Currently `FOO` can be `ASSERT_EQ`, `ASSERT_FALSE`, `ASSERT_GE`,
+`ASSERT_GT`, `ASSERT_LE`, `ASSERT_LT`, `ASSERT_NE`, `ASSERT_TRUE`,
+`EXPECT_FALSE`, `EXPECT_TRUE`, `FAIL`, `SUCCEED`, `TEST`, or `TEST_F`. For
 example, with `-DGTEST_DONT_DEFINE_TEST=1`, you'll need to write
 
     GTEST_TEST(SomeTest, DoesThis) { ... }
diff --git a/third_party/googletest/src/include/gtest/gtest-assertion-result.h b/third_party/googletest/src/include/gtest/gtest-assertion-result.h
new file mode 100644
index 0000000000..addbb59c64
--- /dev/null
+++ b/third_party/googletest/src/include/gtest/gtest-assertion-result.h
@@ -0,0 +1,237 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This file implements the AssertionResult type.
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_ASSERTION_RESULT_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_ASSERTION_RESULT_H_
+
+#include <memory>
+#include <ostream>
+#include <string>
+#include <type_traits>
+
+#include "gtest/gtest-message.h"
+#include "gtest/internal/gtest-port.h"
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251                                   \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+namespace testing {
+
+// A class for indicating whether an assertion was successful.  When
+// the assertion wasn't successful, the AssertionResult object
+// remembers a non-empty message that describes how it failed.
+//
+// To create an instance of this class, use one of the factory functions
+// (AssertionSuccess() and AssertionFailure()).
+//
+// This class is useful for two purposes:
+//   1. Defining predicate functions to be used with Boolean test assertions
+//      EXPECT_TRUE/EXPECT_FALSE and their ASSERT_ counterparts
+//   2. Defining predicate-format functions to be
+//      used with predicate assertions (ASSERT_PRED_FORMAT*, etc).
+//
+// For example, if you define IsEven predicate:
+//
+//   testing::AssertionResult IsEven(int n) {
+//     if ((n % 2) == 0)
+//       return testing::AssertionSuccess();
+//     else
+//       return testing::AssertionFailure() << n << " is odd";
+//   }
+//
+// Then the failed expectation EXPECT_TRUE(IsEven(Fib(5)))
+// will print the message
+//
+//   Value of: IsEven(Fib(5))
+//     Actual: false (5 is odd)
+//   Expected: true
+//
+// instead of a more opaque
+//
+//   Value of: IsEven(Fib(5))
+//     Actual: false
+//   Expected: true
+//
+// in case IsEven is a simple Boolean predicate.
+//
+// If you expect your predicate to be reused and want to support informative
+// messages in EXPECT_FALSE and ASSERT_FALSE (negative assertions show up
+// about half as often as positive ones in our tests), supply messages for
+// both success and failure cases:
+//
+//   testing::AssertionResult IsEven(int n) {
+//     if ((n % 2) == 0)
+//       return testing::AssertionSuccess() << n << " is even";
+//     else
+//       return testing::AssertionFailure() << n << " is odd";
+//   }
+//
+// Then a statement EXPECT_FALSE(IsEven(Fib(6))) will print
+//
+//   Value of: IsEven(Fib(6))
+//     Actual: true (8 is even)
+//   Expected: false
+//
+// NB: Predicates that support negative Boolean assertions have reduced
+// performance in positive ones so be careful not to use them in tests
+// that have lots (tens of thousands) of positive Boolean assertions.
+//
+// To use this class with EXPECT_PRED_FORMAT assertions such as:
+//
+//   // Verifies that Foo() returns an even number.
+//   EXPECT_PRED_FORMAT1(IsEven, Foo());
+//
+// you need to define:
+//
+//   testing::AssertionResult IsEven(const char* expr, int n) {
+//     if ((n % 2) == 0)
+//       return testing::AssertionSuccess();
+//     else
+//       return testing::AssertionFailure()
+//         << "Expected: " << expr << " is even\n  Actual: it's " << n;
+//   }
+//
+// If Foo() returns 5, you will see the following message:
+//
+//   Expected: Foo() is even
+//     Actual: it's 5
+//
+class GTEST_API_ AssertionResult {
+ public:
+  // Copy constructor.
+  // Used in EXPECT_TRUE/FALSE(assertion_result).
+  AssertionResult(const AssertionResult& other);
+
+// C4800 is a level 3 warning in Visual Studio 2015 and earlier.
+// This warning is not emitted in Visual Studio 2017.
+// This warning is off by default starting in Visual Studio 2019 but can be
+// enabled with command-line options.
+#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920)
+  GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 /* forcing value to bool */)
+#endif
+
+  // Used in the EXPECT_TRUE/FALSE(bool_expression).
+  //
+  // T must be contextually convertible to bool.
+  //
+  // The second parameter prevents this overload from being considered if
+  // the argument is implicitly convertible to AssertionResult. In that case
+  // we want AssertionResult's copy constructor to be used.
+  template <typename T>
+  explicit AssertionResult(
+      const T& success,
+      typename std::enable_if<
+          !std::is_convertible<T, AssertionResult>::value>::type*
+      /*enabler*/
+      = nullptr)
+      : success_(success) {}
+
+#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920)
+  GTEST_DISABLE_MSC_WARNINGS_POP_()
+#endif
+
+  // Assignment operator.
+  AssertionResult& operator=(AssertionResult other) {
+    swap(other);
+    return *this;
+  }
+
+  // Returns true if and only if the assertion succeeded.
+  operator bool() const { return success_; }  // NOLINT
+
+  // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
+  AssertionResult operator!() const;
+
+  // Returns the text streamed into this AssertionResult. Test assertions
+  // use it when they fail (i.e., the predicate's outcome doesn't match the
+  // assertion's expectation). When nothing has been streamed into the
+  // object, returns an empty string.
+  const char* message() const {
+    return message_.get() != nullptr ? message_->c_str() : "";
+  }
+  // Deprecated; please use message() instead.
+  const char* failure_message() const { return message(); }
+
+  // Streams a custom failure message into this object.
+  template <typename T>
+  AssertionResult& operator<<(const T& value) {
+    AppendMessage(Message() << value);
+    return *this;
+  }
+
+  // Allows streaming basic output manipulators such as endl or flush into
+  // this object.
+  AssertionResult& operator<<(
+      ::std::ostream& (*basic_manipulator)(::std::ostream& stream)) {
+    AppendMessage(Message() << basic_manipulator);
+    return *this;
+  }
+
+ private:
+  // Appends the contents of message to message_.
+  void AppendMessage(const Message& a_message) {
+    if (message_.get() == nullptr) message_.reset(new ::std::string);
+    message_->append(a_message.GetString().c_str());
+  }
+
+  // Swap the contents of this AssertionResult with other.
+  void swap(AssertionResult& other);
+
+  // Stores result of the assertion predicate.
+  bool success_;
+  // Stores the message describing the condition in case the expectation
+  // construct is not satisfied with the predicate's outcome.
+  // Referenced via a pointer to avoid taking too much stack frame space
+  // with test assertions.
+  std::unique_ptr< ::std::string> message_;
+};
+
+// Makes a successful assertion result.
+GTEST_API_ AssertionResult AssertionSuccess();
+
+// Makes a failed assertion result.
+GTEST_API_ AssertionResult AssertionFailure();
+
+// Makes a failed assertion result with the given failure message.
+// Deprecated; use AssertionFailure() << msg.
+GTEST_API_ AssertionResult AssertionFailure(const Message& msg);
+
+}  // namespace testing
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()  // 4251
+
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_ASSERTION_RESULT_H_
diff --git a/third_party/googletest/src/include/gtest/gtest-death-test.h b/third_party/googletest/src/include/gtest/gtest-death-test.h
index dc878ffbb3..84e5a5bbd3 100644
--- a/third_party/googletest/src/include/gtest/gtest-death-test.h
+++ b/third_party/googletest/src/include/gtest/gtest-death-test.h
@@ -27,20 +27,20 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-//
 // The Google C++ Testing and Mocking Framework (Google Test)
 //
 // This header file defines the public API for death tests.  It is
 // #included by gtest.h so a user doesn't need to include this
 // directly.
-// GOOGLETEST_CM0001 DO NOT DELETE
 
-#ifndef GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
-#define GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
 
-#include "gtest/internal/gtest-death-test-internal.h"
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
 
-namespace testing {
+#include "gtest/internal/gtest-death-test-internal.h"
 
 // This flag controls the style of death tests.  Valid values are "threadsafe",
 // meaning that the death test child process will re-execute the test binary
@@ -49,6 +49,8 @@ namespace testing {
 // after forking.
 GTEST_DECLARE_string_(death_test_style);
 
+namespace testing {
+
 #if GTEST_HAS_DEATH_TEST
 
 namespace internal {
@@ -97,9 +99,12 @@ GTEST_API_ bool InDeathTestChild();
 //
 //   ASSERT_EXIT(client.HangUpServer(), KilledBySIGHUP, "Hanging up!");
 //
+// The final parameter to each of these macros is a matcher applied to any data
+// the sub-process wrote to stderr.  For compatibility with existing tests, a
+// bare string is interpreted as a regular expression matcher.
+//
 // On the regular expressions used in death tests:
 //
-//   GOOGLETEST_CM0005 DO NOT DELETE
 //   On POSIX-compliant systems (*nix), we use the <regex.h> library,
 //   which uses the POSIX extended regex syntax.
 //
@@ -162,27 +167,27 @@ GTEST_API_ bool InDeathTestChild();
 //   directory in PATH.
 //
 
-// Asserts that a given statement causes the program to exit, with an
-// integer exit status that satisfies predicate, and emitting error output
-// that matches regex.
-# define ASSERT_EXIT(statement, predicate, regex) \
-    GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_FATAL_FAILURE_)
+// Asserts that a given `statement` causes the program to exit, with an
+// integer exit status that satisfies `predicate`, and emitting error output
+// that matches `matcher`.
+#define ASSERT_EXIT(statement, predicate, matcher) \
+  GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_FATAL_FAILURE_)
 
-// Like ASSERT_EXIT, but continues on to successive tests in the
+// Like `ASSERT_EXIT`, but continues on to successive tests in the
 // test suite, if any:
-# define EXPECT_EXIT(statement, predicate, regex) \
-    GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_EXIT(statement, predicate, matcher) \
+  GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_NONFATAL_FAILURE_)
 
-// Asserts that a given statement causes the program to exit, either by
+// Asserts that a given `statement` causes the program to exit, either by
 // explicitly exiting with a nonzero exit code or being killed by a
-// signal, and emitting error output that matches regex.
-# define ASSERT_DEATH(statement, regex) \
-    ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex)
+// signal, and emitting error output that matches `matcher`.
+#define ASSERT_DEATH(statement, matcher) \
+  ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher)
 
-// Like ASSERT_DEATH, but continues on to successive tests in the
+// Like `ASSERT_DEATH`, but continues on to successive tests in the
 // test suite, if any:
-# define EXPECT_DEATH(statement, regex) \
-    EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex)
+#define EXPECT_DEATH(statement, matcher) \
+  EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher)
 
 // Two predicate classes that can be used in {ASSERT,EXPECT}_EXIT*:
 
@@ -190,26 +195,26 @@ GTEST_API_ bool InDeathTestChild();
 class GTEST_API_ ExitedWithCode {
  public:
   explicit ExitedWithCode(int exit_code);
+  ExitedWithCode(const ExitedWithCode&) = default;
+  void operator=(const ExitedWithCode& other) = delete;
   bool operator()(int exit_status) const;
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ExitedWithCode& other);
 
+ private:
   const int exit_code_;
 };
 
-# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
 // Tests that an exit code describes an exit due to termination by a
 // given signal.
-// GOOGLETEST_CM0006 DO NOT DELETE
 class GTEST_API_ KilledBySignal {
  public:
   explicit KilledBySignal(int signum);
   bool operator()(int exit_status) const;
+
  private:
   const int signum_;
 };
-# endif  // !GTEST_OS_WINDOWS
+#endif  // !GTEST_OS_WINDOWS
 
 // EXPECT_DEBUG_DEATH asserts that the given statements die in debug mode.
 // The death testing framework causes this to have interesting semantics,
@@ -254,23 +259,21 @@ class GTEST_API_ KilledBySignal {
 //   EXPECT_EQ(12, DieInDebugOr12(&sideeffect));
 // }, "death");
 //
-# ifdef NDEBUG
+#ifdef NDEBUG
 
-#  define EXPECT_DEBUG_DEATH(statement, regex) \
+#define EXPECT_DEBUG_DEATH(statement, regex) \
   GTEST_EXECUTE_STATEMENT_(statement, regex)
 
-#  define ASSERT_DEBUG_DEATH(statement, regex) \
+#define ASSERT_DEBUG_DEATH(statement, regex) \
   GTEST_EXECUTE_STATEMENT_(statement, regex)
 
-# else
+#else
 
-#  define EXPECT_DEBUG_DEATH(statement, regex) \
-  EXPECT_DEATH(statement, regex)
+#define EXPECT_DEBUG_DEATH(statement, regex) EXPECT_DEATH(statement, regex)
 
-#  define ASSERT_DEBUG_DEATH(statement, regex) \
-  ASSERT_DEATH(statement, regex)
+#define ASSERT_DEBUG_DEATH(statement, regex) ASSERT_DEATH(statement, regex)
 
-# endif  // NDEBUG for EXPECT_DEBUG_DEATH
+#endif  // NDEBUG for EXPECT_DEBUG_DEATH
 #endif  // GTEST_HAS_DEATH_TEST
 
 // This macro is used for implementing macros such as
@@ -308,18 +311,17 @@ class GTEST_API_ KilledBySignal {
 //  statement unconditionally returns or throws. The Message constructor at
 //  the end allows the syntax of streaming additional messages into the
 //  macro, for compilational compatibility with EXPECT_DEATH/ASSERT_DEATH.
-# define GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, terminator) \
-    GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-    if (::testing::internal::AlwaysTrue()) { \
-      GTEST_LOG_(WARNING) \
-          << "Death tests are not supported on this platform.\n" \
-          << "Statement '" #statement "' cannot be verified."; \
-    } else if (::testing::internal::AlwaysFalse()) { \
-      ::testing::internal::RE::PartialMatch(".*", (regex)); \
-      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
-      terminator; \
-    } else \
-      ::testing::Message()
+#define GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, terminator)             \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                                \
+  if (::testing::internal::AlwaysTrue()) {                                     \
+    GTEST_LOG_(WARNING) << "Death tests are not supported on this platform.\n" \
+                        << "Statement '" #statement "' cannot be verified.";   \
+  } else if (::testing::internal::AlwaysFalse()) {                             \
+    ::testing::internal::RE::PartialMatch(".*", (regex));                      \
+    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement);                 \
+    terminator;                                                                \
+  } else                                                                       \
+    ::testing::Message()
 
 // EXPECT_DEATH_IF_SUPPORTED(statement, regex) and
 // ASSERT_DEATH_IF_SUPPORTED(statement, regex) expand to real death tests if
@@ -327,17 +329,17 @@ class GTEST_API_ KilledBySignal {
 // useful when you are combining death test assertions with normal test
 // assertions in one test.
 #if GTEST_HAS_DEATH_TEST
-# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
-    EXPECT_DEATH(statement, regex)
-# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
-    ASSERT_DEATH(statement, regex)
+#define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
+  EXPECT_DEATH(statement, regex)
+#define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
+  ASSERT_DEATH(statement, regex)
 #else
-# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
-    GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, )
-# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
-    GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, return)
+#define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
+  GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, )
+#define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
+  GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, return)
 #endif
 
 }  // namespace testing
 
-#endif  // GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
diff --git a/third_party/googletest/src/include/gtest/gtest-matchers.h b/third_party/googletest/src/include/gtest/gtest-matchers.h
index 9de6c2e10a..bffa00c533 100644
--- a/third_party/googletest/src/include/gtest/gtest-matchers.h
+++ b/third_party/googletest/src/include/gtest/gtest-matchers.h
@@ -32,13 +32,14 @@
 // This file implements just enough of the matcher interface to allow
 // EXPECT_DEATH and friends to accept a matcher argument.
 
-// IWYU pragma: private, include "testing/base/public/gunit.h"
-// IWYU pragma: friend third_party/googletest/googlemock/.*
-// IWYU pragma: friend third_party/googletest/googletest/.*
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
 
-#ifndef GTEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
-#define GTEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
 
+#include <atomic>
 #include <memory>
 #include <ostream>
 #include <string>
@@ -63,20 +64,16 @@ GTEST_DISABLE_MSC_WARNINGS_PUSH_(
 namespace testing {
 
 // To implement a matcher Foo for type T, define:
-//   1. a class FooMatcherImpl that implements the
-//      MatcherInterface<T> interface, and
-//   2. a factory function that creates a Matcher<T> object from a
-//      FooMatcherImpl*.
-//
-// The two-level delegation design makes it possible to allow a user
-// to write "v" instead of "Eq(v)" where a Matcher is expected, which
-// is impossible if we pass matchers by pointers.  It also eases
-// ownership management as Matcher objects can now be copied like
-// plain values.
-
-// MatchResultListener is an abstract class.  Its << operator can be
-// used by a matcher to explain why a value matches or doesn't match.
+//   1. a class FooMatcherMatcher that implements the matcher interface:
+//     using is_gtest_matcher = void;
+//     bool MatchAndExplain(const T&, std::ostream*);
+//       (MatchResultListener* can also be used instead of std::ostream*)
+//     void DescribeTo(std::ostream*);
+//     void DescribeNegationTo(std::ostream*);
 //
+//   2. a factory function that creates a Matcher<T> object from a
+//      FooMatcherMatcher.
+
 class MatchResultListener {
  public:
   // Creates a listener object with the given underlying ostream.  The
@@ -105,15 +102,15 @@ class MatchResultListener {
  private:
   ::std::ostream* const stream_;
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(MatchResultListener);
+  MatchResultListener(const MatchResultListener&) = delete;
+  MatchResultListener& operator=(const MatchResultListener&) = delete;
 };
 
-inline MatchResultListener::~MatchResultListener() {
-}
+inline MatchResultListener::~MatchResultListener() {}
 
 // An instance of a subclass of this knows how to describe itself as a
 // matcher.
-class MatcherDescriberInterface {
+class GTEST_API_ MatcherDescriberInterface {
  public:
   virtual ~MatcherDescriberInterface() {}
 
@@ -181,54 +178,41 @@ class MatcherInterface : public MatcherDescriberInterface {
 
 namespace internal {
 
-// Converts a MatcherInterface<T> to a MatcherInterface<const T&>.
-template <typename T>
-class MatcherInterfaceAdapter : public MatcherInterface<const T&> {
- public:
-  explicit MatcherInterfaceAdapter(const MatcherInterface<T>* impl)
-      : impl_(impl) {}
-  ~MatcherInterfaceAdapter() override { delete impl_; }
-
-  void DescribeTo(::std::ostream* os) const override { impl_->DescribeTo(os); }
-
-  void DescribeNegationTo(::std::ostream* os) const override {
-    impl_->DescribeNegationTo(os);
-  }
-
-  bool MatchAndExplain(const T& x,
-                       MatchResultListener* listener) const override {
-    return impl_->MatchAndExplain(x, listener);
-  }
-
- private:
-  const MatcherInterface<T>* const impl_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(MatcherInterfaceAdapter);
-};
-
 struct AnyEq {
   template <typename A, typename B>
-  bool operator()(const A& a, const B& b) const { return a == b; }
+  bool operator()(const A& a, const B& b) const {
+    return a == b;
+  }
 };
 struct AnyNe {
   template <typename A, typename B>
-  bool operator()(const A& a, const B& b) const { return a != b; }
+  bool operator()(const A& a, const B& b) const {
+    return a != b;
+  }
 };
 struct AnyLt {
   template <typename A, typename B>
-  bool operator()(const A& a, const B& b) const { return a < b; }
+  bool operator()(const A& a, const B& b) const {
+    return a < b;
+  }
 };
 struct AnyGt {
   template <typename A, typename B>
-  bool operator()(const A& a, const B& b) const { return a > b; }
+  bool operator()(const A& a, const B& b) const {
+    return a > b;
+  }
 };
 struct AnyLe {
   template <typename A, typename B>
-  bool operator()(const A& a, const B& b) const { return a <= b; }
+  bool operator()(const A& a, const B& b) const {
+    return a <= b;
+  }
 };
 struct AnyGe {
   template <typename A, typename B>
-  bool operator()(const A& a, const B& b) const { return a >= b; }
+  bool operator()(const A& a, const B& b) const {
+    return a >= b;
+  }
 };
 
 // A match result listener that ignores the explanation.
@@ -237,7 +221,8 @@ class DummyMatchResultListener : public MatchResultListener {
   DummyMatchResultListener() : MatchResultListener(nullptr) {}
 
  private:
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(DummyMatchResultListener);
+  DummyMatchResultListener(const DummyMatchResultListener&) = delete;
+  DummyMatchResultListener& operator=(const DummyMatchResultListener&) = delete;
 };
 
 // A match result listener that forwards the explanation to a given
@@ -249,19 +234,40 @@ class StreamMatchResultListener : public MatchResultListener {
       : MatchResultListener(os) {}
 
  private:
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamMatchResultListener);
+  StreamMatchResultListener(const StreamMatchResultListener&) = delete;
+  StreamMatchResultListener& operator=(const StreamMatchResultListener&) =
+      delete;
+};
+
+struct SharedPayloadBase {
+  std::atomic<int> ref{1};
+  void Ref() { ref.fetch_add(1, std::memory_order_relaxed); }
+  bool Unref() { return ref.fetch_sub(1, std::memory_order_acq_rel) == 1; }
+};
+
+template <typename T>
+struct SharedPayload : SharedPayloadBase {
+  explicit SharedPayload(const T& v) : value(v) {}
+  explicit SharedPayload(T&& v) : value(std::move(v)) {}
+
+  static void Destroy(SharedPayloadBase* shared) {
+    delete static_cast<SharedPayload*>(shared);
+  }
+
+  T value;
 };
 
 // An internal class for implementing Matcher<T>, which will derive
 // from it.  We put functionalities common to all Matcher<T>
 // specializations here to avoid code duplication.
 template <typename T>
-class MatcherBase {
+class MatcherBase : private MatcherDescriberInterface {
  public:
   // Returns true if and only if the matcher matches x; also explains the
   // match result to 'listener'.
   bool MatchAndExplain(const T& x, MatchResultListener* listener) const {
-    return impl_->MatchAndExplain(x, listener);
+    GTEST_CHECK_(vtable_ != nullptr);
+    return vtable_->match_and_explain(*this, x, listener);
   }
 
   // Returns true if and only if this matcher matches x.
@@ -271,11 +277,15 @@ class MatcherBase {
   }
 
   // Describes this matcher to an ostream.
-  void DescribeTo(::std::ostream* os) const { impl_->DescribeTo(os); }
+  void DescribeTo(::std::ostream* os) const final {
+    GTEST_CHECK_(vtable_ != nullptr);
+    vtable_->describe(*this, os, false);
+  }
 
   // Describes the negation of this matcher to an ostream.
-  void DescribeNegationTo(::std::ostream* os) const {
-    impl_->DescribeNegationTo(os);
+  void DescribeNegationTo(::std::ostream* os) const final {
+    GTEST_CHECK_(vtable_ != nullptr);
+    vtable_->describe(*this, os, true);
   }
 
   // Explains why x matches, or doesn't match, the matcher.
@@ -288,31 +298,195 @@ class MatcherBase {
   // of the describer, which is only guaranteed to be alive when
   // this matcher object is alive.
   const MatcherDescriberInterface* GetDescriber() const {
-    return impl_.get();
+    if (vtable_ == nullptr) return nullptr;
+    return vtable_->get_describer(*this);
   }
 
  protected:
-  MatcherBase() {}
+  MatcherBase() : vtable_(nullptr), buffer_() {}
 
   // Constructs a matcher from its implementation.
-  explicit MatcherBase(const MatcherInterface<const T&>* impl) : impl_(impl) {}
-
   template <typename U>
-  explicit MatcherBase(
-      const MatcherInterface<U>* impl,
-      typename std::enable_if<!std::is_same<U, const U&>::value>::type* =
-          nullptr)
-      : impl_(new internal::MatcherInterfaceAdapter<U>(impl)) {}
+  explicit MatcherBase(const MatcherInterface<U>* impl)
+      : vtable_(nullptr), buffer_() {
+    Init(impl);
+  }
+
+  template <typename M, typename = typename std::remove_reference<
+                            M>::type::is_gtest_matcher>
+  MatcherBase(M&& m) : vtable_(nullptr), buffer_() {  // NOLINT
+    Init(std::forward<M>(m));
+  }
+
+  MatcherBase(const MatcherBase& other)
+      : vtable_(other.vtable_), buffer_(other.buffer_) {
+    if (IsShared()) buffer_.shared->Ref();
+  }
+
+  MatcherBase& operator=(const MatcherBase& other) {
+    if (this == &other) return *this;
+    Destroy();
+    vtable_ = other.vtable_;
+    buffer_ = other.buffer_;
+    if (IsShared()) buffer_.shared->Ref();
+    return *this;
+  }
 
-  MatcherBase(const MatcherBase&) = default;
-  MatcherBase& operator=(const MatcherBase&) = default;
-  MatcherBase(MatcherBase&&) = default;
-  MatcherBase& operator=(MatcherBase&&) = default;
+  MatcherBase(MatcherBase&& other)
+      : vtable_(other.vtable_), buffer_(other.buffer_) {
+    other.vtable_ = nullptr;
+  }
 
-  virtual ~MatcherBase() {}
+  MatcherBase& operator=(MatcherBase&& other) {
+    if (this == &other) return *this;
+    Destroy();
+    vtable_ = other.vtable_;
+    buffer_ = other.buffer_;
+    other.vtable_ = nullptr;
+    return *this;
+  }
+
+  ~MatcherBase() override { Destroy(); }
 
  private:
-  std::shared_ptr<const MatcherInterface<const T&>> impl_;
+  struct VTable {
+    bool (*match_and_explain)(const MatcherBase&, const T&,
+                              MatchResultListener*);
+    void (*describe)(const MatcherBase&, std::ostream*, bool negation);
+    // Returns the captured object if it implements the interface, otherwise
+    // returns the MatcherBase itself.
+    const MatcherDescriberInterface* (*get_describer)(const MatcherBase&);
+    // Called on shared instances when the reference count reaches 0.
+    void (*shared_destroy)(SharedPayloadBase*);
+  };
+
+  bool IsShared() const {
+    return vtable_ != nullptr && vtable_->shared_destroy != nullptr;
+  }
+
+  // If the implementation uses a listener, call that.
+  template <typename P>
+  static auto MatchAndExplainImpl(const MatcherBase& m, const T& value,
+                                  MatchResultListener* listener)
+      -> decltype(P::Get(m).MatchAndExplain(value, listener->stream())) {
+    return P::Get(m).MatchAndExplain(value, listener->stream());
+  }
+
+  template <typename P>
+  static auto MatchAndExplainImpl(const MatcherBase& m, const T& value,
+                                  MatchResultListener* listener)
+      -> decltype(P::Get(m).MatchAndExplain(value, listener)) {
+    return P::Get(m).MatchAndExplain(value, listener);
+  }
+
+  template <typename P>
+  static void DescribeImpl(const MatcherBase& m, std::ostream* os,
+                           bool negation) {
+    if (negation) {
+      P::Get(m).DescribeNegationTo(os);
+    } else {
+      P::Get(m).DescribeTo(os);
+    }
+  }
+
+  template <typename P>
+  static const MatcherDescriberInterface* GetDescriberImpl(
+      const MatcherBase& m) {
+    // If the impl is a MatcherDescriberInterface, then return it.
+    // Otherwise use MatcherBase itself.
+    // This allows us to implement the GetDescriber() function without support
+    // from the impl, but some users really want to get their impl back when
+    // they call GetDescriber().
+    // We use std::get on a tuple as a workaround of not having `if constexpr`.
+    return std::get<(
+        std::is_convertible<decltype(&P::Get(m)),
+                            const MatcherDescriberInterface*>::value
+            ? 1
+            : 0)>(std::make_tuple(&m, &P::Get(m)));
+  }
+
+  template <typename P>
+  const VTable* GetVTable() {
+    static constexpr VTable kVTable = {&MatchAndExplainImpl<P>,
+                                       &DescribeImpl<P>, &GetDescriberImpl<P>,
+                                       P::shared_destroy};
+    return &kVTable;
+  }
+
+  union Buffer {
+    // Add some types to give Buffer some common alignment/size use cases.
+    void* ptr;
+    double d;
+    int64_t i;
+    // And add one for the out-of-line cases.
+    SharedPayloadBase* shared;
+  };
+
+  void Destroy() {
+    if (IsShared() && buffer_.shared->Unref()) {
+      vtable_->shared_destroy(buffer_.shared);
+    }
+  }
+
+  template <typename M>
+  static constexpr bool IsInlined() {
+    return sizeof(M) <= sizeof(Buffer) && alignof(M) <= alignof(Buffer) &&
+           std::is_trivially_copy_constructible<M>::value &&
+           std::is_trivially_destructible<M>::value;
+  }
+
+  template <typename M, bool = MatcherBase::IsInlined<M>()>
+  struct ValuePolicy {
+    static const M& Get(const MatcherBase& m) {
+      // When inlined along with Init, need to be explicit to avoid violating
+      // strict aliasing rules.
+      const M* ptr =
+          static_cast<const M*>(static_cast<const void*>(&m.buffer_));
+      return *ptr;
+    }
+    static void Init(MatcherBase& m, M impl) {
+      ::new (static_cast<void*>(&m.buffer_)) M(impl);
+    }
+    static constexpr auto shared_destroy = nullptr;
+  };
+
+  template <typename M>
+  struct ValuePolicy<M, false> {
+    using Shared = SharedPayload<M>;
+    static const M& Get(const MatcherBase& m) {
+      return static_cast<Shared*>(m.buffer_.shared)->value;
+    }
+    template <typename Arg>
+    static void Init(MatcherBase& m, Arg&& arg) {
+      m.buffer_.shared = new Shared(std::forward<Arg>(arg));
+    }
+    static constexpr auto shared_destroy = &Shared::Destroy;
+  };
+
+  template <typename U, bool B>
+  struct ValuePolicy<const MatcherInterface<U>*, B> {
+    using M = const MatcherInterface<U>;
+    using Shared = SharedPayload<std::unique_ptr<M>>;
+    static const M& Get(const MatcherBase& m) {
+      return *static_cast<Shared*>(m.buffer_.shared)->value;
+    }
+    static void Init(MatcherBase& m, M* impl) {
+      m.buffer_.shared = new Shared(std::unique_ptr<M>(impl));
+    }
+
+    static constexpr auto shared_destroy = &Shared::Destroy;
+  };
+
+  template <typename M>
+  void Init(M&& m) {
+    using MM = typename std::decay<M>::type;
+    using Policy = ValuePolicy<MM>;
+    vtable_ = GetVTable<Policy>();
+    Policy::Init(*this, std::forward<M>(m));
+  }
+
+  const VTable* vtable_;
+  Buffer buffer_;
 };
 
 }  // namespace internal
@@ -340,6 +514,10 @@ class Matcher : public internal::MatcherBase<T> {
           nullptr)
       : internal::MatcherBase<T>(impl) {}
 
+  template <typename M, typename = typename std::remove_reference<
+                            M>::type::is_gtest_matcher>
+  Matcher(M&& m) : internal::MatcherBase<T>(std::forward<M>(m)) {}  // NOLINT
+
   // Implicit constructor here allows people to write
   // EXPECT_CALL(foo, Bar(5)) instead of EXPECT_CALL(foo, Bar(Eq(5))) sometimes
   Matcher(T value);  // NOLINT
@@ -357,6 +535,11 @@ class GTEST_API_ Matcher<const std::string&>
   explicit Matcher(const MatcherInterface<const std::string&>* impl)
       : internal::MatcherBase<const std::string&>(impl) {}
 
+  template <typename M, typename = typename std::remove_reference<
+                            M>::type::is_gtest_matcher>
+  Matcher(M&& m)  // NOLINT
+      : internal::MatcherBase<const std::string&>(std::forward<M>(m)) {}
+
   // Allows the user to write str instead of Eq(str) sometimes, where
   // str is a std::string object.
   Matcher(const std::string& s);  // NOLINT
@@ -376,6 +559,11 @@ class GTEST_API_ Matcher<std::string>
   explicit Matcher(const MatcherInterface<std::string>* impl)
       : internal::MatcherBase<std::string>(impl) {}
 
+  template <typename M, typename = typename std::remove_reference<
+                            M>::type::is_gtest_matcher>
+  Matcher(M&& m)  // NOLINT
+      : internal::MatcherBase<std::string>(std::forward<M>(m)) {}
+
   // Allows the user to write str instead of Eq(str) sometimes, where
   // str is a string object.
   Matcher(const std::string& s);  // NOLINT
@@ -384,18 +572,24 @@ class GTEST_API_ Matcher<std::string>
   Matcher(const char* s);  // NOLINT
 };
 
-#if GTEST_HAS_ABSL
+#if GTEST_INTERNAL_HAS_STRING_VIEW
 // The following two specializations allow the user to write str
 // instead of Eq(str) and "foo" instead of Eq("foo") when a absl::string_view
 // matcher is expected.
 template <>
-class GTEST_API_ Matcher<const absl::string_view&>
-    : public internal::MatcherBase<const absl::string_view&> {
+class GTEST_API_ Matcher<const internal::StringView&>
+    : public internal::MatcherBase<const internal::StringView&> {
  public:
   Matcher() {}
 
-  explicit Matcher(const MatcherInterface<const absl::string_view&>* impl)
-      : internal::MatcherBase<const absl::string_view&>(impl) {}
+  explicit Matcher(const MatcherInterface<const internal::StringView&>* impl)
+      : internal::MatcherBase<const internal::StringView&>(impl) {}
+
+  template <typename M, typename = typename std::remove_reference<
+                            M>::type::is_gtest_matcher>
+  Matcher(M&& m)  // NOLINT
+      : internal::MatcherBase<const internal::StringView&>(std::forward<M>(m)) {
+  }
 
   // Allows the user to write str instead of Eq(str) sometimes, where
   // str is a std::string object.
@@ -404,20 +598,25 @@ class GTEST_API_ Matcher<const absl::string_view&>
   // Allows the user to write "foo" instead of Eq("foo") sometimes.
   Matcher(const char* s);  // NOLINT
 
-  // Allows the user to pass absl::string_views directly.
-  Matcher(absl::string_view s);  // NOLINT
+  // Allows the user to pass absl::string_views or std::string_views directly.
+  Matcher(internal::StringView s);  // NOLINT
 };
 
 template <>
-class GTEST_API_ Matcher<absl::string_view>
-    : public internal::MatcherBase<absl::string_view> {
+class GTEST_API_ Matcher<internal::StringView>
+    : public internal::MatcherBase<internal::StringView> {
  public:
   Matcher() {}
 
-  explicit Matcher(const MatcherInterface<const absl::string_view&>* impl)
-      : internal::MatcherBase<absl::string_view>(impl) {}
-  explicit Matcher(const MatcherInterface<absl::string_view>* impl)
-      : internal::MatcherBase<absl::string_view>(impl) {}
+  explicit Matcher(const MatcherInterface<const internal::StringView&>* impl)
+      : internal::MatcherBase<internal::StringView>(impl) {}
+  explicit Matcher(const MatcherInterface<internal::StringView>* impl)
+      : internal::MatcherBase<internal::StringView>(impl) {}
+
+  template <typename M, typename = typename std::remove_reference<
+                            M>::type::is_gtest_matcher>
+  Matcher(M&& m)  // NOLINT
+      : internal::MatcherBase<internal::StringView>(std::forward<M>(m)) {}
 
   // Allows the user to write str instead of Eq(str) sometimes, where
   // str is a std::string object.
@@ -426,10 +625,10 @@ class GTEST_API_ Matcher<absl::string_view>
   // Allows the user to write "foo" instead of Eq("foo") sometimes.
   Matcher(const char* s);  // NOLINT
 
-  // Allows the user to pass absl::string_views directly.
-  Matcher(absl::string_view s);  // NOLINT
+  // Allows the user to pass absl::string_views or std::string_views directly.
+  Matcher(internal::StringView s);  // NOLINT
 };
-#endif  // GTEST_HAS_ABSL
+#endif  // GTEST_INTERNAL_HAS_STRING_VIEW
 
 // Prints a matcher in a human-readable format.
 template <typename T>
@@ -474,13 +673,13 @@ class PolymorphicMatcher {
    public:
     explicit MonomorphicImpl(const Impl& impl) : impl_(impl) {}
 
-    virtual void DescribeTo(::std::ostream* os) const { impl_.DescribeTo(os); }
+    void DescribeTo(::std::ostream* os) const override { impl_.DescribeTo(os); }
 
-    virtual void DescribeNegationTo(::std::ostream* os) const {
+    void DescribeNegationTo(::std::ostream* os) const override {
       impl_.DescribeNegationTo(os);
     }
 
-    virtual bool MatchAndExplain(T x, MatchResultListener* listener) const {
+    bool MatchAndExplain(T x, MatchResultListener* listener) const override {
       return impl_.MatchAndExplain(x, listener);
     }
 
@@ -529,37 +728,32 @@ template <typename D, typename Rhs, typename Op>
 class ComparisonBase {
  public:
   explicit ComparisonBase(const Rhs& rhs) : rhs_(rhs) {}
+
+  using is_gtest_matcher = void;
+
   template <typename Lhs>
-  operator Matcher<Lhs>() const {
-    return Matcher<Lhs>(new Impl<const Lhs&>(rhs_));
+  bool MatchAndExplain(const Lhs& lhs, std::ostream*) const {
+    return Op()(lhs, Unwrap(rhs_));
+  }
+  void DescribeTo(std::ostream* os) const {
+    *os << D::Desc() << " ";
+    UniversalPrint(Unwrap(rhs_), os);
+  }
+  void DescribeNegationTo(std::ostream* os) const {
+    *os << D::NegatedDesc() << " ";
+    UniversalPrint(Unwrap(rhs_), os);
   }
 
  private:
   template <typename T>
-  static const T& Unwrap(const T& v) { return v; }
+  static const T& Unwrap(const T& v) {
+    return v;
+  }
   template <typename T>
-  static const T& Unwrap(std::reference_wrapper<T> v) { return v; }
-
-  template <typename Lhs, typename = Rhs>
-  class Impl : public MatcherInterface<Lhs> {
-   public:
-    explicit Impl(const Rhs& rhs) : rhs_(rhs) {}
-    bool MatchAndExplain(Lhs lhs,
-                         MatchResultListener* /* listener */) const override {
-      return Op()(lhs, Unwrap(rhs_));
-    }
-    void DescribeTo(::std::ostream* os) const override {
-      *os << D::Desc() << " ";
-      UniversalPrint(Unwrap(rhs_), os);
-    }
-    void DescribeNegationTo(::std::ostream* os) const override {
-      *os << D::NegatedDesc() <<  " ";
-      UniversalPrint(Unwrap(rhs_), os);
-    }
+  static const T& Unwrap(std::reference_wrapper<T> v) {
+    return v;
+  }
 
-   private:
-    Rhs rhs_;
-  };
   Rhs rhs_;
 };
 
@@ -567,7 +761,7 @@ template <typename Rhs>
 class EqMatcher : public ComparisonBase<EqMatcher<Rhs>, Rhs, AnyEq> {
  public:
   explicit EqMatcher(const Rhs& rhs)
-      : ComparisonBase<EqMatcher<Rhs>, Rhs, AnyEq>(rhs) { }
+      : ComparisonBase<EqMatcher<Rhs>, Rhs, AnyEq>(rhs) {}
   static const char* Desc() { return "is equal to"; }
   static const char* NegatedDesc() { return "isn't equal to"; }
 };
@@ -575,7 +769,7 @@ template <typename Rhs>
 class NeMatcher : public ComparisonBase<NeMatcher<Rhs>, Rhs, AnyNe> {
  public:
   explicit NeMatcher(const Rhs& rhs)
-      : ComparisonBase<NeMatcher<Rhs>, Rhs, AnyNe>(rhs) { }
+      : ComparisonBase<NeMatcher<Rhs>, Rhs, AnyNe>(rhs) {}
   static const char* Desc() { return "isn't equal to"; }
   static const char* NegatedDesc() { return "is equal to"; }
 };
@@ -583,7 +777,7 @@ template <typename Rhs>
 class LtMatcher : public ComparisonBase<LtMatcher<Rhs>, Rhs, AnyLt> {
  public:
   explicit LtMatcher(const Rhs& rhs)
-      : ComparisonBase<LtMatcher<Rhs>, Rhs, AnyLt>(rhs) { }
+      : ComparisonBase<LtMatcher<Rhs>, Rhs, AnyLt>(rhs) {}
   static const char* Desc() { return "is <"; }
   static const char* NegatedDesc() { return "isn't <"; }
 };
@@ -591,7 +785,7 @@ template <typename Rhs>
 class GtMatcher : public ComparisonBase<GtMatcher<Rhs>, Rhs, AnyGt> {
  public:
   explicit GtMatcher(const Rhs& rhs)
-      : ComparisonBase<GtMatcher<Rhs>, Rhs, AnyGt>(rhs) { }
+      : ComparisonBase<GtMatcher<Rhs>, Rhs, AnyGt>(rhs) {}
   static const char* Desc() { return "is >"; }
   static const char* NegatedDesc() { return "isn't >"; }
 };
@@ -599,7 +793,7 @@ template <typename Rhs>
 class LeMatcher : public ComparisonBase<LeMatcher<Rhs>, Rhs, AnyLe> {
  public:
   explicit LeMatcher(const Rhs& rhs)
-      : ComparisonBase<LeMatcher<Rhs>, Rhs, AnyLe>(rhs) { }
+      : ComparisonBase<LeMatcher<Rhs>, Rhs, AnyLe>(rhs) {}
   static const char* Desc() { return "is <="; }
   static const char* NegatedDesc() { return "isn't <="; }
 };
@@ -607,11 +801,15 @@ template <typename Rhs>
 class GeMatcher : public ComparisonBase<GeMatcher<Rhs>, Rhs, AnyGe> {
  public:
   explicit GeMatcher(const Rhs& rhs)
-      : ComparisonBase<GeMatcher<Rhs>, Rhs, AnyGe>(rhs) { }
+      : ComparisonBase<GeMatcher<Rhs>, Rhs, AnyGe>(rhs) {}
   static const char* Desc() { return "is >="; }
   static const char* NegatedDesc() { return "isn't >="; }
 };
 
+template <typename T, typename = typename std::enable_if<
+                          std::is_constructible<std::string, T>::value>::type>
+using StringLike = T;
+
 // Implements polymorphic matchers MatchesRegex(regex) and
 // ContainsRegex(regex), which can be used as a Matcher<T> as long as
 // T can be converted to a string.
@@ -620,12 +818,12 @@ class MatchesRegexMatcher {
   MatchesRegexMatcher(const RE* regex, bool full_match)
       : regex_(regex), full_match_(full_match) {}
 
-#if GTEST_HAS_ABSL
-  bool MatchAndExplain(const absl::string_view& s,
+#if GTEST_INTERNAL_HAS_STRING_VIEW
+  bool MatchAndExplain(const internal::StringView& s,
                        MatchResultListener* listener) const {
     return MatchAndExplain(std::string(s), listener);
   }
-#endif  // GTEST_HAS_ABSL
+#endif  // GTEST_INTERNAL_HAS_STRING_VIEW
 
   // Accepts pointer types, particularly:
   //   const char*
@@ -672,9 +870,10 @@ inline PolymorphicMatcher<internal::MatchesRegexMatcher> MatchesRegex(
     const internal::RE* regex) {
   return MakePolymorphicMatcher(internal::MatchesRegexMatcher(regex, true));
 }
-inline PolymorphicMatcher<internal::MatchesRegexMatcher> MatchesRegex(
-    const std::string& regex) {
-  return MatchesRegex(new internal::RE(regex));
+template <typename T = std::string>
+PolymorphicMatcher<internal::MatchesRegexMatcher> MatchesRegex(
+    const internal::StringLike<T>& regex) {
+  return MatchesRegex(new internal::RE(std::string(regex)));
 }
 
 // Matches a string that contains regular expression 'regex'.
@@ -683,21 +882,26 @@ inline PolymorphicMatcher<internal::MatchesRegexMatcher> ContainsRegex(
     const internal::RE* regex) {
   return MakePolymorphicMatcher(internal::MatchesRegexMatcher(regex, false));
 }
-inline PolymorphicMatcher<internal::MatchesRegexMatcher> ContainsRegex(
-    const std::string& regex) {
-  return ContainsRegex(new internal::RE(regex));
+template <typename T = std::string>
+PolymorphicMatcher<internal::MatchesRegexMatcher> ContainsRegex(
+    const internal::StringLike<T>& regex) {
+  return ContainsRegex(new internal::RE(std::string(regex)));
 }
 
 // Creates a polymorphic matcher that matches anything equal to x.
 // Note: if the parameter of Eq() were declared as const T&, Eq("foo")
 // wouldn't compile.
 template <typename T>
-inline internal::EqMatcher<T> Eq(T x) { return internal::EqMatcher<T>(x); }
+inline internal::EqMatcher<T> Eq(T x) {
+  return internal::EqMatcher<T>(x);
+}
 
 // Constructs a Matcher<T> from a 'value' of type T.  The constructed
 // matcher matches any value that's equal to 'value'.
 template <typename T>
-Matcher<T>::Matcher(T value) { *this = Eq(value); }
+Matcher<T>::Matcher(T value) {
+  *this = Eq(value);
+}
 
 // Creates a monomorphic matcher that matches anything with type Lhs
 // and equal to rhs.  A user may need to use this instead of Eq(...)
@@ -712,7 +916,9 @@ Matcher<T>::Matcher(T value) { *this = Eq(value); }
 // can always write Matcher<T>(Lt(5)) to be explicit about the type,
 // for example.
 template <typename Lhs, typename Rhs>
-inline Matcher<Lhs> TypedEq(const Rhs& rhs) { return Eq(rhs); }
+inline Matcher<Lhs> TypedEq(const Rhs& rhs) {
+  return Eq(rhs);
+}
 
 // Creates a polymorphic matcher that matches anything >= x.
 template <typename Rhs>
@@ -747,4 +953,4 @@ inline internal::NeMatcher<Rhs> Ne(Rhs x) {
 
 GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251 5046
 
-#endif  // GTEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
diff --git a/third_party/googletest/src/include/gtest/gtest-message.h b/third_party/googletest/src/include/gtest/gtest-message.h
index 4a80e11e6b..6c8bf90009 100644
--- a/third_party/googletest/src/include/gtest/gtest-message.h
+++ b/third_party/googletest/src/include/gtest/gtest-message.h
@@ -27,7 +27,6 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-//
 // The Google C++ Testing and Mocking Framework (Google Test)
 //
 // This header file defines the Message class.
@@ -42,13 +41,16 @@
 // to CHANGE WITHOUT NOTICE.  Therefore DO NOT DEPEND ON IT in a user
 // program!
 
-// GOOGLETEST_CM0001 DO NOT DELETE
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
 
-#ifndef GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
-#define GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
 
 #include <limits>
 #include <memory>
+#include <sstream>
 
 #include "gtest/internal/gtest-port.h"
 
@@ -109,8 +111,8 @@ class GTEST_API_ Message {
 
   // Streams a non-pointer value to this object.
   template <typename T>
-  inline Message& operator <<(const T& val) {
-    // Some libraries overload << for STL containers.  These
+  inline Message& operator<<(const T& val) {
+        // Some libraries overload << for STL containers.  These
     // overloads are defined in the global namespace instead of ::std.
     //
     // C++'s symbol lookup rule (i.e. Koenig lookup) says that these
@@ -124,7 +126,7 @@ class GTEST_API_ Message {
     // from the global namespace.  With this using declaration,
     // overloads of << defined in the global namespace and those
     // visible via Koenig lookup are both exposed in this function.
-    using ::operator <<;
+    using ::operator<<;
     *ss_ << val;
     return *this;
   }
@@ -143,7 +145,7 @@ class GTEST_API_ Message {
   // ensure consistent result across compilers, we always treat NULL
   // as "(null)".
   template <typename T>
-  inline Message& operator <<(T* const& pointer) {  // NOLINT
+  inline Message& operator<<(T* const& pointer) {  // NOLINT
     if (pointer == nullptr) {
       *ss_ << "(null)";
     } else {
@@ -158,25 +160,23 @@ class GTEST_API_ Message {
   // templatized version above.  Without this definition, streaming
   // endl or other basic IO manipulators to Message will confuse the
   // compiler.
-  Message& operator <<(BasicNarrowIoManip val) {
+  Message& operator<<(BasicNarrowIoManip val) {
     *ss_ << val;
     return *this;
   }
 
   // Instead of 1/0, we want to see true/false for bool values.
-  Message& operator <<(bool b) {
-    return *this << (b ? "true" : "false");
-  }
+  Message& operator<<(bool b) { return *this << (b ? "true" : "false"); }
 
   // These two overloads allow streaming a wide C string to a Message
   // using the UTF-8 encoding.
-  Message& operator <<(const wchar_t* wide_c_str);
-  Message& operator <<(wchar_t* wide_c_str);
+  Message& operator<<(const wchar_t* wide_c_str);
+  Message& operator<<(wchar_t* wide_c_str);
 
 #if GTEST_HAS_STD_WSTRING
   // Converts the given wide string to a narrow string using the UTF-8
   // encoding, and streams the result to this Message object.
-  Message& operator <<(const ::std::wstring& wstr);
+  Message& operator<<(const ::std::wstring& wstr);
 #endif  // GTEST_HAS_STD_WSTRING
 
   // Gets the text streamed to this object so far as an std::string.
@@ -195,7 +195,7 @@ class GTEST_API_ Message {
 };
 
 // Streams a Message to an ostream.
-inline std::ostream& operator <<(std::ostream& os, const Message& sb) {
+inline std::ostream& operator<<(std::ostream& os, const Message& sb) {
   return os << sb.GetString();
 }
 
@@ -215,4 +215,4 @@ std::string StreamableToString(const T& streamable) {
 
 GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
 
-#endif  // GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
diff --git a/third_party/googletest/src/include/gtest/gtest-param-test.h b/third_party/googletest/src/include/gtest/gtest-param-test.h
index c2e6eae3d8..b55119ac62 100644
--- a/third_party/googletest/src/include/gtest/gtest-param-test.h
+++ b/third_party/googletest/src/include/gtest/gtest-param-test.h
@@ -26,16 +26,16 @@
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+
 // Macros and functions for implementing parameterized tests
 // in Google C++ Testing and Mocking Framework (Google Test)
-//
-// This file is generated by a SCRIPT.  DO NOT EDIT BY HAND!
-//
-// GOOGLETEST_CM0001 DO NOT DELETE
-#ifndef GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
-#define GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
 
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
 
 // Value-parameterized tests allow you to test your code with different
 // parameters without writing multiple copies of the same test.
@@ -356,9 +356,7 @@ internal::ValueArray<T...> Values(T... v) {
 // }
 // INSTANTIATE_TEST_SUITE_P(BoolSequence, FlagDependentTest, Bool());
 //
-inline internal::ParamGenerator<bool> Bool() {
-  return Values(false, true);
-}
+inline internal::ParamGenerator<bool> Bool() { return Values(false, true); }
 
 // Combine() allows the user to combine two or more sequences to produce
 // values of a Cartesian product of those sequences' elements.
@@ -371,8 +369,6 @@ inline internal::ParamGenerator<bool> Bool() {
 //     std::tuple<T1, T2, ..., TN> where T1, T2, ..., TN are the types
 //     of elements from sequences produces by gen1, gen2, ..., genN.
 //
-// Combine can have up to 10 arguments.
-//
 // Example:
 //
 // This will instantiate tests in test suite AnimalTest each one with
@@ -416,24 +412,28 @@ internal::CartesianProductHolder<Generator...> Combine(const Generator&... g) {
       : public test_suite_name {                                               \
    public:                                                                     \
     GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() {}                    \
-    virtual void TestBody();                                                   \
+    void TestBody() override;                                                  \
                                                                                \
    private:                                                                    \
     static int AddToRegistry() {                                               \
       ::testing::UnitTest::GetInstance()                                       \
           ->parameterized_test_registry()                                      \
           .GetTestSuitePatternHolder<test_suite_name>(                         \
-              #test_suite_name,                                                \
+              GTEST_STRINGIFY_(test_suite_name),                               \
               ::testing::internal::CodeLocation(__FILE__, __LINE__))           \
           ->AddTestPattern(                                                    \
               GTEST_STRINGIFY_(test_suite_name), GTEST_STRINGIFY_(test_name),  \
               new ::testing::internal::TestMetaFactory<GTEST_TEST_CLASS_NAME_( \
-                  test_suite_name, test_name)>());                             \
+                  test_suite_name, test_name)>(),                              \
+              ::testing::internal::CodeLocation(__FILE__, __LINE__));          \
       return 0;                                                                \
     }                                                                          \
     static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_;               \
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name,    \
-                                                           test_name));        \
+    GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)                         \
+    (const GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) &) = delete;     \
+    GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) & operator=(            \
+        const GTEST_TEST_CLASS_NAME_(test_suite_name,                          \
+                                     test_name) &) = delete; /* NOLINT */      \
   };                                                                           \
   int GTEST_TEST_CLASS_NAME_(test_suite_name,                                  \
                              test_name)::gtest_registering_dummy_ =            \
@@ -457,39 +457,46 @@ internal::CartesianProductHolder<Generator...> Combine(const Generator&... g) {
 #define GTEST_GET_FIRST_(first, ...) first
 #define GTEST_GET_SECOND_(first, second, ...) second
 
-#define INSTANTIATE_TEST_SUITE_P(prefix, test_suite_name, ...)                \
-  static ::testing::internal::ParamGenerator<test_suite_name::ParamType>      \
-      gtest_##prefix##test_suite_name##_EvalGenerator_() {                    \
-    return GTEST_EXPAND_(GTEST_GET_FIRST_(__VA_ARGS__, DUMMY_PARAM_));        \
-  }                                                                           \
-  static ::std::string gtest_##prefix##test_suite_name##_EvalGenerateName_(   \
-      const ::testing::TestParamInfo<test_suite_name::ParamType>& info) {     \
-    if (::testing::internal::AlwaysFalse()) {                                 \
-      ::testing::internal::TestNotEmpty(GTEST_EXPAND_(GTEST_GET_SECOND_(      \
-          __VA_ARGS__,                                                        \
-          ::testing::internal::DefaultParamName<test_suite_name::ParamType>,  \
-          DUMMY_PARAM_)));                                                    \
-      auto t = std::make_tuple(__VA_ARGS__);                                  \
-      static_assert(std::tuple_size<decltype(t)>::value <= 2,                 \
-                    "Too Many Args!");                                        \
-    }                                                                         \
-    return ((GTEST_EXPAND_(GTEST_GET_SECOND_(                                 \
-        __VA_ARGS__,                                                          \
-        ::testing::internal::DefaultParamName<test_suite_name::ParamType>,    \
-        DUMMY_PARAM_))))(info);                                               \
-  }                                                                           \
-  static int gtest_##prefix##test_suite_name##_dummy_                         \
-      GTEST_ATTRIBUTE_UNUSED_ =                                               \
-          ::testing::UnitTest::GetInstance()                                  \
-              ->parameterized_test_registry()                                 \
-              .GetTestSuitePatternHolder<test_suite_name>(                    \
-                  #test_suite_name,                                           \
-                  ::testing::internal::CodeLocation(__FILE__, __LINE__))      \
-              ->AddTestSuiteInstantiation(                                    \
-                  #prefix, &gtest_##prefix##test_suite_name##_EvalGenerator_, \
-                  &gtest_##prefix##test_suite_name##_EvalGenerateName_,       \
+#define INSTANTIATE_TEST_SUITE_P(prefix, test_suite_name, ...)               \
+  static ::testing::internal::ParamGenerator<test_suite_name::ParamType>     \
+      gtest_##prefix##test_suite_name##_EvalGenerator_() {                   \
+    return GTEST_EXPAND_(GTEST_GET_FIRST_(__VA_ARGS__, DUMMY_PARAM_));       \
+  }                                                                          \
+  static ::std::string gtest_##prefix##test_suite_name##_EvalGenerateName_(  \
+      const ::testing::TestParamInfo<test_suite_name::ParamType>& info) {    \
+    if (::testing::internal::AlwaysFalse()) {                                \
+      ::testing::internal::TestNotEmpty(GTEST_EXPAND_(GTEST_GET_SECOND_(     \
+          __VA_ARGS__,                                                       \
+          ::testing::internal::DefaultParamName<test_suite_name::ParamType>, \
+          DUMMY_PARAM_)));                                                   \
+      auto t = std::make_tuple(__VA_ARGS__);                                 \
+      static_assert(std::tuple_size<decltype(t)>::value <= 2,                \
+                    "Too Many Args!");                                       \
+    }                                                                        \
+    return ((GTEST_EXPAND_(GTEST_GET_SECOND_(                                \
+        __VA_ARGS__,                                                         \
+        ::testing::internal::DefaultParamName<test_suite_name::ParamType>,   \
+        DUMMY_PARAM_))))(info);                                              \
+  }                                                                          \
+  static int gtest_##prefix##test_suite_name##_dummy_                        \
+      GTEST_ATTRIBUTE_UNUSED_ =                                              \
+          ::testing::UnitTest::GetInstance()                                 \
+              ->parameterized_test_registry()                                \
+              .GetTestSuitePatternHolder<test_suite_name>(                   \
+                  GTEST_STRINGIFY_(test_suite_name),                         \
+                  ::testing::internal::CodeLocation(__FILE__, __LINE__))     \
+              ->AddTestSuiteInstantiation(                                   \
+                  GTEST_STRINGIFY_(prefix),                                  \
+                  &gtest_##prefix##test_suite_name##_EvalGenerator_,         \
+                  &gtest_##prefix##test_suite_name##_EvalGenerateName_,      \
                   __FILE__, __LINE__)
 
+// Allow Marking a Parameterized test class as not needing to be instantiated.
+#define GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(T)                  \
+  namespace gtest_do_not_use_outside_namespace_scope {}                   \
+  static const ::testing::internal::MarkAsIgnored gtest_allow_ignore_##T( \
+      GTEST_STRINGIFY_(T))
+
 // Legacy API is deprecated but still available
 #ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 #define INSTANTIATE_TEST_CASE_P                                            \
@@ -500,4 +507,4 @@ internal::CartesianProductHolder<Generator...> Combine(const Generator&... g) {
 
 }  // namespace testing
 
-#endif  // GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
diff --git a/third_party/googletest/src/include/gtest/gtest-printers.h b/third_party/googletest/src/include/gtest/gtest-printers.h
index 56a05450ef..a91e8b8b10 100644
--- a/third_party/googletest/src/include/gtest/gtest-printers.h
+++ b/third_party/googletest/src/include/gtest/gtest-printers.h
@@ -27,7 +27,6 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-
 // Google Test - The Google C++ Testing and Mocking Framework
 //
 // This file implements a universal value printer that can print a
@@ -95,12 +94,15 @@
 // being defined as many user-defined container types don't have
 // value_type.
 
-// GOOGLETEST_CM0001 DO NOT DELETE
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
 
-#ifndef GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
-#define GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
 
 #include <functional>
+#include <memory>
 #include <ostream>  // NOLINT
 #include <sstream>
 #include <string>
@@ -108,64 +110,124 @@
 #include <type_traits>
 #include <utility>
 #include <vector>
+
 #include "gtest/internal/gtest-internal.h"
 #include "gtest/internal/gtest-port.h"
 
-#if GTEST_HAS_ABSL
-#include "absl/strings/string_view.h"
-#include "absl/types/optional.h"
-#include "absl/types/variant.h"
-#endif  // GTEST_HAS_ABSL
-
 namespace testing {
 
-// Definitions in the 'internal' and 'internal2' name spaces are
-// subject to change without notice.  DO NOT USE THEM IN USER CODE!
-namespace internal2 {
+// Definitions in the internal* namespaces are subject to change without notice.
+// DO NOT USE THEM IN USER CODE!
+namespace internal {
 
-// Prints the given number of bytes in the given object to the given
-// ostream.
-GTEST_API_ void PrintBytesInObjectTo(const unsigned char* obj_bytes,
-                                     size_t count,
-                                     ::std::ostream* os);
-
-// For selecting which printer to use when a given type has neither <<
-// nor PrintTo().
-enum TypeKind {
-  kProtobuf,              // a protobuf type
-  kConvertibleToInteger,  // a type implicitly convertible to BiggestInt
-                          // (e.g. a named or unnamed enum type)
-#if GTEST_HAS_ABSL
-  kConvertibleToStringView,  // a type implicitly convertible to
-                             // absl::string_view
-#endif
-  kOtherType  // anything else
+template <typename T>
+void UniversalPrint(const T& value, ::std::ostream* os);
+
+// Used to print an STL-style container when the user doesn't define
+// a PrintTo() for it.
+struct ContainerPrinter {
+  template <typename T,
+            typename = typename std::enable_if<
+                (sizeof(IsContainerTest<T>(0)) == sizeof(IsContainer)) &&
+                !IsRecursiveContainer<T>::value>::type>
+  static void PrintValue(const T& container, std::ostream* os) {
+    const size_t kMaxCount = 32;  // The maximum number of elements to print.
+    *os << '{';
+    size_t count = 0;
+    for (auto&& elem : container) {
+      if (count > 0) {
+        *os << ',';
+        if (count == kMaxCount) {  // Enough has been printed.
+          *os << " ...";
+          break;
+        }
+      }
+      *os << ' ';
+      // We cannot call PrintTo(elem, os) here as PrintTo() doesn't
+      // handle `elem` being a native array.
+      internal::UniversalPrint(elem, os);
+      ++count;
+    }
+
+    if (count > 0) {
+      *os << ' ';
+    }
+    *os << '}';
+  }
 };
 
-// TypeWithoutFormatter<T, kTypeKind>::PrintValue(value, os) is called
-// by the universal printer to print a value of type T when neither
-// operator<< nor PrintTo() is defined for T, where kTypeKind is the
-// "kind" of T as defined by enum TypeKind.
-template <typename T, TypeKind kTypeKind>
-class TypeWithoutFormatter {
- public:
-  // This default version is called when kTypeKind is kOtherType.
+// Used to print a pointer that is neither a char pointer nor a member
+// pointer, when the user doesn't define PrintTo() for it.  (A member
+// variable pointer or member function pointer doesn't really point to
+// a location in the address space.  Their representation is
+// implementation-defined.  Therefore they will be printed as raw
+// bytes.)
+struct FunctionPointerPrinter {
+  template <typename T, typename = typename std::enable_if<
+                            std::is_function<T>::value>::type>
+  static void PrintValue(T* p, ::std::ostream* os) {
+    if (p == nullptr) {
+      *os << "NULL";
+    } else {
+      // T is a function type, so '*os << p' doesn't do what we want
+      // (it just prints p as bool).  We want to print p as a const
+      // void*.
+      *os << reinterpret_cast<const void*>(p);
+    }
+  }
+};
+
+struct PointerPrinter {
+  template <typename T>
+  static void PrintValue(T* p, ::std::ostream* os) {
+    if (p == nullptr) {
+      *os << "NULL";
+    } else {
+      // T is not a function type.  We just call << to print p,
+      // relying on ADL to pick up user-defined << for their pointer
+      // types, if any.
+      *os << p;
+    }
+  }
+};
+
+namespace internal_stream_operator_without_lexical_name_lookup {
+
+// The presence of an operator<< here will terminate lexical scope lookup
+// straight away (even though it cannot be a match because of its argument
+// types). Thus, the two operator<< calls in StreamPrinter will find only ADL
+// candidates.
+struct LookupBlocker {};
+void operator<<(LookupBlocker, LookupBlocker);
+
+struct StreamPrinter {
+  template <typename T,
+            // Don't accept member pointers here. We'd print them via implicit
+            // conversion to bool, which isn't useful.
+            typename = typename std::enable_if<
+                !std::is_member_pointer<T>::value>::type,
+            // Only accept types for which we can find a streaming operator via
+            // ADL (possibly involving implicit conversions).
+            typename = decltype(std::declval<std::ostream&>()
+                                << std::declval<const T&>())>
   static void PrintValue(const T& value, ::std::ostream* os) {
-    PrintBytesInObjectTo(
-        static_cast<const unsigned char*>(
-            reinterpret_cast<const void*>(std::addressof(value))),
-        sizeof(value), os);
+    // Call streaming operator found by ADL, possibly with implicit conversions
+    // of the arguments.
+    *os << value;
   }
 };
 
-// We print a protobuf using its ShortDebugString() when the string
-// doesn't exceed this many characters; otherwise we print it using
-// DebugString() for better readability.
-const size_t kProtobufOneLinerMaxLength = 50;
+}  // namespace internal_stream_operator_without_lexical_name_lookup
 
-template <typename T>
-class TypeWithoutFormatter<T, kProtobuf> {
- public:
+struct ProtobufPrinter {
+  // We print a protobuf using its ShortDebugString() when the string
+  // doesn't exceed this many characters; otherwise we print it using
+  // DebugString() for better readability.
+  static const size_t kProtobufOneLinerMaxLength = 50;
+
+  template <typename T,
+            typename = typename std::enable_if<
+                internal::HasDebugStringAndShortDebugString<T>::value>::type>
   static void PrintValue(const T& value, ::std::ostream* os) {
     std::string pretty_str = value.ShortDebugString();
     if (pretty_str.length() > kProtobufOneLinerMaxLength) {
@@ -175,9 +237,7 @@ class TypeWithoutFormatter<T, kProtobuf> {
   }
 };
 
-template <typename T>
-class TypeWithoutFormatter<T, kConvertibleToInteger> {
- public:
+struct ConvertibleToIntegerPrinter {
   // Since T has no << operator or PrintTo() but can be implicitly
   // converted to BiggestInt, we print it as a BiggestInt.
   //
@@ -185,113 +245,72 @@ class TypeWithoutFormatter<T, kConvertibleToInteger> {
   // case printing it as an integer is the desired behavior.  In case
   // T is not an enum, printing it as an integer is the best we can do
   // given that it has no user-defined printer.
-  static void PrintValue(const T& value, ::std::ostream* os) {
-    const internal::BiggestInt kBigInt = value;
-    *os << kBigInt;
+  static void PrintValue(internal::BiggestInt value, ::std::ostream* os) {
+    *os << value;
   }
 };
 
-#if GTEST_HAS_ABSL
-template <typename T>
-class TypeWithoutFormatter<T, kConvertibleToStringView> {
- public:
-  // Since T has neither operator<< nor PrintTo() but can be implicitly
-  // converted to absl::string_view, we print it as a absl::string_view.
-  //
-  // Note: the implementation is further below, as it depends on
-  // internal::PrintTo symbol which is defined later in the file.
-  static void PrintValue(const T& value, ::std::ostream* os);
-};
+struct ConvertibleToStringViewPrinter {
+#if GTEST_INTERNAL_HAS_STRING_VIEW
+  static void PrintValue(internal::StringView value, ::std::ostream* os) {
+    internal::UniversalPrint(value, os);
+  }
 #endif
+};
 
-// Prints the given value to the given ostream.  If the value is a
-// protocol message, its debug string is printed; if it's an enum or
-// of a type implicitly convertible to BiggestInt, it's printed as an
-// integer; otherwise the bytes in the value are printed.  This is
-// what UniversalPrinter<T>::Print() does when it knows nothing about
-// type T and T has neither << operator nor PrintTo().
-//
-// A user can override this behavior for a class type Foo by defining
-// a << operator in the namespace where Foo is defined.
-//
-// We put this operator in namespace 'internal2' instead of 'internal'
-// to simplify the implementation, as much code in 'internal' needs to
-// use << in STL, which would conflict with our own << were it defined
-// in 'internal'.
-//
-// Note that this operator<< takes a generic std::basic_ostream<Char,
-// CharTraits> type instead of the more restricted std::ostream.  If
-// we define it to take an std::ostream instead, we'll get an
-// "ambiguous overloads" compiler error when trying to print a type
-// Foo that supports streaming to std::basic_ostream<Char,
-// CharTraits>, as the compiler cannot tell whether
-// operator<<(std::ostream&, const T&) or
-// operator<<(std::basic_stream<Char, CharTraits>, const Foo&) is more
-// specific.
-template <typename Char, typename CharTraits, typename T>
-::std::basic_ostream<Char, CharTraits>& operator<<(
-    ::std::basic_ostream<Char, CharTraits>& os, const T& x) {
-  TypeWithoutFormatter<T, (internal::IsAProtocolMessage<T>::value
-                               ? kProtobuf
-                               : std::is_convertible<
-                                     const T&, internal::BiggestInt>::value
-                                     ? kConvertibleToInteger
-                                     :
-#if GTEST_HAS_ABSL
-                                     std::is_convertible<
-                                         const T&, absl::string_view>::value
-                                         ? kConvertibleToStringView
-                                         :
-#endif
-                                         kOtherType)>::PrintValue(x, &os);
-  return os;
-}
+// Prints the given number of bytes in the given object to the given
+// ostream.
+GTEST_API_ void PrintBytesInObjectTo(const unsigned char* obj_bytes,
+                                     size_t count, ::std::ostream* os);
+struct RawBytesPrinter {
+  // SFINAE on `sizeof` to make sure we have a complete type.
+  template <typename T, size_t = sizeof(T)>
+  static void PrintValue(const T& value, ::std::ostream* os) {
+    PrintBytesInObjectTo(
+        static_cast<const unsigned char*>(
+            // Load bearing cast to void* to support iOS
+            reinterpret_cast<const void*>(std::addressof(value))),
+        sizeof(value), os);
+  }
+};
 
-}  // namespace internal2
-}  // namespace testing
+struct FallbackPrinter {
+  template <typename T>
+  static void PrintValue(const T&, ::std::ostream* os) {
+    *os << "(incomplete type)";
+  }
+};
 
-// This namespace MUST NOT BE NESTED IN ::testing, or the name look-up
-// magic needed for implementing UniversalPrinter won't work.
-namespace testing_internal {
+// Try every printer in order and return the first one that works.
+template <typename T, typename E, typename Printer, typename... Printers>
+struct FindFirstPrinter : FindFirstPrinter<T, E, Printers...> {};
 
-// Used to print a value that is not an STL-style container when the
-// user doesn't define PrintTo() for it.
-template <typename T>
-void DefaultPrintNonContainerTo(const T& value, ::std::ostream* os) {
-  // With the following statement, during unqualified name lookup,
-  // testing::internal2::operator<< appears as if it was declared in
-  // the nearest enclosing namespace that contains both
-  // ::testing_internal and ::testing::internal2, i.e. the global
-  // namespace.  For more details, refer to the C++ Standard section
-  // 7.3.4-1 [namespace.udir].  This allows us to fall back onto
-  // testing::internal2::operator<< in case T doesn't come with a <<
-  // operator.
-  //
-  // We cannot write 'using ::testing::internal2::operator<<;', which
-  // gcc 3.3 fails to compile due to a compiler bug.
-  using namespace ::testing::internal2;  // NOLINT
+template <typename T, typename Printer, typename... Printers>
+struct FindFirstPrinter<
+    T, decltype(Printer::PrintValue(std::declval<const T&>(), nullptr)),
+    Printer, Printers...> {
+  using type = Printer;
+};
 
-  // Assuming T is defined in namespace foo, in the next statement,
-  // the compiler will consider all of:
-  //
-  //   1. foo::operator<< (thanks to Koenig look-up),
-  //   2. ::operator<< (as the current namespace is enclosed in ::),
-  //   3. testing::internal2::operator<< (thanks to the using statement above).
-  //
-  // The operator<< whose type matches T best will be picked.
-  //
-  // We deliberately allow #2 to be a candidate, as sometimes it's
-  // impossible to define #1 (e.g. when foo is ::std, defining
-  // anything in it is undefined behavior unless you are a compiler
-  // vendor.).
-  *os << value;
+// Select the best printer in the following order:
+//  - Print containers (they have begin/end/etc).
+//  - Print function pointers.
+//  - Print object pointers.
+//  - Use the stream operator, if available.
+//  - Print protocol buffers.
+//  - Print types convertible to BiggestInt.
+//  - Print types convertible to StringView, if available.
+//  - Fallback to printing the raw bytes of the object.
+template <typename T>
+void PrintWithFallback(const T& value, ::std::ostream* os) {
+  using Printer = typename FindFirstPrinter<
+      T, void, ContainerPrinter, FunctionPointerPrinter, PointerPrinter,
+      internal_stream_operator_without_lexical_name_lookup::StreamPrinter,
+      ProtobufPrinter, ConvertibleToIntegerPrinter,
+      ConvertibleToStringViewPrinter, RawBytesPrinter, FallbackPrinter>::type;
+  Printer::PrintValue(value, os);
 }
 
-}  // namespace testing_internal
-
-namespace testing {
-namespace internal {
-
 // FormatForComparison<ToPrint, OtherOperand>::Format(value) formats a
 // value of type ToPrint that is an operand of a comparison assertion
 // (e.g. ASSERT_EQ).  OtherOperand is the type of the other operand in
@@ -340,6 +359,14 @@ GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char);
 GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char);
 GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(wchar_t);
 GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t);
+#ifdef __cpp_lib_char8_t
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char8_t);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char8_t);
+#endif
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char16_t);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char16_t);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char32_t);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char32_t);
 
 #undef GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_
 
@@ -347,16 +374,24 @@ GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t);
 // to point to a NUL-terminated string, and thus can print it as a string.
 
 #define GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(CharType, OtherStringType) \
-  template <>                                                           \
-  class FormatForComparison<CharType*, OtherStringType> {               \
-   public:                                                              \
-    static ::std::string Format(CharType* value) {                      \
-      return ::testing::PrintToString(value);                           \
-    }                                                                   \
+  template <>                                                            \
+  class FormatForComparison<CharType*, OtherStringType> {                \
+   public:                                                               \
+    static ::std::string Format(CharType* value) {                       \
+      return ::testing::PrintToString(value);                            \
+    }                                                                    \
   }
 
 GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::std::string);
 GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::std::string);
+#ifdef __cpp_char8_t
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char8_t, ::std::u8string);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char8_t, ::std::u8string);
+#endif
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char16_t, ::std::u16string);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char16_t, ::std::u16string);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char32_t, ::std::u32string);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char32_t, ::std::u32string);
 
 #if GTEST_HAS_STD_WSTRING
 GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::std::wstring);
@@ -374,8 +409,8 @@ GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::std::wstring);
 //
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
 template <typename T1, typename T2>
-std::string FormatForComparisonFailureMessage(
-    const T1& value, const T2& /* other_operand */) {
+std::string FormatForComparisonFailureMessage(const T1& value,
+                                              const T2& /* other_operand */) {
   return FormatForComparison<T1, T2>::Format(value);
 }
 
@@ -389,85 +424,6 @@ std::string FormatForComparisonFailureMessage(
 template <typename T>
 class UniversalPrinter;
 
-template <typename T>
-void UniversalPrint(const T& value, ::std::ostream* os);
-
-enum DefaultPrinterType {
-  kPrintContainer,
-  kPrintPointer,
-  kPrintFunctionPointer,
-  kPrintOther,
-};
-template <DefaultPrinterType type> struct WrapPrinterType {};
-
-// Used to print an STL-style container when the user doesn't define
-// a PrintTo() for it.
-template <typename C>
-void DefaultPrintTo(WrapPrinterType<kPrintContainer> /* dummy */,
-                    const C& container, ::std::ostream* os) {
-  const size_t kMaxCount = 32;  // The maximum number of elements to print.
-  *os << '{';
-  size_t count = 0;
-  for (typename C::const_iterator it = container.begin();
-       it != container.end(); ++it, ++count) {
-    if (count > 0) {
-      *os << ',';
-      if (count == kMaxCount) {  // Enough has been printed.
-        *os << " ...";
-        break;
-      }
-    }
-    *os << ' ';
-    // We cannot call PrintTo(*it, os) here as PrintTo() doesn't
-    // handle *it being a native array.
-    internal::UniversalPrint(*it, os);
-  }
-
-  if (count > 0) {
-    *os << ' ';
-  }
-  *os << '}';
-}
-
-// Used to print a pointer that is neither a char pointer nor a member
-// pointer, when the user doesn't define PrintTo() for it.  (A member
-// variable pointer or member function pointer doesn't really point to
-// a location in the address space.  Their representation is
-// implementation-defined.  Therefore they will be printed as raw
-// bytes.)
-template <typename T>
-void DefaultPrintTo(WrapPrinterType<kPrintPointer> /* dummy */,
-                    T* p, ::std::ostream* os) {
-  if (p == nullptr) {
-    *os << "NULL";
-  } else {
-    // T is not a function type.  We just call << to print p,
-    // relying on ADL to pick up user-defined << for their pointer
-    // types, if any.
-    *os << p;
-  }
-}
-template <typename T>
-void DefaultPrintTo(WrapPrinterType<kPrintFunctionPointer> /* dummy */,
-                    T* p, ::std::ostream* os) {
-  if (p == nullptr) {
-    *os << "NULL";
-  } else {
-    // T is a function type, so '*os << p' doesn't do what we want
-    // (it just prints p as bool).  We want to print p as a const
-    // void*.
-    *os << reinterpret_cast<const void*>(p);
-  }
-}
-
-// Used to print a non-container, non-pointer value when the user
-// doesn't define PrintTo() for it.
-template <typename T>
-void DefaultPrintTo(WrapPrinterType<kPrintOther> /* dummy */,
-                    const T& value, ::std::ostream* os) {
-  ::testing_internal::DefaultPrintNonContainerTo(value, os);
-}
-
 // Prints the given value using the << operator if it has one;
 // otherwise prints the bytes in it.  This is what
 // UniversalPrinter<T>::Print() does when PrintTo() is not specialized
@@ -481,36 +437,7 @@ void DefaultPrintTo(WrapPrinterType<kPrintOther> /* dummy */,
 // wants).
 template <typename T>
 void PrintTo(const T& value, ::std::ostream* os) {
-  // DefaultPrintTo() is overloaded.  The type of its first argument
-  // determines which version will be picked.
-  //
-  // Note that we check for container types here, prior to we check
-  // for protocol message types in our operator<<.  The rationale is:
-  //
-  // For protocol messages, we want to give people a chance to
-  // override Google Mock's format by defining a PrintTo() or
-  // operator<<.  For STL containers, other formats can be
-  // incompatible with Google Mock's format for the container
-  // elements; therefore we check for container types here to ensure
-  // that our format is used.
-  //
-  // Note that MSVC and clang-cl do allow an implicit conversion from
-  // pointer-to-function to pointer-to-object, but clang-cl warns on it.
-  // So don't use ImplicitlyConvertible if it can be helped since it will
-  // cause this warning, and use a separate overload of DefaultPrintTo for
-  // function pointers so that the `*os << p` in the object pointer overload
-  // doesn't cause that warning either.
-  DefaultPrintTo(
-      WrapPrinterType <
-                  (sizeof(IsContainerTest<T>(0)) == sizeof(IsContainer)) &&
-              !IsRecursiveContainer<T>::value
-          ? kPrintContainer
-          : !std::is_pointer<T>::value
-                ? kPrintOther
-                : std::is_function<typename std::remove_pointer<T>::type>::value
-                      ? kPrintFunctionPointer
-                      : kPrintPointer > (),
-      value, os);
+  internal::PrintWithFallback(value, os);
 }
 
 // The following list of PrintTo() overloads tells
@@ -541,6 +468,22 @@ inline void PrintTo(bool x, ::std::ostream* os) {
 // is implemented as an unsigned type.
 GTEST_API_ void PrintTo(wchar_t wc, ::std::ostream* os);
 
+GTEST_API_ void PrintTo(char32_t c, ::std::ostream* os);
+inline void PrintTo(char16_t c, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<char32_t>(c), os);
+}
+#ifdef __cpp_char8_t
+inline void PrintTo(char8_t c, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<char32_t>(c), os);
+}
+#endif
+
+// gcc/clang __{u,}int128_t
+#if defined(__SIZEOF_INT128__)
+GTEST_API_ void PrintTo(__uint128_t v, ::std::ostream* os);
+GTEST_API_ void PrintTo(__int128_t v, ::std::ostream* os);
+#endif  // __SIZEOF_INT128__
+
 // Overloads for C strings.
 GTEST_API_ void PrintTo(const char* s, ::std::ostream* os);
 inline void PrintTo(char* s, ::std::ostream* os) {
@@ -561,6 +504,23 @@ inline void PrintTo(const unsigned char* s, ::std::ostream* os) {
 inline void PrintTo(unsigned char* s, ::std::ostream* os) {
   PrintTo(ImplicitCast_<const void*>(s), os);
 }
+#ifdef __cpp_char8_t
+// Overloads for u8 strings.
+GTEST_API_ void PrintTo(const char8_t* s, ::std::ostream* os);
+inline void PrintTo(char8_t* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const char8_t*>(s), os);
+}
+#endif
+// Overloads for u16 strings.
+GTEST_API_ void PrintTo(const char16_t* s, ::std::ostream* os);
+inline void PrintTo(char16_t* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const char16_t*>(s), os);
+}
+// Overloads for u32 strings.
+GTEST_API_ void PrintTo(const char32_t* s, ::std::ostream* os);
+inline void PrintTo(char32_t* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const char32_t*>(s), os);
+}
 
 // MSVC can be configured to define wchar_t as a typedef of unsigned
 // short.  It defines _NATIVE_WCHAR_T_DEFINED when wchar_t is a native
@@ -590,33 +550,96 @@ void PrintRawArrayTo(const T a[], size_t count, ::std::ostream* os) {
 }
 
 // Overloads for ::std::string.
-GTEST_API_ void PrintStringTo(const ::std::string&s, ::std::ostream* os);
+GTEST_API_ void PrintStringTo(const ::std::string& s, ::std::ostream* os);
 inline void PrintTo(const ::std::string& s, ::std::ostream* os) {
   PrintStringTo(s, os);
 }
 
+// Overloads for ::std::u8string
+#ifdef __cpp_char8_t
+GTEST_API_ void PrintU8StringTo(const ::std::u8string& s, ::std::ostream* os);
+inline void PrintTo(const ::std::u8string& s, ::std::ostream* os) {
+  PrintU8StringTo(s, os);
+}
+#endif
+
+// Overloads for ::std::u16string
+GTEST_API_ void PrintU16StringTo(const ::std::u16string& s, ::std::ostream* os);
+inline void PrintTo(const ::std::u16string& s, ::std::ostream* os) {
+  PrintU16StringTo(s, os);
+}
+
+// Overloads for ::std::u32string
+GTEST_API_ void PrintU32StringTo(const ::std::u32string& s, ::std::ostream* os);
+inline void PrintTo(const ::std::u32string& s, ::std::ostream* os) {
+  PrintU32StringTo(s, os);
+}
+
 // Overloads for ::std::wstring.
 #if GTEST_HAS_STD_WSTRING
-GTEST_API_ void PrintWideStringTo(const ::std::wstring&s, ::std::ostream* os);
+GTEST_API_ void PrintWideStringTo(const ::std::wstring& s, ::std::ostream* os);
 inline void PrintTo(const ::std::wstring& s, ::std::ostream* os) {
   PrintWideStringTo(s, os);
 }
 #endif  // GTEST_HAS_STD_WSTRING
 
-#if GTEST_HAS_ABSL
-// Overload for absl::string_view.
-inline void PrintTo(absl::string_view sp, ::std::ostream* os) {
+#if GTEST_INTERNAL_HAS_STRING_VIEW
+// Overload for internal::StringView.
+inline void PrintTo(internal::StringView sp, ::std::ostream* os) {
   PrintTo(::std::string(sp), os);
 }
-#endif  // GTEST_HAS_ABSL
+#endif  // GTEST_INTERNAL_HAS_STRING_VIEW
 
 inline void PrintTo(std::nullptr_t, ::std::ostream* os) { *os << "(nullptr)"; }
 
+#if GTEST_HAS_RTTI
+inline void PrintTo(const std::type_info& info, std::ostream* os) {
+  *os << internal::GetTypeName(info);
+}
+#endif  // GTEST_HAS_RTTI
+
 template <typename T>
 void PrintTo(std::reference_wrapper<T> ref, ::std::ostream* os) {
   UniversalPrinter<T&>::Print(ref.get(), os);
 }
 
+inline const void* VoidifyPointer(const void* p) { return p; }
+inline const void* VoidifyPointer(volatile const void* p) {
+  return const_cast<const void*>(p);
+}
+
+template <typename T, typename Ptr>
+void PrintSmartPointer(const Ptr& ptr, std::ostream* os, char) {
+  if (ptr == nullptr) {
+    *os << "(nullptr)";
+  } else {
+    // We can't print the value. Just print the pointer..
+    *os << "(" << (VoidifyPointer)(ptr.get()) << ")";
+  }
+}
+template <typename T, typename Ptr,
+          typename = typename std::enable_if<!std::is_void<T>::value &&
+                                             !std::is_array<T>::value>::type>
+void PrintSmartPointer(const Ptr& ptr, std::ostream* os, int) {
+  if (ptr == nullptr) {
+    *os << "(nullptr)";
+  } else {
+    *os << "(ptr = " << (VoidifyPointer)(ptr.get()) << ", value = ";
+    UniversalPrinter<T>::Print(*ptr, os);
+    *os << ")";
+  }
+}
+
+template <typename T, typename D>
+void PrintTo(const std::unique_ptr<T, D>& ptr, std::ostream* os) {
+  (PrintSmartPointer<T>)(ptr, os, 0);
+}
+
+template <typename T>
+void PrintTo(const std::shared_ptr<T>& ptr, std::ostream* os) {
+  (PrintSmartPointer<T>)(ptr, os, 0);
+}
+
 // Helper function for printing a tuple.  T must be instantiated with
 // a tuple type.
 template <typename T>
@@ -682,14 +705,46 @@ class UniversalPrinter {
   GTEST_DISABLE_MSC_WARNINGS_POP_()
 };
 
-#if GTEST_HAS_ABSL
+// Remove any const-qualifiers before passing a type to UniversalPrinter.
+template <typename T>
+class UniversalPrinter<const T> : public UniversalPrinter<T> {};
+
+#if GTEST_INTERNAL_HAS_ANY
+
+// Printer for std::any / absl::any
+
+template <>
+class UniversalPrinter<Any> {
+ public:
+  static void Print(const Any& value, ::std::ostream* os) {
+    if (value.has_value()) {
+      *os << "value of type " << GetTypeName(value);
+    } else {
+      *os << "no value";
+    }
+  }
+
+ private:
+  static std::string GetTypeName(const Any& value) {
+#if GTEST_HAS_RTTI
+    return internal::GetTypeName(value.type());
+#else
+    static_cast<void>(value);  // possibly unused
+    return "<unknown_type>";
+#endif  // GTEST_HAS_RTTI
+  }
+};
+
+#endif  // GTEST_INTERNAL_HAS_ANY
 
-// Printer for absl::optional
+#if GTEST_INTERNAL_HAS_OPTIONAL
+
+// Printer for std::optional / absl::optional
 
 template <typename T>
-class UniversalPrinter<::absl::optional<T>> {
+class UniversalPrinter<Optional<T>> {
  public:
-  static void Print(const ::absl::optional<T>& value, ::std::ostream* os) {
+  static void Print(const Optional<T>& value, ::std::ostream* os) {
     *os << '(';
     if (!value) {
       *os << "nullopt";
@@ -700,14 +755,30 @@ class UniversalPrinter<::absl::optional<T>> {
   }
 };
 
-// Printer for absl::variant
+template <>
+class UniversalPrinter<decltype(Nullopt())> {
+ public:
+  static void Print(decltype(Nullopt()), ::std::ostream* os) {
+    *os << "(nullopt)";
+  }
+};
+
+#endif  // GTEST_INTERNAL_HAS_OPTIONAL
+
+#if GTEST_INTERNAL_HAS_VARIANT
+
+// Printer for std::variant / absl::variant
 
 template <typename... T>
-class UniversalPrinter<::absl::variant<T...>> {
+class UniversalPrinter<Variant<T...>> {
  public:
-  static void Print(const ::absl::variant<T...>& value, ::std::ostream* os) {
+  static void Print(const Variant<T...>& value, ::std::ostream* os) {
     *os << '(';
-    absl::visit(Visitor{os}, value);
+#if GTEST_HAS_ABSL
+    absl::visit(Visitor{os, value.index()}, value);
+#else
+    std::visit(Visitor{os, value.index()}, value);
+#endif  // GTEST_HAS_ABSL
     *os << ')';
   }
 
@@ -715,14 +786,16 @@ class UniversalPrinter<::absl::variant<T...>> {
   struct Visitor {
     template <typename U>
     void operator()(const U& u) const {
-      *os << "'" << GetTypeName<U>() << "' with value ";
+      *os << "'" << GetTypeName<U>() << "(index = " << index
+          << ")' with value ";
       UniversalPrint(u, os);
     }
     ::std::ostream* os;
+    std::size_t index;
   };
 };
 
-#endif  // GTEST_HAS_ABSL
+#endif  // GTEST_INTERNAL_HAS_VARIANT
 
 // UniversalPrintArray(begin, len, os) prints an array of 'len'
 // elements, starting at address 'begin'.
@@ -748,12 +821,26 @@ void UniversalPrintArray(const T* begin, size_t len, ::std::ostream* os) {
   }
 }
 // This overload prints a (const) char array compactly.
-GTEST_API_ void UniversalPrintArray(
-    const char* begin, size_t len, ::std::ostream* os);
+GTEST_API_ void UniversalPrintArray(const char* begin, size_t len,
+                                    ::std::ostream* os);
+
+#ifdef __cpp_char8_t
+// This overload prints a (const) char8_t array compactly.
+GTEST_API_ void UniversalPrintArray(const char8_t* begin, size_t len,
+                                    ::std::ostream* os);
+#endif
+
+// This overload prints a (const) char16_t array compactly.
+GTEST_API_ void UniversalPrintArray(const char16_t* begin, size_t len,
+                                    ::std::ostream* os);
+
+// This overload prints a (const) char32_t array compactly.
+GTEST_API_ void UniversalPrintArray(const char32_t* begin, size_t len,
+                                    ::std::ostream* os);
 
 // This overload prints a (const) wchar_t array compactly.
-GTEST_API_ void UniversalPrintArray(
-    const wchar_t* begin, size_t len, ::std::ostream* os);
+GTEST_API_ void UniversalPrintArray(const wchar_t* begin, size_t len,
+                                    ::std::ostream* os);
 
 // Implements printing an array type T[N].
 template <typename T, size_t N>
@@ -823,12 +910,55 @@ class UniversalTersePrinter<const char*> {
   }
 };
 template <>
-class UniversalTersePrinter<char*> {
+class UniversalTersePrinter<char*> : public UniversalTersePrinter<const char*> {
+};
+
+#ifdef __cpp_char8_t
+template <>
+class UniversalTersePrinter<const char8_t*> {
  public:
-  static void Print(char* str, ::std::ostream* os) {
-    UniversalTersePrinter<const char*>::Print(str, os);
+  static void Print(const char8_t* str, ::std::ostream* os) {
+    if (str == nullptr) {
+      *os << "NULL";
+    } else {
+      UniversalPrint(::std::u8string(str), os);
+    }
+  }
+};
+template <>
+class UniversalTersePrinter<char8_t*>
+    : public UniversalTersePrinter<const char8_t*> {};
+#endif
+
+template <>
+class UniversalTersePrinter<const char16_t*> {
+ public:
+  static void Print(const char16_t* str, ::std::ostream* os) {
+    if (str == nullptr) {
+      *os << "NULL";
+    } else {
+      UniversalPrint(::std::u16string(str), os);
+    }
+  }
+};
+template <>
+class UniversalTersePrinter<char16_t*>
+    : public UniversalTersePrinter<const char16_t*> {};
+
+template <>
+class UniversalTersePrinter<const char32_t*> {
+ public:
+  static void Print(const char32_t* str, ::std::ostream* os) {
+    if (str == nullptr) {
+      *os << "NULL";
+    } else {
+      UniversalPrint(::std::u32string(str), os);
+    }
   }
 };
+template <>
+class UniversalTersePrinter<char32_t*>
+    : public UniversalTersePrinter<const char32_t*> {};
 
 #if GTEST_HAS_STD_WSTRING
 template <>
@@ -869,10 +999,10 @@ void UniversalPrint(const T& value, ::std::ostream* os) {
   UniversalPrinter<T1>::Print(value, os);
 }
 
-typedef ::std::vector< ::std::string> Strings;
+typedef ::std::vector<::std::string> Strings;
 
-  // Tersely prints the first N fields of a tuple to a string vector,
-  // one element for each field.
+// Tersely prints the first N fields of a tuple to a string vector,
+// one element for each field.
 template <typename Tuple>
 void TersePrintPrefixToStrings(const Tuple&, std::integral_constant<size_t, 0>,
                                Strings*) {}
@@ -901,16 +1031,6 @@ Strings UniversalTersePrintTupleFieldsToStrings(const Tuple& value) {
 
 }  // namespace internal
 
-#if GTEST_HAS_ABSL
-namespace internal2 {
-template <typename T>
-void TypeWithoutFormatter<T, kConvertibleToStringView>::PrintValue(
-    const T& value, ::std::ostream* os) {
-  internal::PrintTo(absl::string_view(value), os);
-}
-}  // namespace internal2
-#endif
-
 template <typename T>
 ::std::string PrintToString(const T& value) {
   ::std::stringstream ss;
@@ -925,4 +1045,4 @@ ::std::string PrintToString(const T& value) {
 // declarations from this file.
 #include "gtest/internal/custom/gtest-printers.h"
 
-#endif  // GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
diff --git a/third_party/googletest/src/include/gtest/gtest-spi.h b/third_party/googletest/src/include/gtest/gtest-spi.h
index aa38870e8e..bec8c4810b 100644
--- a/third_party/googletest/src/include/gtest/gtest-spi.h
+++ b/third_party/googletest/src/include/gtest/gtest-spi.h
@@ -27,14 +27,11 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-//
 // Utilities for testing Google Test itself and code that uses Google Test
 // (e.g. frameworks built on top of Google Test).
 
-// GOOGLETEST_CM0004 DO NOT DELETE
-
-#ifndef GTEST_INCLUDE_GTEST_GTEST_SPI_H_
-#define GTEST_INCLUDE_GTEST_GTEST_SPI_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_
 
 #include "gtest/gtest.h"
 
@@ -88,7 +85,10 @@ class GTEST_API_ ScopedFakeTestPartResultReporter
   TestPartResultReporterInterface* old_reporter_;
   TestPartResultArray* const result_;
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedFakeTestPartResultReporter);
+  ScopedFakeTestPartResultReporter(const ScopedFakeTestPartResultReporter&) =
+      delete;
+  ScopedFakeTestPartResultReporter& operator=(
+      const ScopedFakeTestPartResultReporter&) = delete;
 };
 
 namespace internal {
@@ -104,12 +104,14 @@ class GTEST_API_ SingleFailureChecker {
   SingleFailureChecker(const TestPartResultArray* results,
                        TestPartResult::Type type, const std::string& substr);
   ~SingleFailureChecker();
+
  private:
   const TestPartResultArray* const results_;
   const TestPartResult::Type type_;
   const std::string substr_;
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(SingleFailureChecker);
+  SingleFailureChecker(const SingleFailureChecker&) = delete;
+  SingleFailureChecker& operator=(const SingleFailureChecker&) = delete;
 };
 
 }  // namespace internal
@@ -119,7 +121,8 @@ class GTEST_API_ SingleFailureChecker {
 GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
 
 // A set of macros for testing Google Test assertions or code that's expected
-// to generate Google Test fatal failures.  It verifies that the given
+// to generate Google Test fatal failures (e.g. a failure from an ASSERT_EQ, but
+// not a non-fatal failure, as from EXPECT_EQ).  It verifies that the given
 // statement will cause exactly one fatal Google Test failure with 'substr'
 // being part of the failure message.
 //
@@ -141,44 +144,46 @@ GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
 // helper macro, due to some peculiarity in how the preprocessor
 // works.  The AcceptsMacroThatExpandsToUnprotectedComma test in
 // gtest_unittest.cc will fail to compile if we do that.
-#define EXPECT_FATAL_FAILURE(statement, substr) \
-  do { \
-    class GTestExpectFatalFailureHelper {\
-     public:\
-      static void Execute() { statement; }\
-    };\
-    ::testing::TestPartResultArray gtest_failures;\
-    ::testing::internal::SingleFailureChecker gtest_checker(\
-        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr));\
-    {\
-      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
-          ::testing::ScopedFakeTestPartResultReporter:: \
-          INTERCEPT_ONLY_CURRENT_THREAD, &gtest_failures);\
-      GTestExpectFatalFailureHelper::Execute();\
-    }\
+#define EXPECT_FATAL_FAILURE(statement, substr)                               \
+  do {                                                                        \
+    class GTestExpectFatalFailureHelper {                                     \
+     public:                                                                  \
+      static void Execute() { statement; }                                    \
+    };                                                                        \
+    ::testing::TestPartResultArray gtest_failures;                            \
+    ::testing::internal::SingleFailureChecker gtest_checker(                  \
+        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr)); \
+    {                                                                         \
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(             \
+          ::testing::ScopedFakeTestPartResultReporter::                       \
+              INTERCEPT_ONLY_CURRENT_THREAD,                                  \
+          &gtest_failures);                                                   \
+      GTestExpectFatalFailureHelper::Execute();                               \
+    }                                                                         \
   } while (::testing::internal::AlwaysFalse())
 
-#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
-  do { \
-    class GTestExpectFatalFailureHelper {\
-     public:\
-      static void Execute() { statement; }\
-    };\
-    ::testing::TestPartResultArray gtest_failures;\
-    ::testing::internal::SingleFailureChecker gtest_checker(\
-        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr));\
-    {\
-      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
-          ::testing::ScopedFakeTestPartResultReporter:: \
-          INTERCEPT_ALL_THREADS, &gtest_failures);\
-      GTestExpectFatalFailureHelper::Execute();\
-    }\
+#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr)                \
+  do {                                                                        \
+    class GTestExpectFatalFailureHelper {                                     \
+     public:                                                                  \
+      static void Execute() { statement; }                                    \
+    };                                                                        \
+    ::testing::TestPartResultArray gtest_failures;                            \
+    ::testing::internal::SingleFailureChecker gtest_checker(                  \
+        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr)); \
+    {                                                                         \
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(             \
+          ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \
+          &gtest_failures);                                                   \
+      GTestExpectFatalFailureHelper::Execute();                               \
+    }                                                                         \
   } while (::testing::internal::AlwaysFalse())
 
 // A macro for testing Google Test assertions or code that's expected to
-// generate Google Test non-fatal failures.  It asserts that the given
-// statement will cause exactly one non-fatal Google Test failure with 'substr'
-// being part of the failure message.
+// generate Google Test non-fatal failures (e.g. a failure from an EXPECT_EQ,
+// but not from an ASSERT_EQ). It asserts that the given statement will cause
+// exactly one non-fatal Google Test failure with 'substr' being part of the
+// failure message.
 //
 // There are two different versions of this macro. EXPECT_NONFATAL_FAILURE only
 // affects and considers failures generated in the current thread and
@@ -207,32 +212,37 @@ GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
 // instead of
 //   GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
 // to avoid an MSVC warning on unreachable code.
-#define EXPECT_NONFATAL_FAILURE(statement, substr) \
-  do {\
-    ::testing::TestPartResultArray gtest_failures;\
-    ::testing::internal::SingleFailureChecker gtest_checker(\
+#define EXPECT_NONFATAL_FAILURE(statement, substr)                    \
+  do {                                                                \
+    ::testing::TestPartResultArray gtest_failures;                    \
+    ::testing::internal::SingleFailureChecker gtest_checker(          \
         &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
-        (substr));\
-    {\
-      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
-          ::testing::ScopedFakeTestPartResultReporter:: \
-          INTERCEPT_ONLY_CURRENT_THREAD, &gtest_failures);\
-      if (::testing::internal::AlwaysTrue()) { statement; }\
-    }\
+        (substr));                                                    \
+    {                                                                 \
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(     \
+          ::testing::ScopedFakeTestPartResultReporter::               \
+              INTERCEPT_ONLY_CURRENT_THREAD,                          \
+          &gtest_failures);                                           \
+      if (::testing::internal::AlwaysTrue()) {                        \
+        statement;                                                    \
+      }                                                               \
+    }                                                                 \
   } while (::testing::internal::AlwaysFalse())
 
-#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
-  do {\
-    ::testing::TestPartResultArray gtest_failures;\
-    ::testing::internal::SingleFailureChecker gtest_checker(\
-        &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
-        (substr));\
-    {\
-      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
+#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr)             \
+  do {                                                                        \
+    ::testing::TestPartResultArray gtest_failures;                            \
+    ::testing::internal::SingleFailureChecker gtest_checker(                  \
+        &gtest_failures, ::testing::TestPartResult::kNonFatalFailure,         \
+        (substr));                                                            \
+    {                                                                         \
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(             \
           ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \
-          &gtest_failures);\
-      if (::testing::internal::AlwaysTrue()) { statement; }\
-    }\
+          &gtest_failures);                                                   \
+      if (::testing::internal::AlwaysTrue()) {                                \
+        statement;                                                            \
+      }                                                                       \
+    }                                                                         \
   } while (::testing::internal::AlwaysFalse())
 
-#endif  // GTEST_INCLUDE_GTEST_GTEST_SPI_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_
diff --git a/third_party/googletest/src/include/gtest/gtest-test-part.h b/third_party/googletest/src/include/gtest/gtest-test-part.h
index 05a7985358..09cc8c34f0 100644
--- a/third_party/googletest/src/include/gtest/gtest-test-part.h
+++ b/third_party/googletest/src/include/gtest/gtest-test-part.h
@@ -26,14 +26,17 @@
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// GOOGLETEST_CM0001 DO NOT DELETE
 
-#ifndef GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
-#define GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
 
 #include <iosfwd>
 #include <vector>
+
 #include "gtest/internal/gtest-internal.h"
 #include "gtest/internal/gtest-string.h"
 
@@ -142,7 +145,8 @@ class GTEST_API_ TestPartResultArray {
  private:
   std::vector<TestPartResult> array_;
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestPartResultArray);
+  TestPartResultArray(const TestPartResultArray&) = delete;
+  TestPartResultArray& operator=(const TestPartResultArray&) = delete;
 };
 
 // This interface knows how to report a test part result.
@@ -168,11 +172,13 @@ class GTEST_API_ HasNewFatalFailureHelper
   ~HasNewFatalFailureHelper() override;
   void ReportTestPartResult(const TestPartResult& result) override;
   bool has_new_fatal_failure() const { return has_new_fatal_failure_; }
+
  private:
   bool has_new_fatal_failure_;
   TestPartResultReporterInterface* original_reporter_;
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(HasNewFatalFailureHelper);
+  HasNewFatalFailureHelper(const HasNewFatalFailureHelper&) = delete;
+  HasNewFatalFailureHelper& operator=(const HasNewFatalFailureHelper&) = delete;
 };
 
 }  // namespace internal
@@ -181,4 +187,4 @@ class GTEST_API_ HasNewFatalFailureHelper
 
 GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
 
-#endif  // GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
diff --git a/third_party/googletest/src/include/gtest/gtest-typed-test.h b/third_party/googletest/src/include/gtest/gtest-typed-test.h
index 095ce05802..bd35a32660 100644
--- a/third_party/googletest/src/include/gtest/gtest-typed-test.h
+++ b/third_party/googletest/src/include/gtest/gtest-typed-test.h
@@ -27,11 +27,12 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
 
-// GOOGLETEST_CM0001 DO NOT DELETE
-
-#ifndef GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
-#define GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
 
 // This header implements typed tests and type-parameterized tests.
 
@@ -170,13 +171,12 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes);
 
 #endif  // 0
 
+#include "gtest/internal/gtest-internal.h"
 #include "gtest/internal/gtest-port.h"
 #include "gtest/internal/gtest-type-util.h"
 
 // Implements typed tests.
 
-#if GTEST_HAS_TYPED_TEST
-
 // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
 //
 // Expands to the name of the typedef for the type parameters of the
@@ -188,24 +188,25 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes);
 #define GTEST_NAME_GENERATOR_(TestSuiteName) \
   gtest_type_params_##TestSuiteName##_NameGenerator
 
-#define TYPED_TEST_SUITE(CaseName, Types, ...)                           \
-  typedef ::testing::internal::TypeList<Types>::type GTEST_TYPE_PARAMS_( \
-      CaseName);                                                         \
-  typedef ::testing::internal::NameGeneratorSelector<__VA_ARGS__>::type  \
-      GTEST_NAME_GENERATOR_(CaseName)
+#define TYPED_TEST_SUITE(CaseName, Types, ...)                          \
+  typedef ::testing::internal::GenerateTypeList<Types>::type            \
+      GTEST_TYPE_PARAMS_(CaseName);                                     \
+  typedef ::testing::internal::NameGeneratorSelector<__VA_ARGS__>::type \
+  GTEST_NAME_GENERATOR_(CaseName)
 
-# define TYPED_TEST(CaseName, TestName)                                       \
+#define TYPED_TEST(CaseName, TestName)                                        \
+  static_assert(sizeof(GTEST_STRINGIFY_(TestName)) > 1,                       \
+                "test-name must not be empty");                               \
   template <typename gtest_TypeParam_>                                        \
   class GTEST_TEST_CLASS_NAME_(CaseName, TestName)                            \
       : public CaseName<gtest_TypeParam_> {                                   \
    private:                                                                   \
     typedef CaseName<gtest_TypeParam_> TestFixture;                           \
     typedef gtest_TypeParam_ TypeParam;                                       \
-    virtual void TestBody();                                                  \
+    void TestBody() override;                                                 \
   };                                                                          \
   static bool gtest_##CaseName##_##TestName##_registered_                     \
-        GTEST_ATTRIBUTE_UNUSED_ =                                             \
-      ::testing::internal::TypeParameterizedTest<                             \
+      GTEST_ATTRIBUTE_UNUSED_ = ::testing::internal::TypeParameterizedTest<   \
           CaseName,                                                           \
           ::testing::internal::TemplateSel<GTEST_TEST_CLASS_NAME_(CaseName,   \
                                                                   TestName)>, \
@@ -213,7 +214,8 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes);
               CaseName)>::Register("",                                        \
                                    ::testing::internal::CodeLocation(         \
                                        __FILE__, __LINE__),                   \
-                                   #CaseName, #TestName, 0,                   \
+                                   GTEST_STRINGIFY_(CaseName),                \
+                                   GTEST_STRINGIFY_(TestName), 0,             \
                                    ::testing::internal::GenerateNames<        \
                                        GTEST_NAME_GENERATOR_(CaseName),       \
                                        GTEST_TYPE_PARAMS_(CaseName)>());      \
@@ -228,12 +230,8 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes);
   TYPED_TEST_SUITE
 #endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
-#endif  // GTEST_HAS_TYPED_TEST
-
 // Implements type-parameterized tests.
 
-#if GTEST_HAS_TYPED_TEST_P
-
 // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
 //
 // Expands to the namespace name that the type-parameterized tests for
@@ -260,7 +258,7 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes);
 // #included in multiple translation units linked together.
 #define TYPED_TEST_SUITE_P(SuiteName)              \
   static ::testing::internal::TypedTestSuitePState \
-      GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName)
+  GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName)
 
 // Legacy API is deprecated but still available
 #ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
@@ -276,24 +274,26 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes);
      private:                                                         \
       typedef SuiteName<gtest_TypeParam_> TestFixture;                \
       typedef gtest_TypeParam_ TypeParam;                             \
-      virtual void TestBody();                                        \
+      void TestBody() override;                                       \
     };                                                                \
     static bool gtest_##TestName##_defined_ GTEST_ATTRIBUTE_UNUSED_ = \
         GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName).AddTestName(       \
-            __FILE__, __LINE__, #SuiteName, #TestName);               \
+            __FILE__, __LINE__, GTEST_STRINGIFY_(SuiteName),          \
+            GTEST_STRINGIFY_(TestName));                              \
   }                                                                   \
   template <typename gtest_TypeParam_>                                \
   void GTEST_SUITE_NAMESPACE_(                                        \
       SuiteName)::TestName<gtest_TypeParam_>::TestBody()
 
-#define REGISTER_TYPED_TEST_SUITE_P(SuiteName, ...)                            \
-  namespace GTEST_SUITE_NAMESPACE_(SuiteName) {                                \
-    typedef ::testing::internal::Templates<__VA_ARGS__>::type gtest_AllTests_; \
-  }                                                                            \
-  static const char* const GTEST_REGISTERED_TEST_NAMES_(                       \
-      SuiteName) GTEST_ATTRIBUTE_UNUSED_ =                                     \
-      GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName).VerifyRegisteredTestNames(    \
-          __FILE__, __LINE__, #__VA_ARGS__)
+// Note: this won't work correctly if the trailing arguments are macros.
+#define REGISTER_TYPED_TEST_SUITE_P(SuiteName, ...)                         \
+  namespace GTEST_SUITE_NAMESPACE_(SuiteName) {                             \
+    typedef ::testing::internal::Templates<__VA_ARGS__> gtest_AllTests_;    \
+  }                                                                         \
+  static const char* const GTEST_REGISTERED_TEST_NAMES_(                    \
+      SuiteName) GTEST_ATTRIBUTE_UNUSED_ =                                  \
+      GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName).VerifyRegisteredTestNames( \
+          GTEST_STRINGIFY_(SuiteName), __FILE__, __LINE__, #__VA_ARGS__)
 
 // Legacy API is deprecated but still available
 #ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
@@ -303,19 +303,22 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes);
   REGISTER_TYPED_TEST_SUITE_P
 #endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
-#define INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, SuiteName, Types, ...)       \
-  static bool gtest_##Prefix##_##SuiteName GTEST_ATTRIBUTE_UNUSED_ =        \
-      ::testing::internal::TypeParameterizedTestSuite<                      \
-          SuiteName, GTEST_SUITE_NAMESPACE_(SuiteName)::gtest_AllTests_,    \
-          ::testing::internal::TypeList<Types>::type>::                     \
-          Register(#Prefix,                                                 \
-                   ::testing::internal::CodeLocation(__FILE__, __LINE__),   \
-                   &GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName), #SuiteName, \
-                   GTEST_REGISTERED_TEST_NAMES_(SuiteName),                 \
-                   ::testing::internal::GenerateNames<                      \
-                       ::testing::internal::NameGeneratorSelector<          \
-                           __VA_ARGS__>::type,                              \
-                       ::testing::internal::TypeList<Types>::type>())
+#define INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, SuiteName, Types, ...)     \
+  static_assert(sizeof(GTEST_STRINGIFY_(Prefix)) > 1,                     \
+                "test-suit-prefix must not be empty");                    \
+  static bool gtest_##Prefix##_##SuiteName GTEST_ATTRIBUTE_UNUSED_ =      \
+      ::testing::internal::TypeParameterizedTestSuite<                    \
+          SuiteName, GTEST_SUITE_NAMESPACE_(SuiteName)::gtest_AllTests_,  \
+          ::testing::internal::GenerateTypeList<Types>::type>::           \
+          Register(GTEST_STRINGIFY_(Prefix),                              \
+                   ::testing::internal::CodeLocation(__FILE__, __LINE__), \
+                   &GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName),           \
+                   GTEST_STRINGIFY_(SuiteName),                           \
+                   GTEST_REGISTERED_TEST_NAMES_(SuiteName),               \
+                   ::testing::internal::GenerateNames<                    \
+                       ::testing::internal::NameGeneratorSelector<        \
+                           __VA_ARGS__>::type,                            \
+                       ::testing::internal::GenerateTypeList<Types>::type>())
 
 // Legacy API is deprecated but still available
 #ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
@@ -325,6 +328,4 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes);
   INSTANTIATE_TYPED_TEST_SUITE_P
 #endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
-#endif  // GTEST_HAS_TYPED_TEST_P
-
-#endif  // GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
diff --git a/third_party/googletest/src/include/gtest/gtest.h b/third_party/googletest/src/include/gtest/gtest.h
index dbe5b1c2c3..d19a587a18 100644
--- a/third_party/googletest/src/include/gtest/gtest.h
+++ b/third_party/googletest/src/include/gtest/gtest.h
@@ -27,7 +27,6 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-//
 // The Google C++ Testing and Mocking Framework (Google Test)
 //
 // This header file defines the public API for Google Test.  It should be
@@ -47,10 +46,8 @@
 // registration from Barthelemy Dagenais' (barthelemy@prologique.com)
 // easyUnit framework.
 
-// GOOGLETEST_CM0001 DO NOT DELETE
-
-#ifndef GTEST_INCLUDE_GTEST_GTEST_H_
-#define GTEST_INCLUDE_GTEST_GTEST_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_H_
 
 #include <cstddef>
 #include <limits>
@@ -59,31 +56,22 @@
 #include <type_traits>
 #include <vector>
 
-#include "gtest/internal/gtest-internal.h"
-#include "gtest/internal/gtest-string.h"
+#include "gtest/gtest-assertion-result.h"
 #include "gtest/gtest-death-test.h"
 #include "gtest/gtest-matchers.h"
 #include "gtest/gtest-message.h"
 #include "gtest/gtest-param-test.h"
 #include "gtest/gtest-printers.h"
-#include "gtest/gtest_prod.h"
 #include "gtest/gtest-test-part.h"
 #include "gtest/gtest-typed-test.h"
+#include "gtest/gtest_pred_impl.h"
+#include "gtest/gtest_prod.h"
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-string.h"
 
 GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
 /* class A needs to have dll-interface to be used by clients of class B */)
 
-namespace testing {
-
-// Silence C4100 (unreferenced formal parameter) and 4805
-// unsafe mix of type 'const int' and type 'const bool'
-#ifdef _MSC_VER
-# pragma warning(push)
-# pragma warning(disable:4805)
-# pragma warning(disable:4100)
-#endif
-
-
 // Declares the flags.
 
 // This flag temporary enables the disabled tests.
@@ -101,6 +89,10 @@ GTEST_DECLARE_bool_(catch_exceptions);
 // to let Google Test decide.
 GTEST_DECLARE_string_(color);
 
+// This flag controls whether the test runner should continue execution past
+// first failure.
+GTEST_DECLARE_bool_(fail_fast);
+
 // This flag sets up the filter to select by name using a glob pattern
 // the tests to run. If the filter is not given all tests are executed.
 GTEST_DECLARE_string_(filter);
@@ -117,6 +109,9 @@ GTEST_DECLARE_bool_(list_tests);
 // in addition to its normal textual output.
 GTEST_DECLARE_string_(output);
 
+// This flags control whether Google Test prints only test failures.
+GTEST_DECLARE_bool_(brief);
+
 // This flags control whether Google Test prints the elapsed time for each
 // test.
 GTEST_DECLARE_bool_(print_time);
@@ -131,6 +126,12 @@ GTEST_DECLARE_int32_(random_seed);
 // is 1. If the value is -1 the tests are repeating forever.
 GTEST_DECLARE_int32_(repeat);
 
+// This flag controls whether Google Test Environments are recreated for each
+// repeat of the tests. The default value is true. If set to false the global
+// test Environment objects are only set up once, for the first iteration, and
+// only torn down once, for the last.
+GTEST_DECLARE_bool_(recreate_environments_when_repeating);
+
 // This flag controls whether Google Test includes Google Test internal
 // stack frames in failure stack traces.
 GTEST_DECLARE_bool_(show_internal_stack_frames);
@@ -156,6 +157,16 @@ GTEST_DECLARE_string_(stream_result_to);
 GTEST_DECLARE_string_(flagfile);
 #endif  // GTEST_USE_OWN_FLAGFILE_FLAG_
 
+namespace testing {
+
+// Silence C4100 (unreferenced formal parameter) and 4805
+// unsafe mix of type 'const int' and type 'const bool'
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4805)
+#pragma warning(disable : 4100)
+#endif
+
 // The upper limit for valid stack trace depths.
 const int kMaxStackTraceDepth = 100;
 
@@ -177,6 +188,7 @@ class FuchsiaDeathTest;
 class UnitTestImpl* GetUnitTestImpl();
 void ReportFailureInUnknownLocation(TestPartResult::Type result_type,
                                     const std::string& message);
+std::set<std::string>* GetIgnoredParameterizedTestSuites();
 
 }  // namespace internal
 
@@ -193,189 +205,6 @@ using TestCase = TestSuite;
 class TestInfo;
 class UnitTest;
 
-// A class for indicating whether an assertion was successful.  When
-// the assertion wasn't successful, the AssertionResult object
-// remembers a non-empty message that describes how it failed.
-//
-// To create an instance of this class, use one of the factory functions
-// (AssertionSuccess() and AssertionFailure()).
-//
-// This class is useful for two purposes:
-//   1. Defining predicate functions to be used with Boolean test assertions
-//      EXPECT_TRUE/EXPECT_FALSE and their ASSERT_ counterparts
-//   2. Defining predicate-format functions to be
-//      used with predicate assertions (ASSERT_PRED_FORMAT*, etc).
-//
-// For example, if you define IsEven predicate:
-//
-//   testing::AssertionResult IsEven(int n) {
-//     if ((n % 2) == 0)
-//       return testing::AssertionSuccess();
-//     else
-//       return testing::AssertionFailure() << n << " is odd";
-//   }
-//
-// Then the failed expectation EXPECT_TRUE(IsEven(Fib(5)))
-// will print the message
-//
-//   Value of: IsEven(Fib(5))
-//     Actual: false (5 is odd)
-//   Expected: true
-//
-// instead of a more opaque
-//
-//   Value of: IsEven(Fib(5))
-//     Actual: false
-//   Expected: true
-//
-// in case IsEven is a simple Boolean predicate.
-//
-// If you expect your predicate to be reused and want to support informative
-// messages in EXPECT_FALSE and ASSERT_FALSE (negative assertions show up
-// about half as often as positive ones in our tests), supply messages for
-// both success and failure cases:
-//
-//   testing::AssertionResult IsEven(int n) {
-//     if ((n % 2) == 0)
-//       return testing::AssertionSuccess() << n << " is even";
-//     else
-//       return testing::AssertionFailure() << n << " is odd";
-//   }
-//
-// Then a statement EXPECT_FALSE(IsEven(Fib(6))) will print
-//
-//   Value of: IsEven(Fib(6))
-//     Actual: true (8 is even)
-//   Expected: false
-//
-// NB: Predicates that support negative Boolean assertions have reduced
-// performance in positive ones so be careful not to use them in tests
-// that have lots (tens of thousands) of positive Boolean assertions.
-//
-// To use this class with EXPECT_PRED_FORMAT assertions such as:
-//
-//   // Verifies that Foo() returns an even number.
-//   EXPECT_PRED_FORMAT1(IsEven, Foo());
-//
-// you need to define:
-//
-//   testing::AssertionResult IsEven(const char* expr, int n) {
-//     if ((n % 2) == 0)
-//       return testing::AssertionSuccess();
-//     else
-//       return testing::AssertionFailure()
-//         << "Expected: " << expr << " is even\n  Actual: it's " << n;
-//   }
-//
-// If Foo() returns 5, you will see the following message:
-//
-//   Expected: Foo() is even
-//     Actual: it's 5
-//
-class GTEST_API_ AssertionResult {
- public:
-  // Copy constructor.
-  // Used in EXPECT_TRUE/FALSE(assertion_result).
-  AssertionResult(const AssertionResult& other);
-
-#if defined(_MSC_VER) && _MSC_VER < 1910
-  GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 /* forcing value to bool */)
-#endif
-
-  // Used in the EXPECT_TRUE/FALSE(bool_expression).
-  //
-  // T must be contextually convertible to bool.
-  //
-  // The second parameter prevents this overload from being considered if
-  // the argument is implicitly convertible to AssertionResult. In that case
-  // we want AssertionResult's copy constructor to be used.
-  template <typename T>
-  explicit AssertionResult(
-      const T& success,
-      typename std::enable_if<
-          !std::is_convertible<T, AssertionResult>::value>::type*
-      /*enabler*/
-      = nullptr)
-      : success_(success) {}
-
-#if defined(_MSC_VER) && _MSC_VER < 1910
-  GTEST_DISABLE_MSC_WARNINGS_POP_()
-#endif
-
-  // Assignment operator.
-  AssertionResult& operator=(AssertionResult other) {
-    swap(other);
-    return *this;
-  }
-
-  // Returns true if and only if the assertion succeeded.
-  operator bool() const { return success_; }  // NOLINT
-
-  // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
-  AssertionResult operator!() const;
-
-  // Returns the text streamed into this AssertionResult. Test assertions
-  // use it when they fail (i.e., the predicate's outcome doesn't match the
-  // assertion's expectation). When nothing has been streamed into the
-  // object, returns an empty string.
-  const char* message() const {
-    return message_.get() != nullptr ? message_->c_str() : "";
-  }
-  // Deprecated; please use message() instead.
-  const char* failure_message() const { return message(); }
-
-  // Streams a custom failure message into this object.
-  template <typename T> AssertionResult& operator<<(const T& value) {
-    AppendMessage(Message() << value);
-    return *this;
-  }
-
-  // Allows streaming basic output manipulators such as endl or flush into
-  // this object.
-  AssertionResult& operator<<(
-      ::std::ostream& (*basic_manipulator)(::std::ostream& stream)) {
-    AppendMessage(Message() << basic_manipulator);
-    return *this;
-  }
-
- private:
-  // Appends the contents of message to message_.
-  void AppendMessage(const Message& a_message) {
-    if (message_.get() == nullptr) message_.reset(new ::std::string);
-    message_->append(a_message.GetString().c_str());
-  }
-
-  // Swap the contents of this AssertionResult with other.
-  void swap(AssertionResult& other);
-
-  // Stores result of the assertion predicate.
-  bool success_;
-  // Stores the message describing the condition in case the expectation
-  // construct is not satisfied with the predicate's outcome.
-  // Referenced via a pointer to avoid taking too much stack frame space
-  // with test assertions.
-  std::unique_ptr< ::std::string> message_;
-};
-
-// Makes a successful assertion result.
-GTEST_API_ AssertionResult AssertionSuccess();
-
-// Makes a failed assertion result.
-GTEST_API_ AssertionResult AssertionFailure();
-
-// Makes a failed assertion result with the given failure message.
-// Deprecated; use AssertionFailure() << msg.
-GTEST_API_ AssertionResult AssertionFailure(const Message& msg);
-
-}  // namespace testing
-
-// Includes the auto-generated header that implements a family of generic
-// predicate assertion macros. This include comes late because it relies on
-// APIs declared above.
-#include "gtest/gtest_pred_impl.h"
-
-namespace testing {
-
 // The abstract class that all tests inherit from.
 //
 // In Google Test, a unit test program contains one or many TestSuites, and
@@ -406,27 +235,24 @@ class GTEST_API_ Test {
   // The d'tor is virtual as we intend to inherit from Test.
   virtual ~Test();
 
-  // Sets up the stuff shared by all tests in this test case.
+  // Sets up the stuff shared by all tests in this test suite.
   //
   // Google Test will call Foo::SetUpTestSuite() before running the first
-  // test in test case Foo.  Hence a sub-class can define its own
+  // test in test suite Foo.  Hence a sub-class can define its own
   // SetUpTestSuite() method to shadow the one defined in the super
   // class.
-  // Failures that happen during SetUpTestSuite are logged but otherwise
-  // ignored.
   static void SetUpTestSuite() {}
 
   // Tears down the stuff shared by all tests in this test suite.
   //
   // Google Test will call Foo::TearDownTestSuite() after running the last
-  // test in test case Foo.  Hence a sub-class can define its own
+  // test in test suite Foo.  Hence a sub-class can define its own
   // TearDownTestSuite() method to shadow the one defined in the super
   // class.
-  // Failures that happen during TearDownTestSuite are logged but otherwise
-  // ignored.
   static void TearDownTestSuite() {}
 
-  // Legacy API is deprecated but still available
+  // Legacy API is deprecated but still available. Use SetUpTestSuite and
+  // TearDownTestSuite instead.
 #ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
   static void TearDownTestCase() {}
   static void SetUpTestCase() {}
@@ -513,7 +339,8 @@ class GTEST_API_ Test {
   virtual Setup_should_be_spelled_SetUp* Setup() { return nullptr; }
 
   // We disallow copying Tests.
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(Test);
+  Test(const Test&) = delete;
+  Test& operator=(const Test&) = delete;
 };
 
 typedef internal::TimeInMillis TimeInMillis;
@@ -527,24 +354,17 @@ class TestProperty {
   // C'tor.  TestProperty does NOT have a default constructor.
   // Always use this constructor (with parameters) to create a
   // TestProperty object.
-  TestProperty(const std::string& a_key, const std::string& a_value) :
-    key_(a_key), value_(a_value) {
-  }
+  TestProperty(const std::string& a_key, const std::string& a_value)
+      : key_(a_key), value_(a_value) {}
 
   // Gets the user supplied key.
-  const char* key() const {
-    return key_.c_str();
-  }
+  const char* key() const { return key_.c_str(); }
 
   // Gets the user supplied value.
-  const char* value() const {
-    return value_.c_str();
-  }
+  const char* value() const { return value_.c_str(); }
 
   // Sets a new value, overriding the one supplied in the constructor.
-  void SetValue(const std::string& new_value) {
-    value_ = new_value;
-  }
+  void SetValue(const std::string& new_value) { value_ = new_value; }
 
  private:
   // The key supplied by the user.
@@ -664,7 +484,7 @@ class GTEST_API_ TestResult {
 
   // Protects mutable state of the property vector and of owned
   // properties, whose values may be updated.
-  internal::Mutex test_properites_mutex_;
+  internal::Mutex test_properties_mutex_;
 
   // The vector of TestPartResults
   std::vector<TestPartResult> test_part_results_;
@@ -678,7 +498,8 @@ class GTEST_API_ TestResult {
   TimeInMillis elapsed_time_;
 
   // We disallow copying TestResult.
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestResult);
+  TestResult(const TestResult&) = delete;
+  TestResult& operator=(const TestResult&) = delete;
 };  // class TestResult
 
 // A TestInfo object stores the following information about a test:
@@ -794,13 +615,16 @@ class GTEST_API_ TestInfo {
   // deletes it.
   void Run();
 
+  // Skip and records the test result for this object.
+  void Skip();
+
   static void ClearTestResult(TestInfo* test_info) {
     test_info->result_.Clear();
   }
 
   // These fields are immutable properties of the test.
-  const std::string test_suite_name_;    // test suite name
-  const std::string name_;               // Test name
+  const std::string test_suite_name_;  // test suite name
+  const std::string name_;             // Test name
   // Name of the parameter type, or NULL if this is not a typed or a
   // type-parameterized test.
   const std::unique_ptr<const ::std::string> type_param_;
@@ -821,7 +645,8 @@ class GTEST_API_ TestInfo {
   // test for the second time.
   TestResult result_;
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestInfo);
+  TestInfo(const TestInfo&) = delete;
+  TestInfo& operator=(const TestInfo&) = delete;
 };
 
 // A test suite, which consists of a vector of TestInfos.
@@ -889,7 +714,9 @@ class GTEST_API_ TestSuite {
   bool Passed() const { return !Failed(); }
 
   // Returns true if and only if the test suite failed.
-  bool Failed() const { return failed_test_count() > 0; }
+  bool Failed() const {
+    return failed_test_count() > 0 || ad_hoc_test_result().Failed();
+  }
 
   // Returns the elapsed time, in milliseconds.
   TimeInMillis elapsed_time() const { return elapsed_time_; }
@@ -927,7 +754,7 @@ class GTEST_API_ TestSuite {
 
   // Adds a TestInfo to this test suite.  Will delete the TestInfo upon
   // destruction of the TestSuite object.
-  void AddTestInfo(TestInfo * test_info);
+  void AddTestInfo(TestInfo* test_info);
 
   // Clears the results of all tests in this test suite.
   void ClearResult();
@@ -940,6 +767,9 @@ class GTEST_API_ TestSuite {
   // Runs every test in this TestSuite.
   void Run();
 
+  // Skips the execution of tests under this TestSuite
+  void Skip();
+
   // Runs SetUpTestSuite() for this TestSuite.  This wrapper is needed
   // for catching exceptions thrown from SetUpTestSuite().
   void RunSetUpTestSuite() {
@@ -1025,7 +855,8 @@ class GTEST_API_ TestSuite {
   TestResult ad_hoc_test_result_;
 
   // We disallow copying TestSuites.
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestSuite);
+  TestSuite(const TestSuite&) = delete;
+  TestSuite& operator=(const TestSuite&) = delete;
 };
 
 // An Environment object is capable of setting up and tearing down an
@@ -1052,6 +883,7 @@ class Environment {
 
   // Override this to define how to tear down the environment.
   virtual void TearDown() {}
+
  private:
   // If you see an error about overriding the following function or
   // about it being private, you have mis-spelled SetUp() as Setup().
@@ -1103,6 +935,9 @@ class TestEventListener {
   // Fired before the test starts.
   virtual void OnTestStart(const TestInfo& test_info) = 0;
 
+  // Fired when a test is disabled
+  virtual void OnTestDisabled(const TestInfo& /*test_info*/) {}
+
   // Fired after a failed assertion or a SUCCEED() invocation.
   // If you want to throw an exception from this function to skip to the next
   // TEST, it must be AssertionException defined above, or inherited from it.
@@ -1126,8 +961,7 @@ class TestEventListener {
   virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test) = 0;
 
   // Fired after each iteration of tests finishes.
-  virtual void OnTestIterationEnd(const UnitTest& unit_test,
-                                  int iteration) = 0;
+  virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration) = 0;
 
   // Fired after all test activities have ended.
   virtual void OnTestProgramEnd(const UnitTest& unit_test) = 0;
@@ -1152,6 +986,7 @@ class EmptyTestEventListener : public TestEventListener {
 #endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
   void OnTestStart(const TestInfo& /*test_info*/) override {}
+  void OnTestDisabled(const TestInfo& /*test_info*/) override {}
   void OnTestPartResult(const TestPartResult& /*test_part_result*/) override {}
   void OnTestEnd(const TestInfo& /*test_info*/) override {}
   void OnTestSuiteEnd(const TestSuite& /*test_suite*/) override {}
@@ -1241,7 +1076,8 @@ class GTEST_API_ TestEventListeners {
   TestEventListener* default_xml_generator_;
 
   // We disallow copying TestEventListeners.
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventListeners);
+  TestEventListeners(const TestEventListeners&) = delete;
+  TestEventListeners& operator=(const TestEventListeners&) = delete;
 };
 
 // A UnitTest consists of a vector of TestSuites.
@@ -1284,8 +1120,7 @@ class GTEST_API_ UnitTest {
 
   // Returns the TestInfo object for the test that's currently running,
   // or NULL if no test is running.
-  const TestInfo* current_test_info() const
-      GTEST_LOCK_EXCLUDED_(mutex_);
+  const TestInfo* current_test_info() const GTEST_LOCK_EXCLUDED_(mutex_);
 
   // Returns the random seed used at the start of the current test run.
   int random_seed() const;
@@ -1391,8 +1226,7 @@ class GTEST_API_ UnitTest {
   // eventually call this to report their results.  The user code
   // should use the assertion macros instead of calling this directly.
   void AddTestPartResult(TestPartResult::Type result_type,
-                         const char* file_name,
-                         int line_number,
+                         const char* file_name, int line_number,
                          const std::string& message,
                          const std::string& os_stack_trace)
       GTEST_LOCK_EXCLUDED_(mutex_);
@@ -1420,10 +1254,10 @@ class GTEST_API_ UnitTest {
   friend class internal::StreamingListenerTest;
   friend class internal::UnitTestRecordPropertyTestHelper;
   friend Environment* AddGlobalTestEnvironment(Environment* env);
+  friend std::set<std::string>* internal::GetIgnoredParameterizedTestSuites();
   friend internal::UnitTestImpl* internal::GetUnitTestImpl();
   friend void internal::ReportFailureInUnknownLocation(
-      TestPartResult::Type result_type,
-      const std::string& message);
+      TestPartResult::Type result_type, const std::string& message);
 
   // Creates an empty UnitTest.
   UnitTest();
@@ -1437,8 +1271,7 @@ class GTEST_API_ UnitTest {
       GTEST_LOCK_EXCLUDED_(mutex_);
 
   // Pops a trace from the per-thread Google Test trace stack.
-  void PopGTestTrace()
-      GTEST_LOCK_EXCLUDED_(mutex_);
+  void PopGTestTrace() GTEST_LOCK_EXCLUDED_(mutex_);
 
   // Protects mutable state in *impl_.  This is mutable as some const
   // methods need to lock it too.
@@ -1451,7 +1284,8 @@ class GTEST_API_ UnitTest {
   internal::UnitTestImpl* impl_;
 
   // We disallow copying UnitTest.
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTest);
+  UnitTest(const UnitTest&) = delete;
+  UnitTest& operator=(const UnitTest&) = delete;
 };
 
 // A convenient wrapper for adding an environment for the test
@@ -1502,13 +1336,11 @@ namespace internal {
 // when calling EXPECT_* in a tight loop.
 template <typename T1, typename T2>
 AssertionResult CmpHelperEQFailure(const char* lhs_expression,
-                                   const char* rhs_expression,
-                                   const T1& lhs, const T2& rhs) {
-  return EqFailure(lhs_expression,
-                   rhs_expression,
+                                   const char* rhs_expression, const T1& lhs,
+                                   const T2& rhs) {
+  return EqFailure(lhs_expression, rhs_expression,
                    FormatForComparisonFailureMessage(lhs, rhs),
-                   FormatForComparisonFailureMessage(rhs, lhs),
-                   false);
+                   FormatForComparisonFailureMessage(rhs, lhs), false);
 }
 
 // This block of code defines operator==/!=
@@ -1521,8 +1353,7 @@ inline bool operator!=(faketype, faketype) { return false; }
 // The helper function for {ASSERT|EXPECT}_EQ.
 template <typename T1, typename T2>
 AssertionResult CmpHelperEQ(const char* lhs_expression,
-                            const char* rhs_expression,
-                            const T1& lhs,
+                            const char* rhs_expression, const T1& lhs,
                             const T2& rhs) {
   if (lhs == rhs) {
     return AssertionSuccess();
@@ -1531,14 +1362,6 @@ AssertionResult CmpHelperEQ(const char* lhs_expression,
   return CmpHelperEQFailure(lhs_expression, rhs_expression, lhs, rhs);
 }
 
-// With this overloaded version, we allow anonymous enums to be used
-// in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous enums
-// can be implicitly cast to BiggestInt.
-GTEST_API_ AssertionResult CmpHelperEQ(const char* lhs_expression,
-                                       const char* rhs_expression,
-                                       BiggestInt lhs,
-                                       BiggestInt rhs);
-
 class EqHelper {
  public:
   // This templatized version is for the general case.
@@ -1561,8 +1384,7 @@ class EqHelper {
   // Even though its body looks the same as the above version, we
   // cannot merge the two, as it will make anonymous enums unhappy.
   static AssertionResult Compare(const char* lhs_expression,
-                                 const char* rhs_expression,
-                                 BiggestInt lhs,
+                                 const char* rhs_expression, BiggestInt lhs,
                                  BiggestInt rhs) {
     return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs);
   }
@@ -1595,38 +1417,31 @@ AssertionResult CmpHelperOpFailure(const char* expr1, const char* expr2,
 // ASSERT_?? and EXPECT_??.  It is here just to avoid copy-and-paste
 // of similar code.
 //
-// For each templatized helper function, we also define an overloaded
-// version for BiggestInt in order to reduce code bloat and allow
-// anonymous enums to be used with {ASSERT|EXPECT}_?? when compiled
-// with gcc 4.
-//
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
 
-#define GTEST_IMPL_CMP_HELPER_(op_name, op)\
-template <typename T1, typename T2>\
-AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \
-                                   const T1& val1, const T2& val2) {\
-  if (val1 op val2) {\
-    return AssertionSuccess();\
-  } else {\
-    return CmpHelperOpFailure(expr1, expr2, val1, val2, #op);\
-  }\
-}\
-GTEST_API_ AssertionResult CmpHelper##op_name(\
-    const char* expr1, const char* expr2, BiggestInt val1, BiggestInt val2)
+#define GTEST_IMPL_CMP_HELPER_(op_name, op)                                \
+  template <typename T1, typename T2>                                      \
+  AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \
+                                     const T1& val1, const T2& val2) {     \
+    if (val1 op val2) {                                                    \
+      return AssertionSuccess();                                           \
+    } else {                                                               \
+      return CmpHelperOpFailure(expr1, expr2, val1, val2, #op);            \
+    }                                                                      \
+  }
 
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
 
 // Implements the helper function for {ASSERT|EXPECT}_NE
-GTEST_IMPL_CMP_HELPER_(NE, !=);
+GTEST_IMPL_CMP_HELPER_(NE, !=)
 // Implements the helper function for {ASSERT|EXPECT}_LE
-GTEST_IMPL_CMP_HELPER_(LE, <=);
+GTEST_IMPL_CMP_HELPER_(LE, <=)
 // Implements the helper function for {ASSERT|EXPECT}_LT
-GTEST_IMPL_CMP_HELPER_(LT, <);
+GTEST_IMPL_CMP_HELPER_(LT, <)
 // Implements the helper function for {ASSERT|EXPECT}_GE
-GTEST_IMPL_CMP_HELPER_(GE, >=);
+GTEST_IMPL_CMP_HELPER_(GE, >=)
 // Implements the helper function for {ASSERT|EXPECT}_GT
-GTEST_IMPL_CMP_HELPER_(GT, >);
+GTEST_IMPL_CMP_HELPER_(GT, >)
 
 #undef GTEST_IMPL_CMP_HELPER_
 
@@ -1635,49 +1450,42 @@ GTEST_IMPL_CMP_HELPER_(GT, >);
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
 GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression,
                                           const char* s2_expression,
-                                          const char* s1,
-                                          const char* s2);
+                                          const char* s1, const char* s2);
 
 // The helper function for {ASSERT|EXPECT}_STRCASEEQ.
 //
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
 GTEST_API_ AssertionResult CmpHelperSTRCASEEQ(const char* s1_expression,
                                               const char* s2_expression,
-                                              const char* s1,
-                                              const char* s2);
+                                              const char* s1, const char* s2);
 
 // The helper function for {ASSERT|EXPECT}_STRNE.
 //
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
 GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression,
                                           const char* s2_expression,
-                                          const char* s1,
-                                          const char* s2);
+                                          const char* s1, const char* s2);
 
 // The helper function for {ASSERT|EXPECT}_STRCASENE.
 //
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
 GTEST_API_ AssertionResult CmpHelperSTRCASENE(const char* s1_expression,
                                               const char* s2_expression,
-                                              const char* s1,
-                                              const char* s2);
-
+                                              const char* s1, const char* s2);
 
 // Helper function for *_STREQ on wide strings.
 //
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
 GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression,
                                           const char* s2_expression,
-                                          const wchar_t* s1,
-                                          const wchar_t* s2);
+                                          const wchar_t* s1, const wchar_t* s2);
 
 // Helper function for *_STRNE on wide strings.
 //
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
 GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression,
                                           const char* s2_expression,
-                                          const wchar_t* s1,
-                                          const wchar_t* s2);
+                                          const wchar_t* s1, const wchar_t* s2);
 
 }  // namespace internal
 
@@ -1689,32 +1497,40 @@ GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression,
 //
 // The {needle,haystack}_expr arguments are the stringified
 // expressions that generated the two real arguments.
-GTEST_API_ AssertionResult IsSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const char* needle, const char* haystack);
-GTEST_API_ AssertionResult IsSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const wchar_t* needle, const wchar_t* haystack);
-GTEST_API_ AssertionResult IsNotSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const char* needle, const char* haystack);
-GTEST_API_ AssertionResult IsNotSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const wchar_t* needle, const wchar_t* haystack);
-GTEST_API_ AssertionResult IsSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const ::std::string& needle, const ::std::string& haystack);
-GTEST_API_ AssertionResult IsNotSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const ::std::string& needle, const ::std::string& haystack);
+GTEST_API_ AssertionResult IsSubstring(const char* needle_expr,
+                                       const char* haystack_expr,
+                                       const char* needle,
+                                       const char* haystack);
+GTEST_API_ AssertionResult IsSubstring(const char* needle_expr,
+                                       const char* haystack_expr,
+                                       const wchar_t* needle,
+                                       const wchar_t* haystack);
+GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr,
+                                          const char* haystack_expr,
+                                          const char* needle,
+                                          const char* haystack);
+GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr,
+                                          const char* haystack_expr,
+                                          const wchar_t* needle,
+                                          const wchar_t* haystack);
+GTEST_API_ AssertionResult IsSubstring(const char* needle_expr,
+                                       const char* haystack_expr,
+                                       const ::std::string& needle,
+                                       const ::std::string& haystack);
+GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr,
+                                          const char* haystack_expr,
+                                          const ::std::string& needle,
+                                          const ::std::string& haystack);
 
 #if GTEST_HAS_STD_WSTRING
-GTEST_API_ AssertionResult IsSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const ::std::wstring& needle, const ::std::wstring& haystack);
-GTEST_API_ AssertionResult IsNotSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const ::std::wstring& needle, const ::std::wstring& haystack);
+GTEST_API_ AssertionResult IsSubstring(const char* needle_expr,
+                                       const char* haystack_expr,
+                                       const ::std::wstring& needle,
+                                       const ::std::wstring& haystack);
+GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr,
+                                          const char* haystack_expr,
+                                          const ::std::wstring& needle,
+                                          const ::std::wstring& haystack);
 #endif  // GTEST_HAS_STD_WSTRING
 
 namespace internal {
@@ -1729,8 +1545,7 @@ namespace internal {
 template <typename RawType>
 AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression,
                                          const char* rhs_expression,
-                                         RawType lhs_value,
-                                         RawType rhs_value) {
+                                         RawType lhs_value, RawType rhs_value) {
   const FloatingPoint<RawType> lhs(lhs_value), rhs(rhs_value);
 
   if (lhs.AlmostEquals(rhs)) {
@@ -1745,10 +1560,8 @@ AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression,
   rhs_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
          << rhs_value;
 
-  return EqFailure(lhs_expression,
-                   rhs_expression,
-                   StringStreamToString(&lhs_ss),
-                   StringStreamToString(&rhs_ss),
+  return EqFailure(lhs_expression, rhs_expression,
+                   StringStreamToString(&lhs_ss), StringStreamToString(&rhs_ss),
                    false);
 }
 
@@ -1758,8 +1571,7 @@ AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression,
 GTEST_API_ AssertionResult DoubleNearPredFormat(const char* expr1,
                                                 const char* expr2,
                                                 const char* abs_error_expr,
-                                                double val1,
-                                                double val2,
+                                                double val1, double val2,
                                                 double abs_error);
 
 // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
@@ -1767,9 +1579,7 @@ GTEST_API_ AssertionResult DoubleNearPredFormat(const char* expr1,
 class GTEST_API_ AssertHelper {
  public:
   // Constructor.
-  AssertHelper(TestPartResult::Type type,
-               const char* file,
-               int line,
+  AssertHelper(TestPartResult::Type type, const char* file, int line,
                const char* message);
   ~AssertHelper();
 
@@ -1783,11 +1593,9 @@ class GTEST_API_ AssertHelper {
   // re-using stack space even for temporary variables, so every EXPECT_EQ
   // reserves stack space for another AssertHelper.
   struct AssertHelperData {
-    AssertHelperData(TestPartResult::Type t,
-                     const char* srcfile,
-                     int line_num,
+    AssertHelperData(TestPartResult::Type t, const char* srcfile, int line_num,
                      const char* msg)
-        : type(t), file(srcfile), line(line_num), message(msg) { }
+        : type(t), file(srcfile), line(line_num), message(msg) {}
 
     TestPartResult::Type const type;
     const char* const file;
@@ -1795,20 +1603,16 @@ class GTEST_API_ AssertHelper {
     std::string const message;
 
    private:
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelperData);
+    AssertHelperData(const AssertHelperData&) = delete;
+    AssertHelperData& operator=(const AssertHelperData&) = delete;
   };
 
   AssertHelperData* const data_;
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelper);
+  AssertHelper(const AssertHelper&) = delete;
+  AssertHelper& operator=(const AssertHelper&) = delete;
 };
 
-enum GTestColor { COLOR_DEFAULT, COLOR_RED, COLOR_GREEN, COLOR_YELLOW };
-
-GTEST_API_ GTEST_ATTRIBUTE_PRINTF_(2, 3) void ColoredPrintf(GTestColor color,
-                                                            const char* fmt,
-                                                            ...);
-
 }  // namespace internal
 
 // The pure interface class that all value-parameterized tests inherit from.
@@ -1863,15 +1667,14 @@ class WithParamInterface {
  private:
   // Sets parameter value. The caller is responsible for making sure the value
   // remains alive and unchanged throughout the current test.
-  static void SetParam(const ParamType* parameter) {
-    parameter_ = parameter;
-  }
+  static void SetParam(const ParamType* parameter) { parameter_ = parameter; }
 
   // Static value used for accessing parameter during a test lifetime.
   static const ParamType* parameter_;
 
   // TestClass must be a subclass of WithParamInterface<T> and Test.
-  template <class TestClass> friend class internal::ParameterizedTestFactory;
+  template <class TestClass>
+  friend class internal::ParameterizedTestFactory;
 };
 
 template <typename T>
@@ -1881,15 +1684,14 @@ const T* WithParamInterface<T>::parameter_ = nullptr;
 // WithParamInterface, and can just inherit from ::testing::TestWithParam.
 
 template <typename T>
-class TestWithParam : public Test, public WithParamInterface<T> {
-};
+class TestWithParam : public Test, public WithParamInterface<T> {};
 
 // Macros for indicating success/failure in test code.
 
 // Skips test in runtime.
 // Skipping test aborts current function.
 // Skipped tests are neither successful nor failed.
-#define GTEST_SKIP() GTEST_SKIP_("Skipped")
+#define GTEST_SKIP() GTEST_SKIP_("")
 
 // ADD_FAILURE unconditionally adds a failure to the current test.
 // SUCCEED generates a success - it doesn't automatically make the
@@ -1913,7 +1715,7 @@ class TestWithParam : public Test, public WithParamInterface<T> {
 
 // Generates a nonfatal failure at the given source file location with
 // a generic message.
-#define ADD_FAILURE_AT(file, line) \
+#define ADD_FAILURE_AT(file, line)        \
   GTEST_MESSAGE_AT_(file, line, "Failed", \
                     ::testing::TestPartResult::kNonFatalFailure)
 
@@ -1928,7 +1730,7 @@ class TestWithParam : public Test, public WithParamInterface<T> {
 // Define this macro to 1 to omit the definition of FAIL(), which is a
 // generic name and clashes with some other libraries.
 #if !GTEST_DONT_DEFINE_FAIL
-# define FAIL() GTEST_FAIL()
+#define FAIL() GTEST_FAIL()
 #endif
 
 // Generates a success with a generic message.
@@ -1937,7 +1739,7 @@ class TestWithParam : public Test, public WithParamInterface<T> {
 // Define this macro to 1 to omit the definition of SUCCEED(), which
 // is a generic name and clashes with some other libraries.
 #if !GTEST_DONT_DEFINE_SUCCEED
-# define SUCCEED() GTEST_SUCCEED()
+#define SUCCEED() GTEST_SUCCEED()
 #endif
 
 // Macros for testing exceptions.
@@ -1965,19 +1767,37 @@ class TestWithParam : public Test, public WithParamInterface<T> {
 // Boolean assertions. Condition can be either a Boolean expression or an
 // AssertionResult. For more information on how to use AssertionResult with
 // these macros see comments on that class.
-#define EXPECT_TRUE(condition) \
+#define GTEST_EXPECT_TRUE(condition)                      \
   GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \
                       GTEST_NONFATAL_FAILURE_)
-#define EXPECT_FALSE(condition) \
+#define GTEST_EXPECT_FALSE(condition)                        \
   GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
                       GTEST_NONFATAL_FAILURE_)
-#define ASSERT_TRUE(condition) \
-  GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \
-                      GTEST_FATAL_FAILURE_)
-#define ASSERT_FALSE(condition) \
+#define GTEST_ASSERT_TRUE(condition) \
+  GTEST_TEST_BOOLEAN_(condition, #condition, false, true, GTEST_FATAL_FAILURE_)
+#define GTEST_ASSERT_FALSE(condition)                        \
   GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
                       GTEST_FATAL_FAILURE_)
 
+// Define these macros to 1 to omit the definition of the corresponding
+// EXPECT or ASSERT, which clashes with some users' own code.
+
+#if !GTEST_DONT_DEFINE_EXPECT_TRUE
+#define EXPECT_TRUE(condition) GTEST_EXPECT_TRUE(condition)
+#endif
+
+#if !GTEST_DONT_DEFINE_EXPECT_FALSE
+#define EXPECT_FALSE(condition) GTEST_EXPECT_FALSE(condition)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_TRUE
+#define ASSERT_TRUE(condition) GTEST_ASSERT_TRUE(condition)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_FALSE
+#define ASSERT_FALSE(condition) GTEST_ASSERT_FALSE(condition)
+#endif
+
 // Macros for testing equalities and inequalities.
 //
 //    * {ASSERT|EXPECT}_EQ(v1, v2): Tests that v1 == v2
@@ -2054,27 +1874,27 @@ class TestWithParam : public Test, public WithParamInterface<T> {
 // ASSERT_XY(), which clashes with some users' own code.
 
 #if !GTEST_DONT_DEFINE_ASSERT_EQ
-# define ASSERT_EQ(val1, val2) GTEST_ASSERT_EQ(val1, val2)
+#define ASSERT_EQ(val1, val2) GTEST_ASSERT_EQ(val1, val2)
 #endif
 
 #if !GTEST_DONT_DEFINE_ASSERT_NE
-# define ASSERT_NE(val1, val2) GTEST_ASSERT_NE(val1, val2)
+#define ASSERT_NE(val1, val2) GTEST_ASSERT_NE(val1, val2)
 #endif
 
 #if !GTEST_DONT_DEFINE_ASSERT_LE
-# define ASSERT_LE(val1, val2) GTEST_ASSERT_LE(val1, val2)
+#define ASSERT_LE(val1, val2) GTEST_ASSERT_LE(val1, val2)
 #endif
 
 #if !GTEST_DONT_DEFINE_ASSERT_LT
-# define ASSERT_LT(val1, val2) GTEST_ASSERT_LT(val1, val2)
+#define ASSERT_LT(val1, val2) GTEST_ASSERT_LT(val1, val2)
 #endif
 
 #if !GTEST_DONT_DEFINE_ASSERT_GE
-# define ASSERT_GE(val1, val2) GTEST_ASSERT_GE(val1, val2)
+#define ASSERT_GE(val1, val2) GTEST_ASSERT_GE(val1, val2)
 #endif
 
 #if !GTEST_DONT_DEFINE_ASSERT_GT
-# define ASSERT_GT(val1, val2) GTEST_ASSERT_GT(val1, val2)
+#define ASSERT_GT(val1, val2) GTEST_ASSERT_GT(val1, val2)
 #endif
 
 // C-string Comparisons.  All tests treat NULL and any non-NULL string
@@ -2099,7 +1919,7 @@ class TestWithParam : public Test, public WithParamInterface<T> {
   EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2)
 #define EXPECT_STRCASEEQ(s1, s2) \
   EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2)
-#define EXPECT_STRCASENE(s1, s2)\
+#define EXPECT_STRCASENE(s1, s2) \
   EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2)
 
 #define ASSERT_STREQ(s1, s2) \
@@ -2108,7 +1928,7 @@ class TestWithParam : public Test, public WithParamInterface<T> {
   ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2)
 #define ASSERT_STRCASEEQ(s1, s2) \
   ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2)
-#define ASSERT_STRCASENE(s1, s2)\
+#define ASSERT_STRCASENE(s1, s2) \
   ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2)
 
 // Macros for comparing floating-point numbers.
@@ -2125,29 +1945,29 @@ class TestWithParam : public Test, public WithParamInterface<T> {
 // FloatingPoint template class in gtest-internal.h if you are
 // interested in the implementation details.
 
-#define EXPECT_FLOAT_EQ(val1, val2)\
+#define EXPECT_FLOAT_EQ(val1, val2)                                         \
   EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \
                       val1, val2)
 
-#define EXPECT_DOUBLE_EQ(val1, val2)\
+#define EXPECT_DOUBLE_EQ(val1, val2)                                         \
   EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \
                       val1, val2)
 
-#define ASSERT_FLOAT_EQ(val1, val2)\
+#define ASSERT_FLOAT_EQ(val1, val2)                                         \
   ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \
                       val1, val2)
 
-#define ASSERT_DOUBLE_EQ(val1, val2)\
+#define ASSERT_DOUBLE_EQ(val1, val2)                                         \
   ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \
                       val1, val2)
 
-#define EXPECT_NEAR(val1, val2, abs_error)\
-  EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \
-                      val1, val2, abs_error)
+#define EXPECT_NEAR(val1, val2, abs_error)                                   \
+  EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, val1, val2, \
+                      abs_error)
 
-#define ASSERT_NEAR(val1, val2, abs_error)\
-  ASSERT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \
-                      val1, val2, abs_error)
+#define ASSERT_NEAR(val1, val2, abs_error)                                   \
+  ASSERT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, val1, val2, \
+                      abs_error)
 
 // These predicate format functions work on floating-point values, and
 // can be used in {ASSERT|EXPECT}_PRED_FORMAT2*(), e.g.
@@ -2161,7 +1981,6 @@ GTEST_API_ AssertionResult FloatLE(const char* expr1, const char* expr2,
 GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2,
                                     double val1, double val2);
 
-
 #if GTEST_OS_WINDOWS
 
 // Macros that test for HRESULT failure and success, these are only useful
@@ -2173,17 +1992,17 @@ GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2,
 // expected result and the actual result with both a human-readable
 // string representation of the error, if available, as well as the
 // hex result code.
-# define EXPECT_HRESULT_SUCCEEDED(expr) \
-    EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
+#define EXPECT_HRESULT_SUCCEEDED(expr) \
+  EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
 
-# define ASSERT_HRESULT_SUCCEEDED(expr) \
-    ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
+#define ASSERT_HRESULT_SUCCEEDED(expr) \
+  ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
 
-# define EXPECT_HRESULT_FAILED(expr) \
-    EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
+#define EXPECT_HRESULT_FAILED(expr) \
+  EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
 
-# define ASSERT_HRESULT_FAILED(expr) \
-    ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
+#define ASSERT_HRESULT_FAILED(expr) \
+  ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
 
 #endif  // GTEST_OS_WINDOWS
 
@@ -2198,9 +2017,9 @@ GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2,
 //   ASSERT_NO_FATAL_FAILURE(Process()) << "Process() failed";
 //
 #define ASSERT_NO_FATAL_FAILURE(statement) \
-    GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_FATAL_FAILURE_)
+  GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_FATAL_FAILURE_)
 #define EXPECT_NO_FATAL_FAILURE(statement) \
-    GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_)
+  GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_)
 
 // Causes a trace (including the given source file path and line number,
 // and the given message) to be included in every test failure message generated
@@ -2242,7 +2061,8 @@ class GTEST_API_ ScopedTrace {
  private:
   void PushTrace(const char* file, int line, std::string message);
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedTrace);
+  ScopedTrace(const ScopedTrace&) = delete;
+  ScopedTrace& operator=(const ScopedTrace&) = delete;
 } GTEST_ATTRIBUTE_UNUSED_;  // A ScopedTrace object does its job in its
                             // c'tor and d'tor.  Therefore it doesn't
                             // need to be used otherwise.
@@ -2262,9 +2082,9 @@ class GTEST_API_ ScopedTrace {
 // Assuming that each thread maintains its own stack of traces.
 // Therefore, a SCOPED_TRACE() would (correctly) only affect the
 // assertions in its own thread.
-#define SCOPED_TRACE(message) \
-  ::testing::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)(\
-    __FILE__, __LINE__, (message))
+#define SCOPED_TRACE(message)                                         \
+  ::testing::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)( \
+      __FILE__, __LINE__, (message))
 
 // Compile-time assertion for type equality.
 // StaticAssertTypeEq<type1, type2>() compiles if and only if type1 and type2
@@ -2298,8 +2118,7 @@ class GTEST_API_ ScopedTrace {
 // to cause a compiler error.
 template <typename T1, typename T2>
 constexpr bool StaticAssertTypeEq() noexcept {
-  static_assert(std::is_same<T1, T2>::value,
-                "type1 and type2 are not the same type");
+  static_assert(std::is_same<T1, T2>::value, "T1 and T2 are not the same type");
   return true;
 }
 
@@ -2363,18 +2182,19 @@ constexpr bool StaticAssertTypeEq() noexcept {
 //     EXPECT_EQ(a_.size(), 0);
 //     EXPECT_EQ(b_.size(), 1);
 //   }
-//
-// GOOGLETEST_CM0011 DO NOT DELETE
-#define TEST_F(test_fixture, test_name)\
+#define GTEST_TEST_F(test_fixture, test_name)        \
   GTEST_TEST_(test_fixture, test_name, test_fixture, \
               ::testing::internal::GetTypeId<test_fixture>())
+#if !GTEST_DONT_DEFINE_TEST_F
+#define TEST_F(test_fixture, test_name) GTEST_TEST_F(test_fixture, test_name)
+#endif
 
 // Returns a path to temporary directory.
 // Tries to determine an appropriate directory for the platform.
 GTEST_API_ std::string TempDir();
 
 #ifdef _MSC_VER
-#  pragma warning(pop)
+#pragma warning(pop)
 #endif
 
 // Dynamically registers a test with the framework.
@@ -2428,6 +2248,7 @@ GTEST_API_ std::string TempDir();
 // }
 // ...
 // int main(int argc, char** argv) {
+//   ::testing::InitGoogleTest(&argc, argv);
 //   std::vector<int> values_to_test = LoadValuesFromConfig();
 //   RegisterMyTests(values_to_test);
 //   ...
@@ -2469,10 +2290,8 @@ TestInfo* RegisterTest(const char* test_suite_name, const char* test_name,
 // namespace and has an all-caps name.
 int RUN_ALL_TESTS() GTEST_MUST_USE_RESULT_;
 
-inline int RUN_ALL_TESTS() {
-  return ::testing::UnitTest::GetInstance()->Run();
-}
+inline int RUN_ALL_TESTS() { return ::testing::UnitTest::GetInstance()->Run(); }
 
 GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
 
-#endif  // GTEST_INCLUDE_GTEST_GTEST_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_H_
diff --git a/third_party/googletest/src/include/gtest/gtest_pred_impl.h b/third_party/googletest/src/include/gtest/gtest_pred_impl.h
index d514255c73..47a24aa687 100644
--- a/third_party/googletest/src/include/gtest/gtest_pred_impl.h
+++ b/third_party/googletest/src/include/gtest/gtest_pred_impl.h
@@ -26,17 +26,19 @@
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// This file is AUTOMATICALLY GENERATED on 01/02/2019 by command
-// 'gen_gtest_pred_impl.py 5'.  DO NOT EDIT BY HAND!
 //
 // Implements a family of generic predicate assertion macros.
-// GOOGLETEST_CM0001 DO NOT DELETE
 
-#ifndef GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
-#define GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
 
-#include "gtest/gtest.h"
+#include "gtest/gtest-assertion-result.h"
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-port.h"
 
 namespace testing {
 
@@ -72,22 +74,18 @@ namespace testing {
 // GTEST_ASSERT_ is the basic statement to which all of the assertions
 // in this file reduce.  Don't use this in your code.
 
-#define GTEST_ASSERT_(expression, on_failure) \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+#define GTEST_ASSERT_(expression, on_failure)                   \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                 \
   if (const ::testing::AssertionResult gtest_ar = (expression)) \
-    ; \
-  else \
+    ;                                                           \
+  else                                                          \
     on_failure(gtest_ar.failure_message())
 
-
 // Helper function for implementing {EXPECT|ASSERT}_PRED1.  Don't use
 // this in your code.
-template <typename Pred,
-          typename T1>
-AssertionResult AssertPred1Helper(const char* pred_text,
-                                  const char* e1,
-                                  Pred pred,
-                                  const T1& v1) {
+template <typename Pred, typename T1>
+AssertionResult AssertPred1Helper(const char* pred_text, const char* e1,
+                                  Pred pred, const T1& v1) {
   if (pred(v1)) return AssertionSuccess();
 
   return AssertionFailure()
@@ -98,40 +96,27 @@ AssertionResult AssertPred1Helper(const char* pred_text,
 
 // Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT1.
 // Don't use this in your code.
-#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure)\
-  GTEST_ASSERT_(pred_format(#v1, v1), \
-                on_failure)
+#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure) \
+  GTEST_ASSERT_(pred_format(#v1, v1), on_failure)
 
 // Internal macro for implementing {EXPECT|ASSERT}_PRED1.  Don't use
 // this in your code.
-#define GTEST_PRED1_(pred, v1, on_failure)\
-  GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, \
-                                             #v1, \
-                                             pred, \
-                                             v1), on_failure)
+#define GTEST_PRED1_(pred, v1, on_failure) \
+  GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, #v1, pred, v1), on_failure)
 
 // Unary predicate assertion macros.
 #define EXPECT_PRED_FORMAT1(pred_format, v1) \
   GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_NONFATAL_FAILURE_)
-#define EXPECT_PRED1(pred, v1) \
-  GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED1(pred, v1) GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_)
 #define ASSERT_PRED_FORMAT1(pred_format, v1) \
   GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_FATAL_FAILURE_)
-#define ASSERT_PRED1(pred, v1) \
-  GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_)
-
-
+#define ASSERT_PRED1(pred, v1) GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_)
 
 // Helper function for implementing {EXPECT|ASSERT}_PRED2.  Don't use
 // this in your code.
-template <typename Pred,
-          typename T1,
-          typename T2>
-AssertionResult AssertPred2Helper(const char* pred_text,
-                                  const char* e1,
-                                  const char* e2,
-                                  Pred pred,
-                                  const T1& v1,
+template <typename Pred, typename T1, typename T2>
+AssertionResult AssertPred2Helper(const char* pred_text, const char* e1,
+                                  const char* e2, Pred pred, const T1& v1,
                                   const T2& v2) {
   if (pred(v1, v2)) return AssertionSuccess();
 
@@ -145,19 +130,14 @@ AssertionResult AssertPred2Helper(const char* pred_text,
 
 // Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT2.
 // Don't use this in your code.
-#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure)\
-  GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), \
-                on_failure)
+#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure) \
+  GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), on_failure)
 
 // Internal macro for implementing {EXPECT|ASSERT}_PRED2.  Don't use
 // this in your code.
-#define GTEST_PRED2_(pred, v1, v2, on_failure)\
-  GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, \
-                                             #v1, \
-                                             #v2, \
-                                             pred, \
-                                             v1, \
-                                             v2), on_failure)
+#define GTEST_PRED2_(pred, v1, v2, on_failure)                               \
+  GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, #v1, #v2, pred, v1, v2), \
+                on_failure)
 
 // Binary predicate assertion macros.
 #define EXPECT_PRED_FORMAT2(pred_format, v1, v2) \
@@ -169,22 +149,12 @@ AssertionResult AssertPred2Helper(const char* pred_text,
 #define ASSERT_PRED2(pred, v1, v2) \
   GTEST_PRED2_(pred, v1, v2, GTEST_FATAL_FAILURE_)
 
-
-
 // Helper function for implementing {EXPECT|ASSERT}_PRED3.  Don't use
 // this in your code.
-template <typename Pred,
-          typename T1,
-          typename T2,
-          typename T3>
-AssertionResult AssertPred3Helper(const char* pred_text,
-                                  const char* e1,
-                                  const char* e2,
-                                  const char* e3,
-                                  Pred pred,
-                                  const T1& v1,
-                                  const T2& v2,
-                                  const T3& v3) {
+template <typename Pred, typename T1, typename T2, typename T3>
+AssertionResult AssertPred3Helper(const char* pred_text, const char* e1,
+                                  const char* e2, const char* e3, Pred pred,
+                                  const T1& v1, const T2& v2, const T3& v3) {
   if (pred(v1, v2, v3)) return AssertionSuccess();
 
   return AssertionFailure()
@@ -198,21 +168,15 @@ AssertionResult AssertPred3Helper(const char* pred_text,
 
 // Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT3.
 // Don't use this in your code.
-#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure)\
-  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), \
-                on_failure)
+#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure) \
+  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), on_failure)
 
 // Internal macro for implementing {EXPECT|ASSERT}_PRED3.  Don't use
 // this in your code.
-#define GTEST_PRED3_(pred, v1, v2, v3, on_failure)\
-  GTEST_ASSERT_(::testing::AssertPred3Helper(#pred, \
-                                             #v1, \
-                                             #v2, \
-                                             #v3, \
-                                             pred, \
-                                             v1, \
-                                             v2, \
-                                             v3), on_failure)
+#define GTEST_PRED3_(pred, v1, v2, v3, on_failure)                          \
+  GTEST_ASSERT_(                                                            \
+      ::testing::AssertPred3Helper(#pred, #v1, #v2, #v3, pred, v1, v2, v3), \
+      on_failure)
 
 // Ternary predicate assertion macros.
 #define EXPECT_PRED_FORMAT3(pred_format, v1, v2, v3) \
@@ -224,25 +188,13 @@ AssertionResult AssertPred3Helper(const char* pred_text,
 #define ASSERT_PRED3(pred, v1, v2, v3) \
   GTEST_PRED3_(pred, v1, v2, v3, GTEST_FATAL_FAILURE_)
 
-
-
 // Helper function for implementing {EXPECT|ASSERT}_PRED4.  Don't use
 // this in your code.
-template <typename Pred,
-          typename T1,
-          typename T2,
-          typename T3,
-          typename T4>
-AssertionResult AssertPred4Helper(const char* pred_text,
-                                  const char* e1,
-                                  const char* e2,
-                                  const char* e3,
-                                  const char* e4,
-                                  Pred pred,
-                                  const T1& v1,
-                                  const T2& v2,
-                                  const T3& v3,
-                                  const T4& v4) {
+template <typename Pred, typename T1, typename T2, typename T3, typename T4>
+AssertionResult AssertPred4Helper(const char* pred_text, const char* e1,
+                                  const char* e2, const char* e3,
+                                  const char* e4, Pred pred, const T1& v1,
+                                  const T2& v2, const T3& v3, const T4& v4) {
   if (pred(v1, v2, v3, v4)) return AssertionSuccess();
 
   return AssertionFailure()
@@ -257,23 +209,15 @@ AssertionResult AssertPred4Helper(const char* pred_text,
 
 // Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT4.
 // Don't use this in your code.
-#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure)\
-  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), \
-                on_failure)
+#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure) \
+  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), on_failure)
 
 // Internal macro for implementing {EXPECT|ASSERT}_PRED4.  Don't use
 // this in your code.
-#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure)\
-  GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, \
-                                             #v1, \
-                                             #v2, \
-                                             #v3, \
-                                             #v4, \
-                                             pred, \
-                                             v1, \
-                                             v2, \
-                                             v3, \
-                                             v4), on_failure)
+#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure)                        \
+  GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, #v1, #v2, #v3, #v4, pred, \
+                                             v1, v2, v3, v4),                 \
+                on_failure)
 
 // 4-ary predicate assertion macros.
 #define EXPECT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \
@@ -285,28 +229,15 @@ AssertionResult AssertPred4Helper(const char* pred_text,
 #define ASSERT_PRED4(pred, v1, v2, v3, v4) \
   GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_FATAL_FAILURE_)
 
-
-
 // Helper function for implementing {EXPECT|ASSERT}_PRED5.  Don't use
 // this in your code.
-template <typename Pred,
-          typename T1,
-          typename T2,
-          typename T3,
-          typename T4,
+template <typename Pred, typename T1, typename T2, typename T3, typename T4,
           typename T5>
-AssertionResult AssertPred5Helper(const char* pred_text,
-                                  const char* e1,
-                                  const char* e2,
-                                  const char* e3,
-                                  const char* e4,
-                                  const char* e5,
-                                  Pred pred,
-                                  const T1& v1,
-                                  const T2& v2,
-                                  const T3& v3,
-                                  const T4& v4,
-                                  const T5& v5) {
+AssertionResult AssertPred5Helper(const char* pred_text, const char* e1,
+                                  const char* e2, const char* e3,
+                                  const char* e4, const char* e5, Pred pred,
+                                  const T1& v1, const T2& v2, const T3& v3,
+                                  const T4& v4, const T5& v5) {
   if (pred(v1, v2, v3, v4, v5)) return AssertionSuccess();
 
   return AssertionFailure()
@@ -322,25 +253,16 @@ AssertionResult AssertPred5Helper(const char* pred_text,
 
 // Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT5.
 // Don't use this in your code.
-#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure)\
+#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure)  \
   GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, #v5, v1, v2, v3, v4, v5), \
                 on_failure)
 
 // Internal macro for implementing {EXPECT|ASSERT}_PRED5.  Don't use
 // this in your code.
-#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure)\
-  GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, \
-                                             #v1, \
-                                             #v2, \
-                                             #v3, \
-                                             #v4, \
-                                             #v5, \
-                                             pred, \
-                                             v1, \
-                                             v2, \
-                                             v3, \
-                                             v4, \
-                                             v5), on_failure)
+#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure)                   \
+  GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, #v1, #v2, #v3, #v4, #v5, \
+                                             pred, v1, v2, v3, v4, v5),      \
+                on_failure)
 
 // 5-ary predicate assertion macros.
 #define EXPECT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \
@@ -352,8 +274,6 @@ AssertionResult AssertPred5Helper(const char* pred_text,
 #define ASSERT_PRED5(pred, v1, v2, v3, v4, v5) \
   GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_)
 
-
-
 }  // namespace testing
 
-#endif  // GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
diff --git a/third_party/googletest/src/include/gtest/gtest_prod.h b/third_party/googletest/src/include/gtest/gtest_prod.h
index e651671ebd..1f37dc31c3 100644
--- a/third_party/googletest/src/include/gtest/gtest_prod.h
+++ b/third_party/googletest/src/include/gtest/gtest_prod.h
@@ -27,12 +27,11 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-//
-// Google C++ Testing and Mocking Framework definitions useful in production code.
-// GOOGLETEST_CM0003 DO NOT DELETE
+// Google C++ Testing and Mocking Framework definitions useful in production
+// code.
 
-#ifndef GTEST_INCLUDE_GTEST_GTEST_PROD_H_
-#define GTEST_INCLUDE_GTEST_GTEST_PROD_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_
 
 // When you need to test the private or protected members of a class,
 // use the FRIEND_TEST macro to declare your tests as friends of the
@@ -55,7 +54,7 @@
 // Note: The test class must be in the same namespace as the class being tested.
 // For example, putting MyClassTest in an anonymous namespace will not work.
 
-#define FRIEND_TEST(test_case_name, test_name)\
-friend class test_case_name##_##test_name##_Test
+#define FRIEND_TEST(test_case_name, test_name) \
+  friend class test_case_name##_##test_name##_Test
 
-#endif  // GTEST_INCLUDE_GTEST_GTEST_PROD_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_
diff --git a/third_party/googletest/src/include/gtest/internal/custom/README.md b/third_party/googletest/src/include/gtest/internal/custom/README.md
index ff391fb4e2..cb49e2c754 100644
--- a/third_party/googletest/src/include/gtest/internal/custom/README.md
+++ b/third_party/googletest/src/include/gtest/internal/custom/README.md
@@ -15,18 +15,6 @@ The custom directory is an injection point for custom user configurations.
 
 The following macros can be defined:
 
-### Flag related macros:
-
-*   `GTEST_FLAG(flag_name)`
-*   `GTEST_USE_OWN_FLAGFILE_FLAG_` - Define to 0 when the system provides its
-    own flagfile flag parsing.
-*   `GTEST_DECLARE_bool_(name)`
-*   `GTEST_DECLARE_int32_(name)`
-*   `GTEST_DECLARE_string_(name)`
-*   `GTEST_DEFINE_bool_(name, default_val, doc)`
-*   `GTEST_DEFINE_int32_(name, default_val, doc)`
-*   `GTEST_DEFINE_string_(name, default_val, doc)`
-
 ### Logging:
 
 *   `GTEST_LOG_(severity)`
diff --git a/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h b/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h
index cd85d956d2..9b7fb4261a 100644
--- a/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h
+++ b/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h
@@ -31,7 +31,38 @@
 //
 // ** Custom implementation starts here **
 
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
 
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
+// Use a stub Notification class.
+//
+// The built-in Notification class in GoogleTest v1.12.1 uses std::mutex and
+// std::condition_variable. The <mutex> and <condition_variable> headers of
+// mingw32 g++ (GNU 10.0.0) define std::mutex and std::condition_variable only
+// when configured with the posix threads option but don't define them when
+// configured with the win32 threads option. The Notification class is only
+// used in GoogleTest's internal tests. Since we don't build GoogleTest's
+// internal tests, we don't need a working Notification class. Although it's
+// not hard to fix the mingw32 g++ compilation errors by implementing the
+// Notification class using Windows CRITICAL_SECTION and CONDITION_VARIABLE,
+// it's simpler to just use a stub Notification class on all platforms.
+//
+// The default constructor of the stub class is deleted and the declaration of
+// the Notify() method is commented out, so that compilation will fail if any
+// code actually uses the Notification class.
+
+#define GTEST_HAS_NOTIFICATION_ 1
+namespace testing {
+namespace internal {
+class Notification {
+ public:
+  Notification() = delete;
+  Notification(const Notification&) = delete;
+  Notification& operator=(const Notification&) = delete;
+  // void Notify();
+  void WaitForNotification() {}
+};
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
diff --git a/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h b/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h
index eb4467abca..b9495d8378 100644
--- a/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h
+++ b/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h
@@ -36,7 +36,7 @@
 //
 // ** Custom implementation starts here **
 
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_
 
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_
diff --git a/third_party/googletest/src/include/gtest/internal/custom/gtest.h b/third_party/googletest/src/include/gtest/internal/custom/gtest.h
index 4c8e07be23..afaaf17ba2 100644
--- a/third_party/googletest/src/include/gtest/internal/custom/gtest.h
+++ b/third_party/googletest/src/include/gtest/internal/custom/gtest.h
@@ -31,7 +31,7 @@
 //
 // ** Custom implementation starts here **
 
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
 
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
diff --git a/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h b/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h
index 68bd353061..45580ae805 100644
--- a/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h
+++ b/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h
@@ -26,27 +26,31 @@
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+
 // The Google C++ Testing and Mocking Framework (Google Test)
 //
 // This header file defines internal utilities needed for implementing
 // death tests.  They are subject to change without notice.
-// GOOGLETEST_CM0001 DO NOT DELETE
 
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
 
-#include "gtest/gtest-matchers.h"
-#include "gtest/internal/gtest-internal.h"
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
 
 #include <stdio.h>
+
 #include <memory>
 
-namespace testing {
-namespace internal {
+#include "gtest/gtest-matchers.h"
+#include "gtest/internal/gtest-internal.h"
 
 GTEST_DECLARE_string_(internal_run_death_test);
 
+namespace testing {
+namespace internal {
+
 // Names of the flags (needed for parsing Google Test flags).
 const char kDeathTestStyleFlag[] = "death_test_style";
 const char kDeathTestUseFork[] = "death_test_use_fork";
@@ -83,16 +87,18 @@ class GTEST_API_ DeathTest {
   static bool Create(const char* statement, Matcher<const std::string&> matcher,
                      const char* file, int line, DeathTest** test);
   DeathTest();
-  virtual ~DeathTest() { }
+  virtual ~DeathTest() {}
 
   // A helper class that aborts a death test when it's deleted.
   class ReturnSentinel {
    public:
-    explicit ReturnSentinel(DeathTest* test) : test_(test) { }
+    explicit ReturnSentinel(DeathTest* test) : test_(test) {}
     ~ReturnSentinel() { test_->Abort(TEST_ENCOUNTERED_RETURN_STATEMENT); }
+
    private:
     DeathTest* const test_;
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(ReturnSentinel);
+    ReturnSentinel(const ReturnSentinel&) = delete;
+    ReturnSentinel& operator=(const ReturnSentinel&) = delete;
   } GTEST_ATTRIBUTE_UNUSED_;
 
   // An enumeration of possible roles that may be taken when a death
@@ -137,7 +143,8 @@ class GTEST_API_ DeathTest {
   // A string containing a description of the outcome of the last death test.
   static std::string last_death_test_message_;
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(DeathTest);
+  DeathTest(const DeathTest&) = delete;
+  DeathTest& operator=(const DeathTest&) = delete;
 };
 
 GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
@@ -145,7 +152,7 @@ GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
 // Factory interface for death tests.  May be mocked out for testing.
 class DeathTestFactory {
  public:
-  virtual ~DeathTestFactory() { }
+  virtual ~DeathTestFactory() {}
   virtual bool Create(const char* statement,
                       Matcher<const std::string&> matcher, const char* file,
                       int line, DeathTest** test) = 0;
@@ -186,28 +193,28 @@ inline Matcher<const ::std::string&> MakeDeathTestMatcher(
 
 // Traps C++ exceptions escaping statement and reports them as test
 // failures. Note that trapping SEH exceptions is not implemented here.
-# if GTEST_HAS_EXCEPTIONS
-#  define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
-  try { \
-    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
-  } catch (const ::std::exception& gtest_exception) { \
-    fprintf(\
-        stderr, \
-        "\n%s: Caught std::exception-derived exception escaping the " \
-        "death test statement. Exception message: %s\n", \
+#if GTEST_HAS_EXCEPTIONS
+#define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test)           \
+  try {                                                                      \
+    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement);               \
+  } catch (const ::std::exception& gtest_exception) {                        \
+    fprintf(                                                                 \
+        stderr,                                                              \
+        "\n%s: Caught std::exception-derived exception escaping the "        \
+        "death test statement. Exception message: %s\n",                     \
         ::testing::internal::FormatFileLocation(__FILE__, __LINE__).c_str(), \
-        gtest_exception.what()); \
-    fflush(stderr); \
+        gtest_exception.what());                                             \
+    fflush(stderr);                                                          \
     death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \
-  } catch (...) { \
+  } catch (...) {                                                            \
     death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \
   }
 
-# else
-#  define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
+#else
+#define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
   GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
 
-# endif
+#endif
 
 // This macro is for implementing ASSERT_DEATH*, EXPECT_DEATH*,
 // ASSERT_EXIT*, and EXPECT_EXIT*.
@@ -236,8 +243,6 @@ inline Matcher<const ::std::string&> MakeDeathTestMatcher(
           gtest_dt->Abort(::testing::internal::DeathTest::TEST_DID_NOT_DIE);   \
           break;                                                               \
         }                                                                      \
-        default:                                                               \
-          break;                                                               \
       }                                                                        \
     }                                                                          \
   } else                                                                       \
@@ -265,16 +270,12 @@ inline Matcher<const ::std::string&> MakeDeathTestMatcher(
 // RUN_ALL_TESTS was called.
 class InternalRunDeathTestFlag {
  public:
-  InternalRunDeathTestFlag(const std::string& a_file,
-                           int a_line,
-                           int an_index,
+  InternalRunDeathTestFlag(const std::string& a_file, int a_line, int an_index,
                            int a_write_fd)
-      : file_(a_file), line_(a_line), index_(an_index),
-        write_fd_(a_write_fd) {}
+      : file_(a_file), line_(a_line), index_(an_index), write_fd_(a_write_fd) {}
 
   ~InternalRunDeathTestFlag() {
-    if (write_fd_ >= 0)
-      posix::Close(write_fd_);
+    if (write_fd_ >= 0) posix::Close(write_fd_);
   }
 
   const std::string& file() const { return file_; }
@@ -288,7 +289,8 @@ class InternalRunDeathTestFlag {
   int index_;
   int write_fd_;
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(InternalRunDeathTestFlag);
+  InternalRunDeathTestFlag(const InternalRunDeathTestFlag&) = delete;
+  InternalRunDeathTestFlag& operator=(const InternalRunDeathTestFlag&) = delete;
 };
 
 // Returns a newly created InternalRunDeathTestFlag object with fields
@@ -301,4 +303,4 @@ InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag();
 }  // namespace internal
 }  // namespace testing
 
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
diff --git a/third_party/googletest/src/include/gtest/internal/gtest-filepath.h b/third_party/googletest/src/include/gtest/internal/gtest-filepath.h
index c11b101516..a2a60a962b 100644
--- a/third_party/googletest/src/include/gtest/internal/gtest-filepath.h
+++ b/third_party/googletest/src/include/gtest/internal/gtest-filepath.h
@@ -26,7 +26,7 @@
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+
 // Google Test filepath utilities
 //
 // This header file declares classes and functions used internally by
@@ -35,10 +35,12 @@
 // This file is #included in gtest/internal/gtest-internal.h.
 // Do not include this header file separately!
 
-// GOOGLETEST_CM0001 DO NOT DELETE
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
 
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
 
 #include "gtest/internal/gtest-string.h"
 
@@ -61,8 +63,8 @@ namespace internal {
 
 class GTEST_API_ FilePath {
  public:
-  FilePath() : pathname_("") { }
-  FilePath(const FilePath& rhs) : pathname_(rhs.pathname_) { }
+  FilePath() : pathname_("") {}
+  FilePath(const FilePath& rhs) : pathname_(rhs.pathname_) {}
 
   explicit FilePath(const std::string& pathname) : pathname_(pathname) {
     Normalize();
@@ -73,9 +75,7 @@ class GTEST_API_ FilePath {
     return *this;
   }
 
-  void Set(const FilePath& rhs) {
-    pathname_ = rhs.pathname_;
-  }
+  void Set(const FilePath& rhs) { pathname_ = rhs.pathname_; }
 
   const std::string& string() const { return pathname_; }
   const char* c_str() const { return pathname_.c_str(); }
@@ -88,8 +88,7 @@ class GTEST_API_ FilePath {
   // than zero (e.g., 12), returns "dir/test_12.xml".
   // On Windows platform, uses \ as the separator rather than /.
   static FilePath MakeFileName(const FilePath& directory,
-                               const FilePath& base_name,
-                               int number,
+                               const FilePath& base_name, int number,
                                const char* extension);
 
   // Given directory = "dir", relative_path = "test.xml",
@@ -195,7 +194,7 @@ class GTEST_API_ FilePath {
 
   void Normalize();
 
-  // Returns a pointer to the last occurence of a valid path separator in
+  // Returns a pointer to the last occurrence of a valid path separator in
   // the FilePath. On Windows, for example, both '/' and '\' are valid path
   // separators. Returns NULL if no path separator was found.
   const char* FindLastPathSeparator() const;
@@ -208,4 +207,4 @@ class GTEST_API_ FilePath {
 
 GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
 
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
diff --git a/third_party/googletest/src/include/gtest/internal/gtest-internal.h b/third_party/googletest/src/include/gtest/internal/gtest-internal.h
index 94c816a28b..9b04e4c85f 100644
--- a/third_party/googletest/src/include/gtest/internal/gtest-internal.h
+++ b/third_party/googletest/src/include/gtest/internal/gtest-internal.h
@@ -26,33 +26,37 @@
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+
 // The Google C++ Testing and Mocking Framework (Google Test)
 //
 // This header file declares functions and macros used internally by
 // Google Test.  They are subject to change without notice.
 
-// GOOGLETEST_CM0001 DO NOT DELETE
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
 
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
 
 #include "gtest/internal/gtest-port.h"
 
 #if GTEST_OS_LINUX
-# include <stdlib.h>
-# include <sys/types.h>
-# include <sys/wait.h>
-# include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
 #endif  // GTEST_OS_LINUX
 
 #if GTEST_HAS_EXCEPTIONS
-# include <stdexcept>
+#include <stdexcept>
 #endif
 
 #include <ctype.h>
 #include <float.h>
 #include <string.h>
+
+#include <cstdint>
 #include <iomanip>
 #include <limits>
 #include <map>
@@ -75,32 +79,43 @@
 // the current line number.  For more details, see
 // http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.6
 #define GTEST_CONCAT_TOKEN_(foo, bar) GTEST_CONCAT_TOKEN_IMPL_(foo, bar)
-#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo ## bar
+#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo##bar
 
 // Stringifies its argument.
-#define GTEST_STRINGIFY_(name) #name
+// Work around a bug in visual studio which doesn't accept code like this:
+//
+//   #define GTEST_STRINGIFY_(name) #name
+//   #define MACRO(a, b, c) ... GTEST_STRINGIFY_(a) ...
+//   MACRO(, x, y)
+//
+// Complaining about the argument to GTEST_STRINGIFY_ being empty.
+// This is allowed by the spec.
+#define GTEST_STRINGIFY_HELPER_(name, ...) #name
+#define GTEST_STRINGIFY_(...) GTEST_STRINGIFY_HELPER_(__VA_ARGS__, )
 
-namespace proto2 { class Message; }
+namespace proto2 {
+class MessageLite;
+}
 
 namespace testing {
 
 // Forward declarations.
 
-class AssertionResult;                 // Result of an assertion.
-class Message;                         // Represents a failure message.
-class Test;                            // Represents a test.
-class TestInfo;                        // Information about a test.
-class TestPartResult;                  // Result of a test part.
-class UnitTest;                        // A collection of test suites.
+class AssertionResult;  // Result of an assertion.
+class Message;          // Represents a failure message.
+class Test;             // Represents a test.
+class TestInfo;         // Information about a test.
+class TestPartResult;   // Result of a test part.
+class UnitTest;         // A collection of test suites.
 
 template <typename T>
 ::std::string PrintToString(const T& value);
 
 namespace internal {
 
-struct TraceInfo;                      // Information about a trace point.
-class TestInfoImpl;                    // Opaque implementation of TestInfo
-class UnitTestImpl;                    // Opaque implementation of UnitTest
+struct TraceInfo;    // Information about a trace point.
+class TestInfoImpl;  // Opaque implementation of TestInfo
+class UnitTestImpl;  // Opaque implementation of UnitTest
 
 // The text used in failure messages to indicate the start of the
 // stack trace.
@@ -109,6 +124,7 @@ GTEST_API_ extern const char kStackTraceMarker[];
 // An IgnoredValue object can be implicitly constructed from ANY value.
 class IgnoredValue {
   struct Sink {};
+
  public:
   // This constructor template allows any value to be implicitly
   // converted to IgnoredValue.  The object has no data member and
@@ -124,13 +140,13 @@ class IgnoredValue {
 };
 
 // Appends the user-supplied message to the Google-Test-generated message.
-GTEST_API_ std::string AppendUserMessage(
-    const std::string& gtest_msg, const Message& user_msg);
+GTEST_API_ std::string AppendUserMessage(const std::string& gtest_msg,
+                                         const Message& user_msg);
 
 #if GTEST_HAS_EXCEPTIONS
 
-GTEST_DISABLE_MSC_WARNINGS_PUSH_(4275 \
-/* an exported class was derived from a class that was not exported */)
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(
+    4275 /* an exported class was derived from a class that was not exported */)
 
 // This exception is thrown by (and only by) a failed Google Test
 // assertion when GTEST_FLAG(throw_on_failure) is true (if exceptions
@@ -169,14 +185,6 @@ GTEST_API_ std::string CreateUnifiedDiff(const std::vector<std::string>& left,
 
 }  // namespace edit_distance
 
-// Calculate the diff between 'left' and 'right' and return it in unified diff
-// format.
-// If not null, stores in 'total_line_count' the total number of lines found
-// in left + right.
-GTEST_API_ std::string DiffStrings(const std::string& left,
-                                   const std::string& right,
-                                   size_t* total_line_count);
-
 // Constructs and returns the message for an equality assertion
 // (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure.
 //
@@ -200,10 +208,8 @@ GTEST_API_ AssertionResult EqFailure(const char* expected_expression,
 
 // Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
 GTEST_API_ std::string GetBoolAssertionFailureMessage(
-    const AssertionResult& assertion_result,
-    const char* expression_text,
-    const char* actual_predicate_value,
-    const char* expected_predicate_value);
+    const AssertionResult& assertion_result, const char* expression_text,
+    const char* actual_predicate_value, const char* expected_predicate_value);
 
 // This template class represents an IEEE floating-point number
 // (either single-precision or double-precision, depending on the
@@ -244,11 +250,11 @@ class FloatingPoint {
   // Constants.
 
   // # of bits in a number.
-  static const size_t kBitCount = 8*sizeof(RawType);
+  static const size_t kBitCount = 8 * sizeof(RawType);
 
   // # of fraction bits in a number.
   static const size_t kFractionBitCount =
-    std::numeric_limits<RawType>::digits - 1;
+      std::numeric_limits<RawType>::digits - 1;
 
   // # of exponent bits in a number.
   static const size_t kExponentBitCount = kBitCount - 1 - kFractionBitCount;
@@ -257,8 +263,8 @@ class FloatingPoint {
   static const Bits kSignBitMask = static_cast<Bits>(1) << (kBitCount - 1);
 
   // The mask for the fraction bits.
-  static const Bits kFractionBitMask =
-    ~static_cast<Bits>(0) >> (kExponentBitCount + 1);
+  static const Bits kFractionBitMask = ~static_cast<Bits>(0) >>
+                                       (kExponentBitCount + 1);
 
   // The mask for the exponent bits.
   static const Bits kExponentBitMask = ~(kSignBitMask | kFractionBitMask);
@@ -275,7 +281,7 @@ class FloatingPoint {
   //
   // See the following article for more details on ULP:
   // http://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/
-  static const size_t kMaxUlps = 4;
+  static const uint32_t kMaxUlps = 4;
 
   // Constructs a FloatingPoint from a raw floating-point number.
   //
@@ -297,9 +303,7 @@ class FloatingPoint {
   }
 
   // Returns the floating-point number that represent positive infinity.
-  static RawType Infinity() {
-    return ReinterpretBits(kExponentBitMask);
-  }
+  static RawType Infinity() { return ReinterpretBits(kExponentBitMask); }
 
   // Returns the maximum representable finite floating-point number.
   static RawType Max();
@@ -307,7 +311,7 @@ class FloatingPoint {
   // Non-static methods
 
   // Returns the bits that represents this number.
-  const Bits &bits() const { return u_.bits_; }
+  const Bits& bits() const { return u_.bits_; }
 
   // Returns the exponent bits of this number.
   Bits exponent_bits() const { return kExponentBitMask & u_.bits_; }
@@ -336,8 +340,8 @@ class FloatingPoint {
     // a NAN must return false.
     if (is_nan() || rhs.is_nan()) return false;
 
-    return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_)
-        <= kMaxUlps;
+    return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_) <=
+           kMaxUlps;
   }
 
  private:
@@ -362,7 +366,7 @@ class FloatingPoint {
   //
   // Read http://en.wikipedia.org/wiki/Signed_number_representations
   // for more details on signed number representations.
-  static Bits SignAndMagnitudeToBiased(const Bits &sam) {
+  static Bits SignAndMagnitudeToBiased(const Bits& sam) {
     if (kSignBitMask & sam) {
       // sam represents a negative number.
       return ~sam + 1;
@@ -374,8 +378,8 @@ class FloatingPoint {
 
   // Given two numbers in the sign-and-magnitude representation,
   // returns the distance between them as an unsigned number.
-  static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits &sam1,
-                                                     const Bits &sam2) {
+  static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits& sam1,
+                                                     const Bits& sam2) {
     const Bits biased1 = SignAndMagnitudeToBiased(sam1);
     const Bits biased2 = SignAndMagnitudeToBiased(sam2);
     return (biased1 >= biased2) ? (biased1 - biased2) : (biased2 - biased1);
@@ -387,9 +391,13 @@ class FloatingPoint {
 // We cannot use std::numeric_limits<T>::max() as it clashes with the max()
 // macro defined by <windows.h>.
 template <>
-inline float FloatingPoint<float>::Max() { return FLT_MAX; }
+inline float FloatingPoint<float>::Max() {
+  return FLT_MAX;
+}
 template <>
-inline double FloatingPoint<double>::Max() { return DBL_MAX; }
+inline double FloatingPoint<double>::Max() {
+  return DBL_MAX;
+}
 
 // Typedefs the instances of the FloatingPoint template class that we
 // care to use.
@@ -449,7 +457,8 @@ class TestFactoryBase {
   TestFactoryBase() {}
 
  private:
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestFactoryBase);
+  TestFactoryBase(const TestFactoryBase&) = delete;
+  TestFactoryBase& operator=(const TestFactoryBase&) = delete;
 };
 
 // This class provides implementation of TeastFactoryBase interface.
@@ -498,16 +507,17 @@ inline SetUpTearDownSuiteFuncType GetNotDefaultOrNull(
 
 template <typename T>
 //  Note that SuiteApiResolver inherits from T because
-//  SetUpTestSuite()/TearDownTestSuite() could be protected. Ths way
+//  SetUpTestSuite()/TearDownTestSuite() could be protected. This way
 //  SuiteApiResolver can access them.
 struct SuiteApiResolver : T {
   // testing::Test is only forward declared at this point. So we make it a
-  // dependend class for the compiler to be OK with it.
+  // dependent class for the compiler to be OK with it.
   using Test =
       typename std::conditional<sizeof(T) != 0, ::testing::Test, void>::type;
 
   static SetUpTearDownSuiteFuncType GetSetUpCaseOrSuite(const char* filename,
                                                         int line_num) {
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
     SetUpTearDownSuiteFuncType test_case_fp =
         GetNotDefaultOrNull(&T::SetUpTestCase, &Test::SetUpTestCase);
     SetUpTearDownSuiteFuncType test_suite_fp =
@@ -519,10 +529,16 @@ struct SuiteApiResolver : T {
         << filename << ":" << line_num;
 
     return test_case_fp != nullptr ? test_case_fp : test_suite_fp;
+#else
+    (void)(filename);
+    (void)(line_num);
+    return &T::SetUpTestSuite;
+#endif
   }
 
   static SetUpTearDownSuiteFuncType GetTearDownCaseOrSuite(const char* filename,
                                                            int line_num) {
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
     SetUpTearDownSuiteFuncType test_case_fp =
         GetNotDefaultOrNull(&T::TearDownTestCase, &Test::TearDownTestCase);
     SetUpTearDownSuiteFuncType test_suite_fp =
@@ -534,6 +550,11 @@ struct SuiteApiResolver : T {
         << filename << ":" << line_num;
 
     return test_case_fp != nullptr ? test_case_fp : test_suite_fp;
+#else
+    (void)(filename);
+    (void)(line_num);
+    return &T::TearDownTestSuite;
+#endif
   }
 };
 
@@ -542,11 +563,11 @@ struct SuiteApiResolver : T {
 //
 // Arguments:
 //
-//   test_suite_name:   name of the test suite
+//   test_suite_name:  name of the test suite
 //   name:             name of the test
-//   type_param        the name of the test's type parameter, or NULL if
+//   type_param:       the name of the test's type parameter, or NULL if
 //                     this is not a typed or a type-parameterized test.
-//   value_param       text representation of the test's value parameter,
+//   value_param:      text representation of the test's value parameter,
 //                     or NULL if this is not a type-parameterized test.
 //   code_location:    code location where the test is defined
 //   fixture_class_id: ID of the test fixture class
@@ -566,8 +587,6 @@ GTEST_API_ TestInfo* MakeAndRegisterTestInfo(
 // and returns false.  None of pstr, *pstr, and prefix can be NULL.
 GTEST_API_ bool SkipPrefix(const char* prefix, const char** pstr);
 
-#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
-
 GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
 /* class A needs to have dll-interface to be used by clients of class B */)
 
@@ -607,8 +626,9 @@ class GTEST_API_ TypedTestSuitePState {
   // Verifies that registered_tests match the test names in
   // defined_test_names_; returns registered_tests if successful, or
   // aborts the program otherwise.
-  const char* VerifyRegisteredTestNames(
-      const char* file, int line, const char* registered_tests);
+  const char* VerifyRegisteredTestNames(const char* test_suite_name,
+                                        const char* file, int line,
+                                        const char* registered_tests);
 
  private:
   typedef ::std::map<std::string, CodeLocation> RegisteredTestsMap;
@@ -631,7 +651,8 @@ inline const char* SkipComma(const char* str) {
   if (comma == nullptr) {
     return nullptr;
   }
-  while (IsSpace(*(++comma))) {}
+  while (IsSpace(*(++comma))) {
+  }
   return comma;
 }
 
@@ -645,7 +666,7 @@ inline std::string GetPrefixUntilComma(const char* str) {
 // Splits a given string on a given delimiter, populating a given
 // vector with the fields.
 void SplitString(const ::std::string& str, char delimiter,
-                 ::std::vector< ::std::string>* dest);
+                 ::std::vector<::std::string>* dest);
 
 // The default argument to the template below for the case when the user does
 // not provide a name generator.
@@ -662,7 +683,7 @@ struct NameGeneratorSelector {
 };
 
 template <typename NameGenerator>
-void GenerateNamesRecursively(Types0, std::vector<std::string>*, int) {}
+void GenerateNamesRecursively(internal::None, std::vector<std::string>*, int) {}
 
 template <typename NameGenerator, typename Types>
 void GenerateNamesRecursively(Types, std::vector<std::string>* result, int i) {
@@ -729,7 +750,7 @@ class TypeParameterizedTest {
 
 // The base case for the compile time recursion.
 template <GTEST_TEMPLATE_ Fixture, class TestSel>
-class TypeParameterizedTest<Fixture, TestSel, Types0> {
+class TypeParameterizedTest<Fixture, TestSel, internal::None> {
  public:
   static bool Register(const char* /*prefix*/, const CodeLocation&,
                        const char* /*case_name*/, const char* /*test_names*/,
@@ -740,6 +761,11 @@ class TypeParameterizedTest<Fixture, TestSel, Types0> {
   }
 };
 
+GTEST_API_ void RegisterTypeParameterizedTestSuite(const char* test_suite_name,
+                                                   CodeLocation code_location);
+GTEST_API_ void RegisterTypeParameterizedTestSuiteInstantiation(
+    const char* case_name);
+
 // TypeParameterizedTestSuite<Fixture, Tests, Types>::Register()
 // registers *all combinations* of 'Tests' and 'Types' with Google
 // Test.  The return value is insignificant - we just need to return
@@ -752,13 +778,14 @@ class TypeParameterizedTestSuite {
                        const char* test_names,
                        const std::vector<std::string>& type_names =
                            GenerateNames<DefaultNameGenerator, Types>()) {
-    std::string test_name = StripTrailingSpaces(
-        GetPrefixUntilComma(test_names));
+    RegisterTypeParameterizedTestSuiteInstantiation(case_name);
+    std::string test_name =
+        StripTrailingSpaces(GetPrefixUntilComma(test_names));
     if (!state->TestExists(test_name)) {
       fprintf(stderr, "Failed to get code location for test %s.%s at %s.",
               case_name, test_name.c_str(),
-              FormatFileLocation(code_location.file.c_str(),
-                                 code_location.line).c_str());
+              FormatFileLocation(code_location.file.c_str(), code_location.line)
+                  .c_str());
       fflush(stderr);
       posix::Abort();
     }
@@ -781,7 +808,7 @@ class TypeParameterizedTestSuite {
 
 // The base case for the compile time recursion.
 template <GTEST_TEMPLATE_ Fixture, typename Types>
-class TypeParameterizedTestSuite<Fixture, Templates0, Types> {
+class TypeParameterizedTestSuite<Fixture, internal::None, Types> {
  public:
   static bool Register(const char* /*prefix*/, const CodeLocation&,
                        const TypedTestSuitePState* /*state*/,
@@ -792,8 +819,6 @@ class TypeParameterizedTestSuite<Fixture, Templates0, Types> {
   }
 };
 
-#endif  // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
-
 // Returns the current OS stack trace as an std::string.
 //
 // The maximum number of stack frames to be included is specified by
@@ -804,8 +829,8 @@ class TypeParameterizedTestSuite<Fixture, Templates0, Types> {
 // For example, if Foo() calls Bar(), which in turn calls
 // GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
 // the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
-GTEST_API_ std::string GetCurrentOsStackTraceExceptTop(
-    UnitTest* unit_test, int skip_count);
+GTEST_API_ std::string GetCurrentOsStackTraceExceptTop(UnitTest* unit_test,
+                                                       int skip_count);
 
 // Helpers for suppressing warnings on unreachable code or constant
 // condition.
@@ -825,6 +850,16 @@ struct GTEST_API_ ConstCharPtr {
   const char* value;
 };
 
+// Helper for declaring std::string within 'if' statement
+// in pre C++17 build environment.
+struct TrueWithString {
+  TrueWithString() = default;
+  explicit TrueWithString(const char* str) : value(str) {}
+  explicit TrueWithString(const std::string& str) : value(str) {}
+  explicit operator bool() const { return true; }
+  std::string value;
+};
+
 // A simple Linear Congruential Generator for generating random
 // numbers with a uniform distribution.  Unlike rand() and srand(), it
 // doesn't use global state (and therefore can't interfere with user
@@ -832,31 +867,54 @@ struct GTEST_API_ ConstCharPtr {
 // but it's good enough for our purposes.
 class GTEST_API_ Random {
  public:
-  static const UInt32 kMaxRange = 1u << 31;
+  static const uint32_t kMaxRange = 1u << 31;
 
-  explicit Random(UInt32 seed) : state_(seed) {}
+  explicit Random(uint32_t seed) : state_(seed) {}
 
-  void Reseed(UInt32 seed) { state_ = seed; }
+  void Reseed(uint32_t seed) { state_ = seed; }
 
   // Generates a random number from [0, range).  Crashes if 'range' is
   // 0 or greater than kMaxRange.
-  UInt32 Generate(UInt32 range);
+  uint32_t Generate(uint32_t range);
 
  private:
-  UInt32 state_;
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(Random);
+  uint32_t state_;
+  Random(const Random&) = delete;
+  Random& operator=(const Random&) = delete;
 };
 
 // Turns const U&, U&, const U, and U all into U.
 #define GTEST_REMOVE_REFERENCE_AND_CONST_(T) \
   typename std::remove_const<typename std::remove_reference<T>::type>::type
 
-// IsAProtocolMessage<T>::value is a compile-time bool constant that's
-// true if and only if T is type proto2::Message or a subclass of it.
+// HasDebugStringAndShortDebugString<T>::value is a compile-time bool constant
+// that's true if and only if T has methods DebugString() and ShortDebugString()
+// that return std::string.
+template <typename T>
+class HasDebugStringAndShortDebugString {
+ private:
+  template <typename C>
+  static auto CheckDebugString(C*) -> typename std::is_same<
+      std::string, decltype(std::declval<const C>().DebugString())>::type;
+  template <typename>
+  static std::false_type CheckDebugString(...);
+
+  template <typename C>
+  static auto CheckShortDebugString(C*) -> typename std::is_same<
+      std::string, decltype(std::declval<const C>().ShortDebugString())>::type;
+  template <typename>
+  static std::false_type CheckShortDebugString(...);
+
+  using HasDebugStringType = decltype(CheckDebugString<T>(nullptr));
+  using HasShortDebugStringType = decltype(CheckShortDebugString<T>(nullptr));
+
+ public:
+  static constexpr bool value =
+      HasDebugStringType::value && HasShortDebugStringType::value;
+};
+
 template <typename T>
-struct IsAProtocolMessage
-    : public bool_constant<
-          std::is_convertible<const T*, const ::proto2::Message*>::value> {};
+constexpr bool HasDebugStringAndShortDebugString<T>::value;
 
 // When the compiler sees expression IsContainerTest<C>(0), if C is an
 // STL-style container class, the first overload of IsContainerTest
@@ -895,7 +953,9 @@ IsContainer IsContainerTest(int /* dummy */) {
 
 typedef char IsNotContainer;
 template <class C>
-IsNotContainer IsContainerTest(long /* dummy */) { return '\0'; }
+IsNotContainer IsContainerTest(long /* dummy */) {
+  return '\0';
+}
 
 // Trait to detect whether a type T is a hash table.
 // The heuristic used is that the type contains an inner type `hasher` and does
@@ -958,11 +1018,13 @@ bool ArrayEq(const T* lhs, size_t size, const U* rhs);
 
 // This generic version is used when k is 0.
 template <typename T, typename U>
-inline bool ArrayEq(const T& lhs, const U& rhs) { return lhs == rhs; }
+inline bool ArrayEq(const T& lhs, const U& rhs) {
+  return lhs == rhs;
+}
 
 // This overload is used when k >= 1.
 template <typename T, typename U, size_t N>
-inline bool ArrayEq(const T(&lhs)[N], const U(&rhs)[N]) {
+inline bool ArrayEq(const T (&lhs)[N], const U (&rhs)[N]) {
   return internal::ArrayEq(lhs, N, rhs);
 }
 
@@ -972,8 +1034,7 @@ inline bool ArrayEq(const T(&lhs)[N], const U(&rhs)[N]) {
 template <typename T, typename U>
 bool ArrayEq(const T* lhs, size_t size, const U* rhs) {
   for (size_t i = 0; i != size; i++) {
-    if (!internal::ArrayEq(lhs[i], rhs[i]))
-      return false;
+    if (!internal::ArrayEq(lhs[i], rhs[i])) return false;
   }
   return true;
 }
@@ -983,8 +1044,7 @@ bool ArrayEq(const T* lhs, size_t size, const U* rhs) {
 template <typename Iter, typename Element>
 Iter ArrayAwareFind(Iter begin, Iter end, const Element& elem) {
   for (Iter it = begin; it != end; ++it) {
-    if (internal::ArrayEq(*it, elem))
-      return it;
+    if (internal::ArrayEq(*it, elem)) return it;
   }
   return end;
 }
@@ -998,11 +1058,13 @@ void CopyArray(const T* from, size_t size, U* to);
 
 // This generic version is used when k is 0.
 template <typename T, typename U>
-inline void CopyArray(const T& from, U* to) { *to = from; }
+inline void CopyArray(const T& from, U* to) {
+  *to = from;
+}
 
 // This overload is used when k >= 1.
 template <typename T, typename U, size_t N>
-inline void CopyArray(const T(&from)[N], U(*to)[N]) {
+inline void CopyArray(const T (&from)[N], U (*to)[N]) {
   internal::CopyArray(from, N, *to);
 }
 
@@ -1055,8 +1117,7 @@ class NativeArray {
   }
 
   ~NativeArray() {
-    if (clone_ != &NativeArray::InitRef)
-      delete[] array_;
+    if (clone_ != &NativeArray::InitRef) delete[] array_;
   }
 
   // STL-style container methods.
@@ -1064,8 +1125,7 @@ class NativeArray {
   const_iterator begin() const { return array_; }
   const_iterator end() const { return array_ + size_; }
   bool operator==(const NativeArray& rhs) const {
-    return size() == rhs.size() &&
-        ArrayEq(begin(), size(), rhs.begin());
+    return size() == rhs.size() && ArrayEq(begin(), size(), rhs.begin());
   }
 
  private:
@@ -1092,8 +1152,6 @@ class NativeArray {
   const Element* array_;
   size_t size_;
   void (NativeArray::*clone_)(const Element*, size_t);
-
-  GTEST_DISALLOW_ASSIGN_(NativeArray);
 };
 
 // Backport of std::index_sequence.
@@ -1117,32 +1175,44 @@ struct DoubleSequence<false, IndexSequence<I...>, sizeofT> {
 // Backport of std::make_index_sequence.
 // It uses O(ln(N)) instantiation depth.
 template <size_t N>
-struct MakeIndexSequence
-    : DoubleSequence<N % 2 == 1, typename MakeIndexSequence<N / 2>::type,
+struct MakeIndexSequenceImpl
+    : DoubleSequence<N % 2 == 1, typename MakeIndexSequenceImpl<N / 2>::type,
                      N / 2>::type {};
 
 template <>
-struct MakeIndexSequence<0> : IndexSequence<> {};
+struct MakeIndexSequenceImpl<0> : IndexSequence<> {};
 
-// FIXME: This implementation of ElemFromList is O(1) in instantiation depth,
-// but it is O(N^2) in total instantiations. Not sure if this is the best
-// tradeoff, as it will make it somewhat slow to compile.
-template <typename T, size_t, size_t>
-struct ElemFromListImpl {};
+template <size_t N>
+using MakeIndexSequence = typename MakeIndexSequenceImpl<N>::type;
 
-template <typename T, size_t I>
-struct ElemFromListImpl<T, I, I> {
-  using type = T;
+template <typename... T>
+using IndexSequenceFor = typename MakeIndexSequence<sizeof...(T)>::type;
+
+template <size_t>
+struct Ignore {
+  Ignore(...);  // NOLINT
+};
+
+template <typename>
+struct ElemFromListImpl;
+template <size_t... I>
+struct ElemFromListImpl<IndexSequence<I...>> {
+  // We make Ignore a template to solve a problem with MSVC.
+  // A non-template Ignore would work fine with `decltype(Ignore(I))...`, but
+  // MSVC doesn't understand how to deal with that pack expansion.
+  // Use `0 * I` to have a single instantiation of Ignore.
+  template <typename R>
+  static R Apply(Ignore<0 * I>..., R (*)(), ...);
 };
 
-// Get the Nth element from T...
-// It uses O(1) instantiation depth.
-template <size_t N, typename I, typename... T>
-struct ElemFromList;
+template <size_t N, typename... T>
+struct ElemFromList {
+  using type =
+      decltype(ElemFromListImpl<typename MakeIndexSequence<N>::type>::Apply(
+          static_cast<T (*)()>(nullptr)...));
+};
 
-template <size_t N, size_t... I, typename... T>
-struct ElemFromList<N, IndexSequence<I...>, T...>
-    : ElemFromListImpl<T, N, I>... {};
+struct FlatTupleConstructTag {};
 
 template <typename... T>
 class FlatTuple;
@@ -1152,11 +1222,11 @@ struct FlatTupleElemBase;
 
 template <typename... T, size_t I>
 struct FlatTupleElemBase<FlatTuple<T...>, I> {
-  using value_type =
-      typename ElemFromList<I, typename MakeIndexSequence<sizeof...(T)>::type,
-                            T...>::type;
+  using value_type = typename ElemFromList<I, T...>::type;
   FlatTupleElemBase() = default;
-  explicit FlatTupleElemBase(value_type t) : value(std::move(t)) {}
+  template <typename Arg>
+  explicit FlatTupleElemBase(FlatTupleConstructTag, Arg&& t)
+      : value(std::forward<Arg>(t)) {}
   value_type value;
 };
 
@@ -1168,13 +1238,35 @@ struct FlatTupleBase<FlatTuple<T...>, IndexSequence<Idx...>>
     : FlatTupleElemBase<FlatTuple<T...>, Idx>... {
   using Indices = IndexSequence<Idx...>;
   FlatTupleBase() = default;
-  explicit FlatTupleBase(T... t)
-      : FlatTupleElemBase<FlatTuple<T...>, Idx>(std::move(t))... {}
+  template <typename... Args>
+  explicit FlatTupleBase(FlatTupleConstructTag, Args&&... args)
+      : FlatTupleElemBase<FlatTuple<T...>, Idx>(FlatTupleConstructTag{},
+                                                std::forward<Args>(args))... {}
+
+  template <size_t I>
+  const typename ElemFromList<I, T...>::type& Get() const {
+    return FlatTupleElemBase<FlatTuple<T...>, I>::value;
+  }
+
+  template <size_t I>
+  typename ElemFromList<I, T...>::type& Get() {
+    return FlatTupleElemBase<FlatTuple<T...>, I>::value;
+  }
+
+  template <typename F>
+  auto Apply(F&& f) -> decltype(std::forward<F>(f)(this->Get<Idx>()...)) {
+    return std::forward<F>(f)(Get<Idx>()...);
+  }
+
+  template <typename F>
+  auto Apply(F&& f) const -> decltype(std::forward<F>(f)(this->Get<Idx>()...)) {
+    return std::forward<F>(f)(Get<Idx>()...);
+  }
 };
 
 // Analog to std::tuple but with different tradeoffs.
 // This class minimizes the template instantiation depth, thus allowing more
-// elements that std::tuple would. std::tuple has been seen to require an
+// elements than std::tuple would. std::tuple has been seen to require an
 // instantiation depth of more than 10x the number of elements in some
 // implementations.
 // FlatTuple and ElemFromList are not recursive and have a fixed depth
@@ -1185,21 +1277,17 @@ template <typename... T>
 class FlatTuple
     : private FlatTupleBase<FlatTuple<T...>,
                             typename MakeIndexSequence<sizeof...(T)>::type> {
-  using Indices = typename FlatTuple::FlatTupleBase::Indices;
+  using Indices = typename FlatTupleBase<
+      FlatTuple<T...>, typename MakeIndexSequence<sizeof...(T)>::type>::Indices;
 
  public:
   FlatTuple() = default;
-  explicit FlatTuple(T... t) : FlatTuple::FlatTupleBase(std::move(t)...) {}
-
-  template <size_t I>
-  const typename ElemFromList<I, Indices, T...>::type& Get() const {
-    return static_cast<const FlatTupleElemBase<FlatTuple, I>*>(this)->value;
-  }
+  template <typename... Args>
+  explicit FlatTuple(FlatTupleConstructTag tag, Args&&... args)
+      : FlatTuple::FlatTupleBase(tag, std::forward<Args>(args)...) {}
 
-  template <size_t I>
-  typename ElemFromList<I, Indices, T...>::type& Get() {
-    return static_cast<FlatTupleElemBase<FlatTuple, I>*>(this)->value;
-  }
+  using FlatTuple::FlatTupleBase::Apply;
+  using FlatTuple::FlatTupleBase::Get;
 };
 
 // Utility functions to be called with static_assert to induce deprecation
@@ -1232,9 +1320,25 @@ constexpr bool InstantiateTypedTestCase_P_IsDeprecated() { return true; }
 }  // namespace internal
 }  // namespace testing
 
-#define GTEST_MESSAGE_AT_(file, line, message, result_type) \
-  ::testing::internal::AssertHelper(result_type, file, line, message) \
-    = ::testing::Message()
+namespace std {
+// Some standard library implementations use `struct tuple_size` and some use
+// `class tuple_size`. Clang warns about the mismatch.
+// https://reviews.llvm.org/D55466
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wmismatched-tags"
+#endif
+template <typename... Ts>
+struct tuple_size<testing::internal::FlatTuple<Ts...>>
+    : std::integral_constant<size_t, sizeof...(Ts)> {};
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+}  // namespace std
+
+#define GTEST_MESSAGE_AT_(file, line, message, result_type)             \
+  ::testing::internal::AssertHelper(result_type, file, line, message) = \
+      ::testing::Message()
 
 #define GTEST_MESSAGE_(message, result_type) \
   GTEST_MESSAGE_AT_(__FILE__, __LINE__, message, result_type)
@@ -1254,127 +1358,213 @@ constexpr bool InstantiateTypedTestCase_P_IsDeprecated() { return true; }
 // Suppress MSVC warning 4072 (unreachable code) for the code following
 // statement if it returns or throws (or doesn't return or throw in some
 // situations).
+// NOTE: The "else" is important to keep this expansion to prevent a top-level
+// "else" from attaching to our "if".
 #define GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) \
-  if (::testing::internal::AlwaysTrue()) { statement; }
-
-#define GTEST_TEST_THROW_(statement, expected_exception, fail) \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-  if (::testing::internal::ConstCharPtr gtest_msg = "") { \
-    bool gtest_caught_expected = false; \
-    try { \
-      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
-    } \
-    catch (expected_exception const&) { \
-      gtest_caught_expected = true; \
-    } \
-    catch (...) { \
-      gtest_msg.value = \
-          "Expected: " #statement " throws an exception of type " \
-          #expected_exception ".\n  Actual: it throws a different type."; \
-      goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \
-    } \
-    if (!gtest_caught_expected) { \
-      gtest_msg.value = \
-          "Expected: " #statement " throws an exception of type " \
-          #expected_exception ".\n  Actual: it throws nothing."; \
-      goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \
-    } \
-  } else \
-    GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__): \
-      fail(gtest_msg.value)
-
-#define GTEST_TEST_NO_THROW_(statement, fail) \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-  if (::testing::internal::AlwaysTrue()) { \
-    try { \
-      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
-    } \
-    catch (...) { \
-      goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \
-    } \
-  } else \
-    GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__): \
-      fail("Expected: " #statement " doesn't throw an exception.\n" \
-           "  Actual: it throws.")
-
-#define GTEST_TEST_ANY_THROW_(statement, fail) \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-  if (::testing::internal::AlwaysTrue()) { \
-    bool gtest_caught_any = false; \
-    try { \
-      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
-    } \
-    catch (...) { \
-      gtest_caught_any = true; \
-    } \
-    if (!gtest_caught_any) { \
-      goto GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__); \
-    } \
-  } else \
-    GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__): \
-      fail("Expected: " #statement " throws an exception.\n" \
-           "  Actual: it doesn't.")
+  if (::testing::internal::AlwaysTrue()) {                        \
+    statement;                                                    \
+  } else                     /* NOLINT */                         \
+    static_assert(true, "")  // User must have a semicolon after expansion.
+
+#if GTEST_HAS_EXCEPTIONS
+
+namespace testing {
+namespace internal {
+
+class NeverThrown {
+ public:
+  const char* what() const noexcept {
+    return "this exception should never be thrown";
+  }
+};
+
+}  // namespace internal
+}  // namespace testing
+
+#if GTEST_HAS_RTTI
+
+#define GTEST_EXCEPTION_TYPE_(e) ::testing::internal::GetTypeName(typeid(e))
+
+#else  // GTEST_HAS_RTTI
+
+#define GTEST_EXCEPTION_TYPE_(e) \
+  std::string { "an std::exception-derived error" }
 
+#endif  // GTEST_HAS_RTTI
+
+#define GTEST_TEST_THROW_CATCH_STD_EXCEPTION_(statement, expected_exception)   \
+  catch (typename std::conditional<                                            \
+         std::is_same<typename std::remove_cv<typename std::remove_reference<  \
+                          expected_exception>::type>::type,                    \
+                      std::exception>::value,                                  \
+         const ::testing::internal::NeverThrown&, const std::exception&>::type \
+             e) {                                                              \
+    gtest_msg.value = "Expected: " #statement                                  \
+                      " throws an exception of type " #expected_exception      \
+                      ".\n  Actual: it throws ";                               \
+    gtest_msg.value += GTEST_EXCEPTION_TYPE_(e);                               \
+    gtest_msg.value += " with description \"";                                 \
+    gtest_msg.value += e.what();                                               \
+    gtest_msg.value += "\".";                                                  \
+    goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__);                \
+  }
+
+#else  // GTEST_HAS_EXCEPTIONS
+
+#define GTEST_TEST_THROW_CATCH_STD_EXCEPTION_(statement, expected_exception)
+
+#endif  // GTEST_HAS_EXCEPTIONS
+
+#define GTEST_TEST_THROW_(statement, expected_exception, fail)              \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                             \
+  if (::testing::internal::TrueWithString gtest_msg{}) {                    \
+    bool gtest_caught_expected = false;                                     \
+    try {                                                                   \
+      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement);            \
+    } catch (expected_exception const&) {                                   \
+      gtest_caught_expected = true;                                         \
+    }                                                                       \
+    GTEST_TEST_THROW_CATCH_STD_EXCEPTION_(statement, expected_exception)    \
+    catch (...) {                                                           \
+      gtest_msg.value = "Expected: " #statement                             \
+                        " throws an exception of type " #expected_exception \
+                        ".\n  Actual: it throws a different type.";         \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__);           \
+    }                                                                       \
+    if (!gtest_caught_expected) {                                           \
+      gtest_msg.value = "Expected: " #statement                             \
+                        " throws an exception of type " #expected_exception \
+                        ".\n  Actual: it throws nothing.";                  \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__);           \
+    }                                                                       \
+  } else /*NOLINT*/                                                         \
+    GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__)                   \
+        : fail(gtest_msg.value.c_str())
+
+#if GTEST_HAS_EXCEPTIONS
+
+#define GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_()                \
+  catch (std::exception const& e) {                               \
+    gtest_msg.value = "it throws ";                               \
+    gtest_msg.value += GTEST_EXCEPTION_TYPE_(e);                  \
+    gtest_msg.value += " with description \"";                    \
+    gtest_msg.value += e.what();                                  \
+    gtest_msg.value += "\".";                                     \
+    goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \
+  }
+
+#else  // GTEST_HAS_EXCEPTIONS
+
+#define GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_()
+
+#endif  // GTEST_HAS_EXCEPTIONS
+
+#define GTEST_TEST_NO_THROW_(statement, fail)                            \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                          \
+  if (::testing::internal::TrueWithString gtest_msg{}) {                 \
+    try {                                                                \
+      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement);         \
+    }                                                                    \
+    GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_()                           \
+    catch (...) {                                                        \
+      gtest_msg.value = "it throws.";                                    \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__);      \
+    }                                                                    \
+  } else                                                                 \
+    GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__)              \
+        : fail(("Expected: " #statement " doesn't throw an exception.\n" \
+                "  Actual: " +                                           \
+                gtest_msg.value)                                         \
+                   .c_str())
+
+#define GTEST_TEST_ANY_THROW_(statement, fail)                       \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                      \
+  if (::testing::internal::AlwaysTrue()) {                           \
+    bool gtest_caught_any = false;                                   \
+    try {                                                            \
+      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement);     \
+    } catch (...) {                                                  \
+      gtest_caught_any = true;                                       \
+    }                                                                \
+    if (!gtest_caught_any) {                                         \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__); \
+    }                                                                \
+  } else                                                             \
+    GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__)         \
+        : fail("Expected: " #statement                               \
+               " throws an exception.\n"                             \
+               "  Actual: it doesn't.")
 
 // Implements Boolean test assertions such as EXPECT_TRUE. expression can be
 // either a boolean expression or an AssertionResult. text is a textual
-// represenation of expression as it was passed into the EXPECT_TRUE.
+// representation of expression as it was passed into the EXPECT_TRUE.
 #define GTEST_TEST_BOOLEAN_(expression, text, actual, expected, fail) \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-  if (const ::testing::AssertionResult gtest_ar_ = \
-      ::testing::AssertionResult(expression)) \
-    ; \
-  else \
-    fail(::testing::internal::GetBoolAssertionFailureMessage(\
-        gtest_ar_, text, #actual, #expected).c_str())
-
-#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail) \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-  if (::testing::internal::AlwaysTrue()) { \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                       \
+  if (const ::testing::AssertionResult gtest_ar_ =                    \
+          ::testing::AssertionResult(expression))                     \
+    ;                                                                 \
+  else                                                                \
+    fail(::testing::internal::GetBoolAssertionFailureMessage(         \
+             gtest_ar_, text, #actual, #expected)                     \
+             .c_str())
+
+#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail)                          \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                                \
+  if (::testing::internal::AlwaysTrue()) {                                     \
     ::testing::internal::HasNewFatalFailureHelper gtest_fatal_failure_checker; \
-    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
-    if (gtest_fatal_failure_checker.has_new_fatal_failure()) { \
-      goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__); \
-    } \
-  } else \
-    GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__): \
-      fail("Expected: " #statement " doesn't generate new fatal " \
-           "failures in the current thread.\n" \
-           "  Actual: it does.")
+    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement);                 \
+    if (gtest_fatal_failure_checker.has_new_fatal_failure()) {                 \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__);            \
+    }                                                                          \
+  } else                                                                       \
+    GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__)                    \
+        : fail("Expected: " #statement                                         \
+               " doesn't generate new fatal "                                  \
+               "failures in the current thread.\n"                             \
+               "  Actual: it does.")
 
 // Expands to the name of the class that implements the given test.
 #define GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \
   test_suite_name##_##test_name##_Test
 
 // Helper macro for defining tests.
-#define GTEST_TEST_(test_suite_name, test_name, parent_class, parent_id)      \
-  static_assert(sizeof(GTEST_STRINGIFY_(test_suite_name)) > 1,                \
-                "test_suite_name must not be empty");                         \
-  static_assert(sizeof(GTEST_STRINGIFY_(test_name)) > 1,                      \
-                "test_name must not be empty");                               \
-  class GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)                    \
-      : public parent_class {                                                 \
-   public:                                                                    \
-    GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() {}                   \
-                                                                              \
-   private:                                                                   \
-    virtual void TestBody();                                                  \
-    static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_;     \
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name,   \
-                                                           test_name));       \
-  };                                                                          \
-                                                                              \
-  ::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_suite_name,          \
-                                                    test_name)::test_info_ =  \
-      ::testing::internal::MakeAndRegisterTestInfo(                           \
-          #test_suite_name, #test_name, nullptr, nullptr,                     \
-          ::testing::internal::CodeLocation(__FILE__, __LINE__), (parent_id), \
-          ::testing::internal::SuiteApiResolver<                              \
-              parent_class>::GetSetUpCaseOrSuite(__FILE__, __LINE__),         \
-          ::testing::internal::SuiteApiResolver<                              \
-              parent_class>::GetTearDownCaseOrSuite(__FILE__, __LINE__),      \
-          new ::testing::internal::TestFactoryImpl<GTEST_TEST_CLASS_NAME_(    \
-              test_suite_name, test_name)>);                                  \
+#define GTEST_TEST_(test_suite_name, test_name, parent_class, parent_id)       \
+  static_assert(sizeof(GTEST_STRINGIFY_(test_suite_name)) > 1,                 \
+                "test_suite_name must not be empty");                          \
+  static_assert(sizeof(GTEST_STRINGIFY_(test_name)) > 1,                       \
+                "test_name must not be empty");                                \
+  class GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)                     \
+      : public parent_class {                                                  \
+   public:                                                                     \
+    GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() = default;            \
+    ~GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() override = default;  \
+    GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)                         \
+    (const GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) &) = delete;     \
+    GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) & operator=(            \
+        const GTEST_TEST_CLASS_NAME_(test_suite_name,                          \
+                                     test_name) &) = delete; /* NOLINT */      \
+    GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)                         \
+    (GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) &&) noexcept = delete; \
+    GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) & operator=(            \
+        GTEST_TEST_CLASS_NAME_(test_suite_name,                                \
+                               test_name) &&) noexcept = delete; /* NOLINT */  \
+                                                                               \
+   private:                                                                    \
+    void TestBody() override;                                                  \
+    static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_;      \
+  };                                                                           \
+                                                                               \
+  ::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_suite_name,           \
+                                                    test_name)::test_info_ =   \
+      ::testing::internal::MakeAndRegisterTestInfo(                            \
+          #test_suite_name, #test_name, nullptr, nullptr,                      \
+          ::testing::internal::CodeLocation(__FILE__, __LINE__), (parent_id),  \
+          ::testing::internal::SuiteApiResolver<                               \
+              parent_class>::GetSetUpCaseOrSuite(__FILE__, __LINE__),          \
+          ::testing::internal::SuiteApiResolver<                               \
+              parent_class>::GetTearDownCaseOrSuite(__FILE__, __LINE__),       \
+          new ::testing::internal::TestFactoryImpl<GTEST_TEST_CLASS_NAME_(     \
+              test_suite_name, test_name)>);                                   \
   void GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::TestBody()
 
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
diff --git a/third_party/googletest/src/include/gtest/internal/gtest-param-util.h b/third_party/googletest/src/include/gtest/internal/gtest-param-util.h
index 97533993c0..e7af2f904a 100644
--- a/third_party/googletest/src/include/gtest/internal/gtest-param-util.h
+++ b/third_party/googletest/src/include/gtest/internal/gtest-param-util.h
@@ -27,13 +27,14 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-
 // Type and function utilities for implementing parameterized tests.
 
-// GOOGLETEST_CM0001 DO NOT DELETE
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
 
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
 
 #include <ctype.h>
 
@@ -42,21 +43,22 @@
 #include <memory>
 #include <set>
 #include <tuple>
+#include <type_traits>
 #include <utility>
 #include <vector>
 
+#include "gtest/gtest-printers.h"
+#include "gtest/gtest-test-part.h"
 #include "gtest/internal/gtest-internal.h"
 #include "gtest/internal/gtest-port.h"
-#include "gtest/gtest-printers.h"
 
 namespace testing {
 // Input to a parameterized test name generator, describing a test parameter.
 // Consists of the parameter value and the integer parameter index.
 template <class ParamType>
 struct TestParamInfo {
-  TestParamInfo(const ParamType& a_param, size_t an_index) :
-    param(a_param),
-    index(an_index) {}
+  TestParamInfo(const ParamType& a_param, size_t an_index)
+      : param(a_param), index(an_index) {}
   ParamType param;
   size_t index;
 };
@@ -82,8 +84,10 @@ namespace internal {
 GTEST_API_ void ReportInvalidTestSuiteType(const char* test_suite_name,
                                            CodeLocation code_location);
 
-template <typename> class ParamGeneratorInterface;
-template <typename> class ParamGenerator;
+template <typename>
+class ParamGeneratorInterface;
+template <typename>
+class ParamGenerator;
 
 // Interface for iterating over elements provided by an implementation
 // of ParamGeneratorInterface<T>.
@@ -127,8 +131,7 @@ class ParamIterator {
   // ParamIterator assumes ownership of the impl_ pointer.
   ParamIterator(const ParamIterator& other) : impl_(other.impl_->Clone()) {}
   ParamIterator& operator=(const ParamIterator& other) {
-    if (this != &other)
-      impl_.reset(other.impl_->Clone());
+    if (this != &other) impl_.reset(other.impl_->Clone());
     return *this;
   }
 
@@ -155,7 +158,7 @@ class ParamIterator {
  private:
   friend class ParamGenerator<T>;
   explicit ParamIterator(ParamIteratorInterface<T>* impl) : impl_(impl) {}
-  std::unique_ptr<ParamIteratorInterface<T> > impl_;
+  std::unique_ptr<ParamIteratorInterface<T>> impl_;
 };
 
 // ParamGeneratorInterface<T> is the binary interface to access generators
@@ -177,7 +180,7 @@ class ParamGeneratorInterface {
 // This class implements copy initialization semantics and the contained
 // ParamGeneratorInterface<T> instance is shared among all copies
 // of the original object. This is possible because that instance is immutable.
-template<typename T>
+template <typename T>
 class ParamGenerator {
  public:
   typedef ParamIterator<T> iterator;
@@ -194,7 +197,7 @@ class ParamGenerator {
   iterator end() const { return iterator(impl_->End()); }
 
  private:
-  std::shared_ptr<const ParamGeneratorInterface<T> > impl_;
+  std::shared_ptr<const ParamGeneratorInterface<T>> impl_;
 };
 
 // Generates values from a range of two comparable values. Can be used to
@@ -205,8 +208,10 @@ template <typename T, typename IncrementT>
 class RangeGenerator : public ParamGeneratorInterface<T> {
  public:
   RangeGenerator(T begin, T end, IncrementT step)
-      : begin_(begin), end_(end),
-        step_(step), end_index_(CalculateEndIndex(begin, end, step)) {}
+      : begin_(begin),
+        end_(end),
+        step_(step),
+        end_index_(CalculateEndIndex(begin, end, step)) {}
   ~RangeGenerator() override {}
 
   ParamIteratorInterface<T>* Begin() const override {
@@ -249,7 +254,9 @@ class RangeGenerator : public ParamGeneratorInterface<T> {
    private:
     Iterator(const Iterator& other)
         : ParamIteratorInterface<T>(),
-          base_(other.base_), value_(other.value_), index_(other.index_),
+          base_(other.base_),
+          value_(other.value_),
+          index_(other.index_),
           step_(other.step_) {}
 
     // No implementation - assignment is unsupported.
@@ -261,12 +268,10 @@ class RangeGenerator : public ParamGeneratorInterface<T> {
     const IncrementT step_;
   };  // class RangeGenerator::Iterator
 
-  static int CalculateEndIndex(const T& begin,
-                               const T& end,
+  static int CalculateEndIndex(const T& begin, const T& end,
                                const IncrementT& step) {
     int end_index = 0;
-    for (T i = begin; i < end; i = static_cast<T>(i + step))
-      end_index++;
+    for (T i = begin; i < end; i = static_cast<T>(i + step)) end_index++;
     return end_index;
   }
 
@@ -281,7 +286,6 @@ class RangeGenerator : public ParamGeneratorInterface<T> {
   const int end_index_;
 };  // class RangeGenerator
 
-
 // Generates values from a pair of STL-style iterators. Used in the
 // ValuesIn() function. The elements are copied from the source range
 // since the source can be located on the stack, and the generator
@@ -339,13 +343,13 @@ class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface<T> {
           << "The program attempted to compare iterators "
           << "from different generators." << std::endl;
       return iterator_ ==
-          CheckedDowncastToActualType<const Iterator>(&other)->iterator_;
+             CheckedDowncastToActualType<const Iterator>(&other)->iterator_;
     }
 
    private:
     Iterator(const Iterator& other)
-          // The explicit constructor call suppresses a false warning
-          // emitted by gcc when supplied with the -Wextra option.
+        // The explicit constructor call suppresses a false warning
+        // emitted by gcc when supplied with the -Wextra option.
         : ParamIteratorInterface<T>(),
           base_(other.base_),
           iterator_(other.iterator_) {}
@@ -392,8 +396,8 @@ template <class TestClass>
 class ParameterizedTestFactory : public TestFactoryBase {
  public:
   typedef typename TestClass::ParamType ParamType;
-  explicit ParameterizedTestFactory(ParamType parameter) :
-      parameter_(parameter) {}
+  explicit ParameterizedTestFactory(ParamType parameter)
+      : parameter_(parameter) {}
   Test* CreateTest() override {
     TestClass::SetParam(&parameter_);
     return new TestClass();
@@ -402,7 +406,8 @@ class ParameterizedTestFactory : public TestFactoryBase {
  private:
   const ParamType parameter_;
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestFactory);
+  ParameterizedTestFactory(const ParameterizedTestFactory&) = delete;
+  ParameterizedTestFactory& operator=(const ParameterizedTestFactory&) = delete;
 };
 
 // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
@@ -438,7 +443,8 @@ class TestMetaFactory
   }
 
  private:
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestMetaFactory);
+  TestMetaFactory(const TestMetaFactory&) = delete;
+  TestMetaFactory& operator=(const TestMetaFactory&) = delete;
 };
 
 // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
@@ -457,7 +463,7 @@ class ParameterizedTestSuiteInfoBase {
 
   // Base part of test suite name for display purposes.
   virtual const std::string& GetTestSuiteName() const = 0;
-  // Test case id to verify identity.
+  // Test suite id to verify identity.
   virtual TypeId GetTestSuiteTypeId() const = 0;
   // UnitTest class invokes this method to register tests in this
   // test suite right before running them in RUN_ALL_TESTS macro.
@@ -469,9 +475,23 @@ class ParameterizedTestSuiteInfoBase {
   ParameterizedTestSuiteInfoBase() {}
 
  private:
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteInfoBase);
+  ParameterizedTestSuiteInfoBase(const ParameterizedTestSuiteInfoBase&) =
+      delete;
+  ParameterizedTestSuiteInfoBase& operator=(
+      const ParameterizedTestSuiteInfoBase&) = delete;
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Report a the name of a test_suit as safe to ignore
+// as the side effect of construction of this type.
+struct GTEST_API_ MarkAsIgnored {
+  explicit MarkAsIgnored(const char* test_suite);
 };
 
+GTEST_API_ void InsertSyntheticTestCase(const std::string& name,
+                                        CodeLocation location, bool has_test_p);
+
 // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
 //
 // ParameterizedTestSuiteInfo accumulates tests obtained from TEST_P
@@ -494,11 +514,11 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase {
                                       CodeLocation code_location)
       : test_suite_name_(name), code_location_(code_location) {}
 
-  // Test case base name for display purposes.
+  // Test suite base name for display purposes.
   const std::string& GetTestSuiteName() const override {
     return test_suite_name_;
   }
-  // Test case id to verify identity.
+  // Test suite id to verify identity.
   TypeId GetTestSuiteTypeId() const override { return GetTypeId<TestSuite>(); }
   // TEST_P macro uses AddTestPattern() to record information
   // about a single test in a LocalTestInfo structure.
@@ -507,9 +527,10 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase {
   // parameter index. For the test SequenceA/FooTest.DoBar/1 FooTest is
   // test suite base name and DoBar is test base name.
   void AddTestPattern(const char* test_suite_name, const char* test_base_name,
-                      TestMetaFactoryBase<ParamType>* meta_factory) {
-    tests_.push_back(std::shared_ptr<TestInfo>(
-        new TestInfo(test_suite_name, test_base_name, meta_factory)));
+                      TestMetaFactoryBase<ParamType>* meta_factory,
+                      CodeLocation code_location) {
+    tests_.push_back(std::shared_ptr<TestInfo>(new TestInfo(
+        test_suite_name, test_base_name, meta_factory, code_location)));
   }
   // INSTANTIATE_TEST_SUITE_P macro uses AddGenerator() to record information
   // about a generator.
@@ -522,17 +543,19 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase {
     return 0;  // Return value used only to run this method in namespace scope.
   }
   // UnitTest class invokes this method to register tests in this test suite
-  // test suites right before running tests in RUN_ALL_TESTS macro.
+  // right before running tests in RUN_ALL_TESTS macro.
   // This method should not be called more than once on any single
   // instance of a ParameterizedTestSuiteInfoBase derived class.
   // UnitTest has a guard to prevent from calling this method more than once.
   void RegisterTests() override {
+    bool generated_instantiations = false;
+
     for (typename TestInfoContainer::iterator test_it = tests_.begin();
          test_it != tests_.end(); ++test_it) {
       std::shared_ptr<TestInfo> test_info = *test_it;
       for (typename InstantiationContainer::iterator gen_it =
-               instantiations_.begin(); gen_it != instantiations_.end();
-               ++gen_it) {
+               instantiations_.begin();
+           gen_it != instantiations_.end(); ++gen_it) {
         const std::string& instantiation_name = gen_it->name;
         ParamGenerator<ParamType> generator((*gen_it->generator)());
         ParamNameGeneratorFunc* name_func = gen_it->name_func;
@@ -540,7 +563,7 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase {
         int line = gen_it->line;
 
         std::string test_suite_name;
-        if ( !instantiation_name.empty() )
+        if (!instantiation_name.empty())
           test_suite_name = instantiation_name + "/";
         test_suite_name += test_info->test_suite_base_name;
 
@@ -549,19 +572,20 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase {
         for (typename ParamGenerator<ParamType>::iterator param_it =
                  generator.begin();
              param_it != generator.end(); ++param_it, ++i) {
+          generated_instantiations = true;
+
           Message test_name_stream;
 
-          std::string param_name = name_func(
-              TestParamInfo<ParamType>(*param_it, i));
+          std::string param_name =
+              name_func(TestParamInfo<ParamType>(*param_it, i));
 
           GTEST_CHECK_(IsValidParamName(param_name))
               << "Parameterized test name '" << param_name
-              << "' is invalid, in " << file
-              << " line " << line << std::endl;
+              << "' is invalid, in " << file << " line " << line << std::endl;
 
           GTEST_CHECK_(test_param_names.count(param_name) == 0)
-              << "Duplicate parameterized test name '" << param_name
-              << "', in " << file << " line " << line << std::endl;
+              << "Duplicate parameterized test name '" << param_name << "', in "
+              << file << " line " << line << std::endl;
 
           test_param_names.insert(param_name);
 
@@ -572,63 +596,69 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase {
           MakeAndRegisterTestInfo(
               test_suite_name.c_str(), test_name_stream.GetString().c_str(),
               nullptr,  // No type parameter.
-              PrintToString(*param_it).c_str(), code_location_,
+              PrintToString(*param_it).c_str(), test_info->code_location,
               GetTestSuiteTypeId(),
               SuiteApiResolver<TestSuite>::GetSetUpCaseOrSuite(file, line),
               SuiteApiResolver<TestSuite>::GetTearDownCaseOrSuite(file, line),
               test_info->test_meta_factory->CreateTestFactory(*param_it));
         }  // for param_it
-      }  // for gen_it
-    }  // for test_it
-  }    // RegisterTests
+      }    // for gen_it
+    }      // for test_it
+
+    if (!generated_instantiations) {
+      // There are no generaotrs, or they all generate nothing ...
+      InsertSyntheticTestCase(GetTestSuiteName(), code_location_,
+                              !tests_.empty());
+    }
+  }  // RegisterTests
 
  private:
   // LocalTestInfo structure keeps information about a single test registered
   // with TEST_P macro.
   struct TestInfo {
     TestInfo(const char* a_test_suite_base_name, const char* a_test_base_name,
-             TestMetaFactoryBase<ParamType>* a_test_meta_factory)
+             TestMetaFactoryBase<ParamType>* a_test_meta_factory,
+             CodeLocation a_code_location)
         : test_suite_base_name(a_test_suite_base_name),
           test_base_name(a_test_base_name),
-          test_meta_factory(a_test_meta_factory) {}
+          test_meta_factory(a_test_meta_factory),
+          code_location(a_code_location) {}
 
     const std::string test_suite_base_name;
     const std::string test_base_name;
-    const std::unique_ptr<TestMetaFactoryBase<ParamType> > test_meta_factory;
+    const std::unique_ptr<TestMetaFactoryBase<ParamType>> test_meta_factory;
+    const CodeLocation code_location;
   };
-  using TestInfoContainer = ::std::vector<std::shared_ptr<TestInfo> >;
+  using TestInfoContainer = ::std::vector<std::shared_ptr<TestInfo>>;
   // Records data received from INSTANTIATE_TEST_SUITE_P macros:
   //  <Instantiation name, Sequence generator creation function,
   //     Name generator function, Source file, Source line>
   struct InstantiationInfo {
-      InstantiationInfo(const std::string &name_in,
-                        GeneratorCreationFunc* generator_in,
-                        ParamNameGeneratorFunc* name_func_in,
-                        const char* file_in,
-                        int line_in)
-          : name(name_in),
-            generator(generator_in),
-            name_func(name_func_in),
-            file(file_in),
-            line(line_in) {}
-
-      std::string name;
-      GeneratorCreationFunc* generator;
-      ParamNameGeneratorFunc* name_func;
-      const char* file;
-      int line;
+    InstantiationInfo(const std::string& name_in,
+                      GeneratorCreationFunc* generator_in,
+                      ParamNameGeneratorFunc* name_func_in, const char* file_in,
+                      int line_in)
+        : name(name_in),
+          generator(generator_in),
+          name_func(name_func_in),
+          file(file_in),
+          line(line_in) {}
+
+    std::string name;
+    GeneratorCreationFunc* generator;
+    ParamNameGeneratorFunc* name_func;
+    const char* file;
+    int line;
   };
   typedef ::std::vector<InstantiationInfo> InstantiationContainer;
 
   static bool IsValidParamName(const std::string& name) {
     // Check for empty string
-    if (name.empty())
-      return false;
+    if (name.empty()) return false;
 
     // Check for invalid characters
     for (std::string::size_type index = 0; index < name.size(); ++index) {
-      if (!isalnum(name[index]) && name[index] != '_')
-        return false;
+      if (!IsAlNum(name[index]) && name[index] != '_') return false;
     }
 
     return true;
@@ -639,7 +669,9 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase {
   TestInfoContainer tests_;
   InstantiationContainer instantiations_;
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteInfo);
+  ParameterizedTestSuiteInfo(const ParameterizedTestSuiteInfo&) = delete;
+  ParameterizedTestSuiteInfo& operator=(const ParameterizedTestSuiteInfo&) =
+      delete;
 };  // class ParameterizedTestSuiteInfo
 
 //  Legacy API is deprecated but still available
@@ -682,7 +714,7 @@ class ParameterizedTestSuiteRegistry {
           // type we are looking for, so we downcast it to that type
           // without further checks.
           typed_test_info = CheckedDowncastToActualType<
-              ParameterizedTestSuiteInfo<TestSuite> >(test_suite_info);
+              ParameterizedTestSuiteInfo<TestSuite>>(test_suite_info);
         }
         break;
       }
@@ -714,7 +746,38 @@ class ParameterizedTestSuiteRegistry {
 
   TestSuiteInfoContainer test_suite_infos_;
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteRegistry);
+  ParameterizedTestSuiteRegistry(const ParameterizedTestSuiteRegistry&) =
+      delete;
+  ParameterizedTestSuiteRegistry& operator=(
+      const ParameterizedTestSuiteRegistry&) = delete;
+};
+
+// Keep track of what type-parameterized test suite are defined and
+// where as well as which are intatiated. This allows susequently
+// identifying suits that are defined but never used.
+class TypeParameterizedTestSuiteRegistry {
+ public:
+  // Add a suite definition
+  void RegisterTestSuite(const char* test_suite_name,
+                         CodeLocation code_location);
+
+  // Add an instantiation of a suit.
+  void RegisterInstantiation(const char* test_suite_name);
+
+  // For each suit repored as defined but not reported as instantiation,
+  // emit a test that reports that fact (configurably, as an error).
+  void CheckForInstantiations();
+
+ private:
+  struct TypeParameterizedTestSuiteInfo {
+    explicit TypeParameterizedTestSuiteInfo(CodeLocation c)
+        : code_location(c), instantiated(false) {}
+
+    CodeLocation code_location;
+    bool instantiated;
+  };
+
+  std::map<std::string, TypeParameterizedTestSuiteInfo> suites_;
 };
 
 }  // namespace internal
@@ -728,10 +791,15 @@ internal::ParamGenerator<typename Container::value_type> ValuesIn(
 namespace internal {
 // Used in the Values() function to provide polymorphic capabilities.
 
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4100)
+#endif
+
 template <typename... Ts>
 class ValueArray {
  public:
-  ValueArray(Ts... v) : v_{std::move(v)...} {}
+  explicit ValueArray(Ts... v) : v_(FlatTupleConstructTag{}, std::move(v)...) {}
 
   template <typename T>
   operator ParamGenerator<T>() const {  // NOLINT
@@ -747,6 +815,10 @@ class ValueArray {
   FlatTuple<Ts...> v_;
 };
 
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
 template <typename... T>
 class CartesianProductGenerator
     : public ParamGeneratorInterface<::std::tuple<T...>> {
@@ -772,7 +844,8 @@ class CartesianProductGenerator
       : public ParamIteratorInterface<ParamType> {
    public:
     IteratorImpl(const ParamGeneratorInterface<ParamType>* base,
-             const std::tuple<ParamGenerator<T>...>& generators, bool is_end)
+                 const std::tuple<ParamGenerator<T>...>& generators,
+                 bool is_end)
         : base_(base),
           begin_(std::get<I>(generators).begin()...),
           end_(std::get<I>(generators).end()...),
@@ -880,4 +953,4 @@ class CartesianProductHolder {
 }  // namespace internal
 }  // namespace testing
 
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
diff --git a/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h b/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h
index cece93dba1..f025db76ad 100644
--- a/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h
+++ b/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h
@@ -26,82 +26,91 @@
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+
 // The Google C++ Testing and Mocking Framework (Google Test)
 //
 // This header file defines the GTEST_OS_* macro.
 // It is separate from gtest-port.h so that custom/gtest-port.h can include it.
 
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_
 
 // Determines the platform on which Google Test is compiled.
 #ifdef __CYGWIN__
-# define GTEST_OS_CYGWIN 1
-# elif defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)
-#  define GTEST_OS_WINDOWS_MINGW 1
-#  define GTEST_OS_WINDOWS 1
+#define GTEST_OS_CYGWIN 1
+#elif defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)
+#define GTEST_OS_WINDOWS_MINGW 1
+#define GTEST_OS_WINDOWS 1
 #elif defined _WIN32
-# define GTEST_OS_WINDOWS 1
-# ifdef _WIN32_WCE
-#  define GTEST_OS_WINDOWS_MOBILE 1
-# elif defined(WINAPI_FAMILY)
-#  include <winapifamily.h>
-#  if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
-#   define GTEST_OS_WINDOWS_DESKTOP 1
-#  elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP)
-#   define GTEST_OS_WINDOWS_PHONE 1
-#  elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP)
-#   define GTEST_OS_WINDOWS_RT 1
-#  elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_TV_TITLE)
-#   define GTEST_OS_WINDOWS_PHONE 1
-#   define GTEST_OS_WINDOWS_TV_TITLE 1
-#  else
-    // WINAPI_FAMILY defined but no known partition matched.
-    // Default to desktop.
-#   define GTEST_OS_WINDOWS_DESKTOP 1
-#  endif
-# else
-#  define GTEST_OS_WINDOWS_DESKTOP 1
-# endif  // _WIN32_WCE
+#define GTEST_OS_WINDOWS 1
+#ifdef _WIN32_WCE
+#define GTEST_OS_WINDOWS_MOBILE 1
+#elif defined(WINAPI_FAMILY)
+#include <winapifamily.h>
+#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#define GTEST_OS_WINDOWS_DESKTOP 1
+#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP)
+#define GTEST_OS_WINDOWS_PHONE 1
+#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP)
+#define GTEST_OS_WINDOWS_RT 1
+#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_TV_TITLE)
+#define GTEST_OS_WINDOWS_PHONE 1
+#define GTEST_OS_WINDOWS_TV_TITLE 1
+#else
+// WINAPI_FAMILY defined but no known partition matched.
+// Default to desktop.
+#define GTEST_OS_WINDOWS_DESKTOP 1
+#endif
+#else
+#define GTEST_OS_WINDOWS_DESKTOP 1
+#endif  // _WIN32_WCE
 #elif defined __OS2__
-# define GTEST_OS_OS2 1
+#define GTEST_OS_OS2 1
 #elif defined __APPLE__
-# define GTEST_OS_MAC 1
-# if TARGET_OS_IPHONE
-#  define GTEST_OS_IOS 1
-# endif
+#define GTEST_OS_MAC 1
+#include <TargetConditionals.h>
+#if TARGET_OS_IPHONE
+#define GTEST_OS_IOS 1
+#endif
 #elif defined __DragonFly__
-# define GTEST_OS_DRAGONFLY 1
+#define GTEST_OS_DRAGONFLY 1
 #elif defined __FreeBSD__
-# define GTEST_OS_FREEBSD 1
+#define GTEST_OS_FREEBSD 1
 #elif defined __Fuchsia__
-# define GTEST_OS_FUCHSIA 1
+#define GTEST_OS_FUCHSIA 1
+#elif defined(__GNU__)
+#define GTEST_OS_GNU_HURD 1
 #elif defined(__GLIBC__) && defined(__FreeBSD_kernel__)
-# define GTEST_OS_GNU_KFREEBSD 1
+#define GTEST_OS_GNU_KFREEBSD 1
 #elif defined __linux__
-# define GTEST_OS_LINUX 1
-# if defined __ANDROID__
-#  define GTEST_OS_LINUX_ANDROID 1
-# endif
+#define GTEST_OS_LINUX 1
+#if defined __ANDROID__
+#define GTEST_OS_LINUX_ANDROID 1
+#endif
 #elif defined __MVS__
-# define GTEST_OS_ZOS 1
+#define GTEST_OS_ZOS 1
 #elif defined(__sun) && defined(__SVR4)
-# define GTEST_OS_SOLARIS 1
+#define GTEST_OS_SOLARIS 1
 #elif defined(_AIX)
-# define GTEST_OS_AIX 1
+#define GTEST_OS_AIX 1
 #elif defined(__hpux)
-# define GTEST_OS_HPUX 1
+#define GTEST_OS_HPUX 1
 #elif defined __native_client__
-# define GTEST_OS_NACL 1
+#define GTEST_OS_NACL 1
 #elif defined __NetBSD__
-# define GTEST_OS_NETBSD 1
+#define GTEST_OS_NETBSD 1
 #elif defined __OpenBSD__
-# define GTEST_OS_OPENBSD 1
+#define GTEST_OS_OPENBSD 1
 #elif defined __QNX__
-# define GTEST_OS_QNX 1
+#define GTEST_OS_QNX 1
 #elif defined(__HAIKU__)
 #define GTEST_OS_HAIKU 1
+#elif defined ESP8266
+#define GTEST_OS_ESP8266 1
+#elif defined ESP32
+#define GTEST_OS_ESP32 1
+#elif defined(__XTENSA__)
+#define GTEST_OS_XTENSA 1
 #endif  // __CYGWIN__
 
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_
diff --git a/third_party/googletest/src/include/gtest/internal/gtest-port.h b/third_party/googletest/src/include/gtest/internal/gtest-port.h
index 063fcb1083..0003d27658 100644
--- a/third_party/googletest/src/include/gtest/internal/gtest-port.h
+++ b/third_party/googletest/src/include/gtest/internal/gtest-port.h
@@ -26,7 +26,7 @@
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+
 // Low-level types and utilities for porting Google Test to various
 // platforms.  All macros ending with _ and symbols defined in an
 // internal namespace are subject to change without notice.  Code
@@ -38,10 +38,12 @@
 // files are expected to #include this.  Therefore, it cannot #include
 // any other Google Test header.
 
-// GOOGLETEST_CM0001 DO NOT DELETE
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
 
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
 
 // Environment-describing macros
 // -----------------------------
@@ -116,6 +118,7 @@
 //   GTEST_OS_DRAGONFLY - DragonFlyBSD
 //   GTEST_OS_FREEBSD  - FreeBSD
 //   GTEST_OS_FUCHSIA  - Fuchsia
+//   GTEST_OS_GNU_HURD - GNU/Hurd
 //   GTEST_OS_GNU_KFREEBSD - GNU/kFreeBSD
 //   GTEST_OS_HAIKU    - Haiku
 //   GTEST_OS_HPUX     - HP-UX
@@ -167,7 +170,7 @@
 //   GTEST_HAS_TYPED_TEST   - typed tests
 //   GTEST_HAS_TYPED_TEST_P - type-parameterized tests
 //   GTEST_IS_THREADSAFE    - Google Test is thread-safe.
-//   GOOGLETEST_CM0007 DO NOT DELETE
+//   GTEST_USES_RE2         - the RE2 regular expression library is used
 //   GTEST_USES_POSIX_RE    - enhanced POSIX regex is used. Do not confuse with
 //                            GTEST_HAS_POSIX_RE (see above) which users can
 //                            define themselves.
@@ -190,24 +193,36 @@
 //   GTEST_AMBIGUOUS_ELSE_BLOCKER_ - for disabling a gcc warning.
 //   GTEST_ATTRIBUTE_UNUSED_  - declares that a class' instances or a
 //                              variable don't have to be used.
-//   GTEST_DISALLOW_ASSIGN_   - disables operator=.
-//   GTEST_DISALLOW_COPY_AND_ASSIGN_ - disables copy ctor and operator=.
 //   GTEST_MUST_USE_RESULT_   - declares that a function's result must be used.
 //   GTEST_INTENTIONAL_CONST_COND_PUSH_ - start code section where MSVC C4127 is
 //                                        suppressed (constant conditional).
 //   GTEST_INTENTIONAL_CONST_COND_POP_  - finish code section where MSVC C4127
 //                                        is suppressed.
+//   GTEST_INTERNAL_HAS_ANY - for enabling UniversalPrinter<std::any> or
+//                            UniversalPrinter<absl::any> specializations.
+//   GTEST_INTERNAL_HAS_OPTIONAL - for enabling UniversalPrinter<std::optional>
+//   or
+//                                 UniversalPrinter<absl::optional>
+//                                 specializations.
+//   GTEST_INTERNAL_HAS_STRING_VIEW - for enabling Matcher<std::string_view> or
+//                                    Matcher<absl::string_view>
+//                                    specializations.
+//   GTEST_INTERNAL_HAS_VARIANT - for enabling UniversalPrinter<std::variant> or
+//                                UniversalPrinter<absl::variant>
+//                                specializations.
 //
 // Synchronization:
 //   Mutex, MutexLock, ThreadLocal, GetThreadCount()
 //                            - synchronization primitives.
 //
 // Regular expressions:
-//   RE             - a simple regular expression class using the POSIX
-//                    Extended Regular Expression syntax on UNIX-like platforms
-//                    GOOGLETEST_CM0008 DO NOT DELETE
-//                    or a reduced regular exception syntax on other
-//                    platforms, including Windows.
+//   RE             - a simple regular expression class using
+//                     1) the RE2 syntax on all platforms when built with RE2
+//                        and Abseil as dependencies
+//                     2) the POSIX Extended Regular Expression syntax on
+//                        UNIX-like platforms,
+//                     3) A reduced regular exception syntax on other platforms,
+//                        including Windows.
 // Logging:
 //   GTEST_LOG_()   - logs messages at the specified severity level.
 //   LogToStderr()  - directs all log messages to stderr.
@@ -223,19 +238,16 @@
 //
 // Integer types:
 //   TypeWithSize   - maps an integer to a int type.
-//   Int32, UInt32, Int64, UInt64, TimeInMillis
-//                  - integers of known sizes.
+//   TimeInMillis   - integers of known sizes.
 //   BiggestInt     - the biggest signed integer type.
 //
 // Command-line utilities:
-//   GTEST_DECLARE_*()  - declares a flag.
-//   GTEST_DEFINE_*()   - defines a flag.
 //   GetInjectableArgvs() - returns the command line as a vector of strings.
 //
 // Environment variable utilities:
 //   GetEnv()             - gets the value of an environment variable.
 //   BoolFromGTestEnv()   - parses a bool environment variable.
-//   Int32FromGTestEnv()  - parses an Int32 environment variable.
+//   Int32FromGTestEnv()  - parses an int32_t environment variable.
 //   StringFromGTestEnv() - parses a string environment variable.
 //
 // Deprecation warnings:
@@ -248,48 +260,57 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+
+#include <cerrno>
+// #include <condition_variable>  // Guarded by GTEST_IS_THREADSAFE below
+#include <cstdint>
+#include <iostream>
+#include <limits>
+#include <locale>
 #include <memory>
+#include <string>
+// #include <mutex>  // Guarded by GTEST_IS_THREADSAFE below
+#include <tuple>
 #include <type_traits>
+#include <vector>
 
 #ifndef _WIN32_WCE
-# include <sys/types.h>
-# include <sys/stat.h>
+#include <sys/stat.h>
+#include <sys/types.h>
 #endif  // !_WIN32_WCE
 
 #if defined __APPLE__
-# include <AvailabilityMacros.h>
-# include <TargetConditionals.h>
+#include <AvailabilityMacros.h>
+#include <TargetConditionals.h>
 #endif
 
-#include <algorithm>  // NOLINT
-#include <iostream>   // NOLINT
-#include <sstream>    // NOLINT
-#include <string>     // NOLINT
-#include <tuple>
-#include <utility>
-#include <vector>  // NOLINT
-
-#include "gtest/internal/gtest-port-arch.h"
 #include "gtest/internal/custom/gtest-port.h"
+#include "gtest/internal/gtest-port-arch.h"
+
+#if GTEST_HAS_ABSL
+#include "absl/flags/declare.h"
+#include "absl/flags/flag.h"
+#include "absl/flags/reflection.h"
+#endif
 
 #if !defined(GTEST_DEV_EMAIL_)
-# define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com"
-# define GTEST_FLAG_PREFIX_ "gtest_"
-# define GTEST_FLAG_PREFIX_DASH_ "gtest-"
-# define GTEST_FLAG_PREFIX_UPPER_ "GTEST_"
-# define GTEST_NAME_ "Google Test"
-# define GTEST_PROJECT_URL_ "https://github.com/google/googletest/"
+#define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com"
+#define GTEST_FLAG_PREFIX_ "gtest_"
+#define GTEST_FLAG_PREFIX_DASH_ "gtest-"
+#define GTEST_FLAG_PREFIX_UPPER_ "GTEST_"
+#define GTEST_NAME_ "Google Test"
+#define GTEST_PROJECT_URL_ "https://github.com/google/googletest/"
 #endif  // !defined(GTEST_DEV_EMAIL_)
 
 #if !defined(GTEST_INIT_GOOGLE_TEST_NAME_)
-# define GTEST_INIT_GOOGLE_TEST_NAME_ "testing::InitGoogleTest"
+#define GTEST_INIT_GOOGLE_TEST_NAME_ "testing::InitGoogleTest"
 #endif  // !defined(GTEST_INIT_GOOGLE_TEST_NAME_)
 
 // Determines the version of gcc that is used to compile this.
 #ifdef __GNUC__
 // 40302 means version 4.3.2.
-# define GTEST_GCC_VER_ \
-    (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
+#define GTEST_GCC_VER_ \
+  (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
 #endif  // __GNUC__
 
 // Macros for disabling Microsoft Visual C++ warnings.
@@ -298,41 +319,37 @@
 //   /* code that triggers warnings C4800 and C4385 */
 //   GTEST_DISABLE_MSC_WARNINGS_POP_()
 #if defined(_MSC_VER)
-# define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) \
-    __pragma(warning(push))                        \
-    __pragma(warning(disable: warnings))
-# define GTEST_DISABLE_MSC_WARNINGS_POP_()          \
-    __pragma(warning(pop))
+#define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) \
+  __pragma(warning(push)) __pragma(warning(disable : warnings))
+#define GTEST_DISABLE_MSC_WARNINGS_POP_() __pragma(warning(pop))
 #else
 // Not all compilers are MSVC
-# define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings)
-# define GTEST_DISABLE_MSC_WARNINGS_POP_()
+#define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings)
+#define GTEST_DISABLE_MSC_WARNINGS_POP_()
 #endif
 
 // Clang on Windows does not understand MSVC's pragma warning.
 // We need clang-specific way to disable function deprecation warning.
 #ifdef __clang__
-# define GTEST_DISABLE_MSC_DEPRECATED_PUSH_()                         \
-    _Pragma("clang diagnostic push")                                  \
-    _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") \
-    _Pragma("clang diagnostic ignored \"-Wdeprecated-implementations\"")
-#define GTEST_DISABLE_MSC_DEPRECATED_POP_() \
-    _Pragma("clang diagnostic pop")
+#define GTEST_DISABLE_MSC_DEPRECATED_PUSH_()                            \
+  _Pragma("clang diagnostic push")                                      \
+      _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") \
+          _Pragma("clang diagnostic ignored \"-Wdeprecated-implementations\"")
+#define GTEST_DISABLE_MSC_DEPRECATED_POP_() _Pragma("clang diagnostic pop")
 #else
-# define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \
-    GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996)
-# define GTEST_DISABLE_MSC_DEPRECATED_POP_() \
-    GTEST_DISABLE_MSC_WARNINGS_POP_()
+#define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \
+  GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996)
+#define GTEST_DISABLE_MSC_DEPRECATED_POP_() GTEST_DISABLE_MSC_WARNINGS_POP_()
 #endif
 
 // Brings in definitions for functions used in the testing::internal::posix
 // namespace (read, write, close, chdir, isatty, stat). We do not currently
 // use them on Windows Mobile.
 #if GTEST_OS_WINDOWS
-# if !GTEST_OS_WINDOWS_MOBILE
-#  include <direct.h>
-#  include <io.h>
-# endif
+#if !GTEST_OS_WINDOWS_MOBILE
+#include <direct.h>
+#include <io.h>
+#endif
 // In order to avoid having to include <windows.h>, use forward declaration
 #if GTEST_OS_WINDOWS_MINGW && !defined(__MINGW64_VERSION_MAJOR)
 // MinGW defined _CRITICAL_SECTION and _RTL_CRITICAL_SECTION as two
@@ -344,72 +361,63 @@ typedef struct _CRITICAL_SECTION GTEST_CRITICAL_SECTION;
 // WindowsTypesTest.CRITICAL_SECTIONIs_RTL_CRITICAL_SECTION.
 typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
 #endif
+#elif GTEST_OS_XTENSA
+#include <unistd.h>
+// Xtensa toolchains define strcasecmp in the string.h header instead of
+// strings.h. string.h is already included.
 #else
 // This assumes that non-Windows OSes provide unistd.h. For OSes where this
 // is not the case, we need to include headers that provide the functions
 // mentioned above.
-# include <unistd.h>
-# include <strings.h>
+#include <strings.h>
+#include <unistd.h>
 #endif  // GTEST_OS_WINDOWS
 
 #if GTEST_OS_LINUX_ANDROID
 // Used to define __ANDROID_API__ matching the target NDK API level.
-#  include <android/api-level.h>  // NOLINT
+#include <android/api-level.h>  // NOLINT
 #endif
 
 // Defines this to true if and only if Google Test can use POSIX regular
 // expressions.
 #ifndef GTEST_HAS_POSIX_RE
-# if GTEST_OS_LINUX_ANDROID
+#if GTEST_OS_LINUX_ANDROID
 // On Android, <regex.h> is only available starting with Gingerbread.
-#  define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9)
-# else
-#  define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS)
-# endif
+#define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9)
+#else
+#define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS && !GTEST_OS_XTENSA)
+#endif
 #endif
 
-#if GTEST_USES_PCRE
-// The appropriate headers have already been included.
-
+// Select the regular expression implementation.
+#if GTEST_HAS_ABSL
+// When using Abseil, RE2 is required.
+#include "absl/strings/string_view.h"
+#include "re2/re2.h"
+#define GTEST_USES_RE2 1
 #elif GTEST_HAS_POSIX_RE
-
-// On some platforms, <regex.h> needs someone to define size_t, and
-// won't compile otherwise.  We can #include it here as we already
-// included <stdlib.h>, which is guaranteed to define size_t through
-// <stddef.h>.
-# include <regex.h>  // NOLINT
-
-# define GTEST_USES_POSIX_RE 1
-
-#elif GTEST_OS_WINDOWS
-
-// <regex.h> is not available on Windows.  Use our own simple regex
-// implementation instead.
-# define GTEST_USES_SIMPLE_RE 1
-
+#include <regex.h>  // NOLINT
+#define GTEST_USES_POSIX_RE 1
 #else
-
-// <regex.h> may not be available on this platform.  Use our own
-// simple regex implementation instead.
-# define GTEST_USES_SIMPLE_RE 1
-
-#endif  // GTEST_USES_PCRE
+// Use our own simple regex implementation.
+#define GTEST_USES_SIMPLE_RE 1
+#endif
 
 #ifndef GTEST_HAS_EXCEPTIONS
 // The user didn't tell us whether exceptions are enabled, so we need
 // to figure it out.
-# if defined(_MSC_VER) && defined(_CPPUNWIND)
+#if defined(_MSC_VER) && defined(_CPPUNWIND)
 // MSVC defines _CPPUNWIND to 1 if and only if exceptions are enabled.
-#  define GTEST_HAS_EXCEPTIONS 1
-# elif defined(__BORLANDC__)
+#define GTEST_HAS_EXCEPTIONS 1
+#elif defined(__BORLANDC__)
 // C++Builder's implementation of the STL uses the _HAS_EXCEPTIONS
 // macro to enable exceptions, so we'll do the same.
 // Assumes that exceptions are enabled by default.
-#  ifndef _HAS_EXCEPTIONS
-#   define _HAS_EXCEPTIONS 1
-#  endif  // _HAS_EXCEPTIONS
-#  define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS
-# elif defined(__clang__)
+#ifndef _HAS_EXCEPTIONS
+#define _HAS_EXCEPTIONS 1
+#endif  // _HAS_EXCEPTIONS
+#define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS
+#elif defined(__clang__)
 // clang defines __EXCEPTIONS if and only if exceptions are enabled before clang
 // 220714, but if and only if cleanups are enabled after that. In Obj-C++ files,
 // there can be cleanups for ObjC exceptions which also need cleanups, even if
@@ -418,38 +426,29 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
 // cleanups prior to that. To reliably check for C++ exception availability with
 // clang, check for
 // __EXCEPTIONS && __has_feature(cxx_exceptions).
-#  define GTEST_HAS_EXCEPTIONS (__EXCEPTIONS && __has_feature(cxx_exceptions))
-# elif defined(__GNUC__) && __EXCEPTIONS
+#define GTEST_HAS_EXCEPTIONS (__EXCEPTIONS && __has_feature(cxx_exceptions))
+#elif defined(__GNUC__) && __EXCEPTIONS
 // gcc defines __EXCEPTIONS to 1 if and only if exceptions are enabled.
-#  define GTEST_HAS_EXCEPTIONS 1
-# elif defined(__SUNPRO_CC)
+#define GTEST_HAS_EXCEPTIONS 1
+#elif defined(__SUNPRO_CC)
 // Sun Pro CC supports exceptions.  However, there is no compile-time way of
 // detecting whether they are enabled or not.  Therefore, we assume that
 // they are enabled unless the user tells us otherwise.
-#  define GTEST_HAS_EXCEPTIONS 1
-# elif defined(__IBMCPP__) && __EXCEPTIONS
+#define GTEST_HAS_EXCEPTIONS 1
+#elif defined(__IBMCPP__) && __EXCEPTIONS
 // xlC defines __EXCEPTIONS to 1 if and only if exceptions are enabled.
-#  define GTEST_HAS_EXCEPTIONS 1
-# elif defined(__HP_aCC)
+#define GTEST_HAS_EXCEPTIONS 1
+#elif defined(__HP_aCC)
 // Exception handling is in effect by default in HP aCC compiler. It has to
 // be turned of by +noeh compiler option if desired.
-#  define GTEST_HAS_EXCEPTIONS 1
-# else
+#define GTEST_HAS_EXCEPTIONS 1
+#else
 // For other compilers, we assume exceptions are disabled to be
 // conservative.
-#  define GTEST_HAS_EXCEPTIONS 0
-# endif  // defined(_MSC_VER) || defined(__BORLANDC__)
+#define GTEST_HAS_EXCEPTIONS 0
+#endif  // defined(_MSC_VER) || defined(__BORLANDC__)
 #endif  // GTEST_HAS_EXCEPTIONS
 
-#if !defined(GTEST_HAS_STD_STRING)
-// Even though we don't use this macro any longer, we keep it in case
-// some clients still depend on it.
-# define GTEST_HAS_STD_STRING 1
-#elif !GTEST_HAS_STD_STRING
-// The user told us that ::std::string isn't available.
-# error "::std::string isn't available."
-#endif  // !defined(GTEST_HAS_STD_STRING)
-
 #ifndef GTEST_HAS_STD_WSTRING
 // The user didn't tell us whether ::std::wstring is available, so we need
 // to figure it out.
@@ -458,7 +457,7 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
 // no support for it at least as recent as Froyo (2.2).
 #define GTEST_HAS_STD_WSTRING                                         \
   (!(GTEST_OS_LINUX_ANDROID || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \
-     GTEST_OS_HAIKU))
+     GTEST_OS_HAIKU || GTEST_OS_ESP32 || GTEST_OS_ESP8266 || GTEST_OS_XTENSA))
 
 #endif  // GTEST_HAS_STD_WSTRING
 
@@ -467,63 +466,62 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
 // The user didn't tell us whether RTTI is enabled, so we need to
 // figure it out.
 
-# ifdef _MSC_VER
+#ifdef _MSC_VER
 
 #ifdef _CPPRTTI  // MSVC defines this macro if and only if RTTI is enabled.
-#   define GTEST_HAS_RTTI 1
-#  else
-#   define GTEST_HAS_RTTI 0
-#  endif
+#define GTEST_HAS_RTTI 1
+#else
+#define GTEST_HAS_RTTI 0
+#endif
 
 // Starting with version 4.3.2, gcc defines __GXX_RTTI if and only if RTTI is
 // enabled.
-# elif defined(__GNUC__)
+#elif defined(__GNUC__)
 
-#  ifdef __GXX_RTTI
+#ifdef __GXX_RTTI
 // When building against STLport with the Android NDK and with
 // -frtti -fno-exceptions, the build fails at link time with undefined
 // references to __cxa_bad_typeid. Note sure if STL or toolchain bug,
 // so disable RTTI when detected.
-#   if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && \
-       !defined(__EXCEPTIONS)
-#    define GTEST_HAS_RTTI 0
-#   else
-#    define GTEST_HAS_RTTI 1
-#   endif  // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS
-#  else
-#   define GTEST_HAS_RTTI 0
-#  endif  // __GXX_RTTI
+#if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && !defined(__EXCEPTIONS)
+#define GTEST_HAS_RTTI 0
+#else
+#define GTEST_HAS_RTTI 1
+#endif  // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS
+#else
+#define GTEST_HAS_RTTI 0
+#endif  // __GXX_RTTI
 
 // Clang defines __GXX_RTTI starting with version 3.0, but its manual recommends
 // using has_feature instead. has_feature(cxx_rtti) is supported since 2.7, the
 // first version with C++ support.
-# elif defined(__clang__)
+#elif defined(__clang__)
 
-#  define GTEST_HAS_RTTI __has_feature(cxx_rtti)
+#define GTEST_HAS_RTTI __has_feature(cxx_rtti)
 
 // Starting with version 9.0 IBM Visual Age defines __RTTI_ALL__ to 1 if
 // both the typeid and dynamic_cast features are present.
-# elif defined(__IBMCPP__) && (__IBMCPP__ >= 900)
+#elif defined(__IBMCPP__) && (__IBMCPP__ >= 900)
 
-#  ifdef __RTTI_ALL__
-#   define GTEST_HAS_RTTI 1
-#  else
-#   define GTEST_HAS_RTTI 0
-#  endif
+#ifdef __RTTI_ALL__
+#define GTEST_HAS_RTTI 1
+#else
+#define GTEST_HAS_RTTI 0
+#endif
 
-# else
+#else
 
 // For all other compilers, we assume RTTI is enabled.
-#  define GTEST_HAS_RTTI 1
+#define GTEST_HAS_RTTI 1
 
-# endif  // _MSC_VER
+#endif  // _MSC_VER
 
 #endif  // GTEST_HAS_RTTI
 
 // It's this header's responsibility to #include <typeinfo> when RTTI
 // is enabled.
 #if GTEST_HAS_RTTI
-# include <typeinfo>
+#include <typeinfo>
 #endif
 
 // Determines whether Google Test can use the pthreads library.
@@ -537,16 +535,16 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
   (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX || GTEST_OS_QNX ||          \
    GTEST_OS_FREEBSD || GTEST_OS_NACL || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA || \
    GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_OPENBSD ||          \
-   GTEST_OS_HAIKU)
+   GTEST_OS_HAIKU || GTEST_OS_GNU_HURD)
 #endif  // GTEST_HAS_PTHREAD
 
 #if GTEST_HAS_PTHREAD
 // gtest-port.h guarantees to #include <pthread.h> when GTEST_HAS_PTHREAD is
 // true.
-# include <pthread.h>  // NOLINT
+#include <pthread.h>  // NOLINT
 
 // For timespec and nanosleep, used below.
-# include <time.h>  // NOLINT
+#include <time.h>  // NOLINT
 #endif
 
 // Determines whether clone(2) is supported.
@@ -556,24 +554,23 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
 #ifndef GTEST_HAS_CLONE
 // The user didn't tell us, so we need to figure it out.
 
-# if GTEST_OS_LINUX && !defined(__ia64__)
-#  if GTEST_OS_LINUX_ANDROID
+#if GTEST_OS_LINUX && !defined(__ia64__)
+#if GTEST_OS_LINUX_ANDROID
 // On Android, clone() became available at different API levels for each 32-bit
 // architecture.
-#    if defined(__LP64__) || \
-        (defined(__arm__) && __ANDROID_API__ >= 9) || \
-        (defined(__mips__) && __ANDROID_API__ >= 12) || \
-        (defined(__i386__) && __ANDROID_API__ >= 17)
-#     define GTEST_HAS_CLONE 1
-#    else
-#     define GTEST_HAS_CLONE 0
-#    endif
-#  else
-#   define GTEST_HAS_CLONE 1
-#  endif
-# else
-#  define GTEST_HAS_CLONE 0
-# endif  // GTEST_OS_LINUX && !defined(__ia64__)
+#if defined(__LP64__) || (defined(__arm__) && __ANDROID_API__ >= 9) || \
+    (defined(__mips__) && __ANDROID_API__ >= 12) ||                    \
+    (defined(__i386__) && __ANDROID_API__ >= 17)
+#define GTEST_HAS_CLONE 1
+#else
+#define GTEST_HAS_CLONE 0
+#endif
+#else
+#define GTEST_HAS_CLONE 1
+#endif
+#else
+#define GTEST_HAS_CLONE 0
+#endif  // GTEST_OS_LINUX && !defined(__ia64__)
 
 #endif  // GTEST_HAS_CLONE
 
@@ -582,11 +579,12 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
 #ifndef GTEST_HAS_STREAM_REDIRECTION
 // By default, we assume that stream redirection is supported on all
 // platforms except known mobile ones.
-# if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT
-#  define GTEST_HAS_STREAM_REDIRECTION 0
-# else
-#  define GTEST_HAS_STREAM_REDIRECTION 1
-# endif  // !GTEST_OS_WINDOWS_MOBILE
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \
+    GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_XTENSA
+#define GTEST_HAS_STREAM_REDIRECTION 0
+#else
+#define GTEST_HAS_STREAM_REDIRECTION 1
+#endif  // !GTEST_OS_WINDOWS_MOBILE
 #endif  // GTEST_HAS_STREAM_REDIRECTION
 
 // Determines whether to support death tests.
@@ -596,8 +594,9 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
      (GTEST_OS_WINDOWS_DESKTOP && _MSC_VER) || GTEST_OS_WINDOWS_MINGW ||  \
      GTEST_OS_AIX || GTEST_OS_HPUX || GTEST_OS_OPENBSD || GTEST_OS_QNX || \
      GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA ||           \
-     GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_HAIKU)
-# define GTEST_HAS_DEATH_TEST 1
+     GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_HAIKU ||     \
+     GTEST_OS_GNU_HURD)
+#define GTEST_HAS_DEATH_TEST 1
 #endif
 
 // Determines whether to support type-driven tests.
@@ -606,8 +605,8 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
 // Sun Pro CC, IBM Visual Age, and HP aCC support.
 #if defined(__GNUC__) || defined(_MSC_VER) || defined(__SUNPRO_CC) || \
     defined(__IBMCPP__) || defined(__HP_aCC)
-# define GTEST_HAS_TYPED_TEST 1
-# define GTEST_HAS_TYPED_TEST_P 1
+#define GTEST_HAS_TYPED_TEST 1
+#define GTEST_HAS_TYPED_TEST_P 1
 #endif
 
 // Determines whether the system compiler uses UTF-16 for encoding wide strings.
@@ -616,8 +615,9 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
 
 // Determines whether test results can be streamed to a socket.
 #if GTEST_OS_LINUX || GTEST_OS_GNU_KFREEBSD || GTEST_OS_DRAGONFLY || \
-    GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_OPENBSD
-# define GTEST_CAN_STREAM_RESULTS_ 1
+    GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_OPENBSD ||       \
+    GTEST_OS_GNU_HURD
+#define GTEST_CAN_STREAM_RESULTS_ 1
 #endif
 
 // Defines some utility macros.
@@ -631,9 +631,12 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
 //
 // The "switch (0) case 0:" idiom is used to suppress this.
 #ifdef __INTEL_COMPILER
-# define GTEST_AMBIGUOUS_ELSE_BLOCKER_
+#define GTEST_AMBIGUOUS_ELSE_BLOCKER_
 #else
-# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ switch (0) case 0: default:  // NOLINT
+#define GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  switch (0)                          \
+  case 0:                             \
+  default:  // NOLINT
 #endif
 
 // Use this annotation at the end of a struct/class definition to
@@ -648,44 +651,32 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
 // Also use it after a variable or parameter declaration to tell the
 // compiler the variable/parameter does not have to be used.
 #if defined(__GNUC__) && !defined(COMPILER_ICC)
-# define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused))
+#define GTEST_ATTRIBUTE_UNUSED_ __attribute__((unused))
 #elif defined(__clang__)
-# if __has_attribute(unused)
-#  define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused))
-# endif
+#if __has_attribute(unused)
+#define GTEST_ATTRIBUTE_UNUSED_ __attribute__((unused))
+#endif
 #endif
 #ifndef GTEST_ATTRIBUTE_UNUSED_
-# define GTEST_ATTRIBUTE_UNUSED_
+#define GTEST_ATTRIBUTE_UNUSED_
 #endif
 
 // Use this annotation before a function that takes a printf format string.
 #if (defined(__GNUC__) || defined(__clang__)) && !defined(COMPILER_ICC)
-# if defined(__MINGW_PRINTF_FORMAT)
+#if defined(__MINGW_PRINTF_FORMAT)
 // MinGW has two different printf implementations. Ensure the format macro
 // matches the selected implementation. See
 // https://sourceforge.net/p/mingw-w64/wiki2/gnu%20printf/.
-#  define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \
-       __attribute__((__format__(__MINGW_PRINTF_FORMAT, string_index, \
-                                 first_to_check)))
-# else
-#  define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \
-       __attribute__((__format__(__printf__, string_index, first_to_check)))
-# endif
+#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \
+  __attribute__((                                             \
+      __format__(__MINGW_PRINTF_FORMAT, string_index, first_to_check)))
 #else
-# define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check)
+#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \
+  __attribute__((__format__(__printf__, string_index, first_to_check)))
+#endif
+#else
+#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check)
 #endif
-
-
-// A macro to disallow operator=
-// This should be used in the private: declarations for a class.
-#define GTEST_DISALLOW_ASSIGN_(type) \
-  void operator=(type const &) = delete
-
-// A macro to disallow copy constructor and operator=
-// This should be used in the private: declarations for a class.
-#define GTEST_DISALLOW_COPY_AND_ASSIGN_(type) \
-  type(type const &) = delete; \
-  GTEST_DISALLOW_ASSIGN_(type)
 
 // Tell the compiler to warn about unused return values for functions declared
 // with this macro.  The macro should be used on function declarations
@@ -693,9 +684,9 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
 //
 //   Sprocket* AllocateSprocket() GTEST_MUST_USE_RESULT_;
 #if defined(__GNUC__) && !defined(COMPILER_ICC)
-# define GTEST_MUST_USE_RESULT_ __attribute__ ((warn_unused_result))
+#define GTEST_MUST_USE_RESULT_ __attribute__((warn_unused_result))
 #else
-# define GTEST_MUST_USE_RESULT_
+#define GTEST_MUST_USE_RESULT_
 #endif  // __GNUC__ && !COMPILER_ICC
 
 // MS C++ compiler emits warning when a conditional expression is compile time
@@ -706,10 +697,9 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
 // while (true) {
 // GTEST_INTENTIONAL_CONST_COND_POP_()
 // }
-# define GTEST_INTENTIONAL_CONST_COND_PUSH_() \
-    GTEST_DISABLE_MSC_WARNINGS_PUSH_(4127)
-# define GTEST_INTENTIONAL_CONST_COND_POP_() \
-    GTEST_DISABLE_MSC_WARNINGS_POP_()
+#define GTEST_INTENTIONAL_CONST_COND_PUSH_() \
+  GTEST_DISABLE_MSC_WARNINGS_PUSH_(4127)
+#define GTEST_INTENTIONAL_CONST_COND_POP_() GTEST_DISABLE_MSC_WARNINGS_POP_()
 
 // Determine whether the compiler supports Microsoft's Structured Exception
 // Handling.  This is supported by several Windows compilers but generally
@@ -717,13 +707,13 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
 #ifndef GTEST_HAS_SEH
 // The user didn't tell us, so we need to figure it out.
 
-# if defined(_MSC_VER) || defined(__BORLANDC__)
+#if defined(_MSC_VER) || defined(__BORLANDC__)
 // These two compilers are known to support SEH.
-#  define GTEST_HAS_SEH 1
-# else
+#define GTEST_HAS_SEH 1
+#else
 // Assume no SEH.
-#  define GTEST_HAS_SEH 0
-# endif
+#define GTEST_HAS_SEH 0
+#endif
 
 #endif  // GTEST_HAS_SEH
 
@@ -736,94 +726,112 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
 
 #endif  // GTEST_IS_THREADSAFE
 
+#if GTEST_IS_THREADSAFE
+// Some platforms don't support including these threading related headers.
+#include <condition_variable>  // NOLINT
+#include <mutex>               // NOLINT
+#endif                         // GTEST_IS_THREADSAFE
+
 // GTEST_API_ qualifies all symbols that must be exported. The definitions below
 // are guarded by #ifndef to give embedders a chance to define GTEST_API_ in
 // gtest/internal/custom/gtest-port.h
 #ifndef GTEST_API_
 
 #ifdef _MSC_VER
-# if GTEST_LINKED_AS_SHARED_LIBRARY
-#  define GTEST_API_ __declspec(dllimport)
-# elif GTEST_CREATE_SHARED_LIBRARY
-#  define GTEST_API_ __declspec(dllexport)
-# endif
+#if GTEST_LINKED_AS_SHARED_LIBRARY
+#define GTEST_API_ __declspec(dllimport)
+#elif GTEST_CREATE_SHARED_LIBRARY
+#define GTEST_API_ __declspec(dllexport)
+#endif
 #elif __GNUC__ >= 4 || defined(__clang__)
-# define GTEST_API_ __attribute__((visibility ("default")))
+#define GTEST_API_ __attribute__((visibility("default")))
 #endif  // _MSC_VER
 
 #endif  // GTEST_API_
 
 #ifndef GTEST_API_
-# define GTEST_API_
+#define GTEST_API_
 #endif  // GTEST_API_
 
 #ifndef GTEST_DEFAULT_DEATH_TEST_STYLE
-# define GTEST_DEFAULT_DEATH_TEST_STYLE  "fast"
+#define GTEST_DEFAULT_DEATH_TEST_STYLE "fast"
 #endif  // GTEST_DEFAULT_DEATH_TEST_STYLE
 
 #ifdef __GNUC__
 // Ask the compiler to never inline a given function.
-# define GTEST_NO_INLINE_ __attribute__((noinline))
+#define GTEST_NO_INLINE_ __attribute__((noinline))
+#else
+#define GTEST_NO_INLINE_
+#endif
+
+#if defined(__clang__)
+// Nested ifs to avoid triggering MSVC warning.
+#if __has_attribute(disable_tail_calls)
+// Ask the compiler not to perform tail call optimization inside
+// the marked function.
+#define GTEST_NO_TAIL_CALL_ __attribute__((disable_tail_calls))
+#endif
+#elif __GNUC__
+#define GTEST_NO_TAIL_CALL_ \
+  __attribute__((optimize("no-optimize-sibling-calls")))
 #else
-# define GTEST_NO_INLINE_
+#define GTEST_NO_TAIL_CALL_
 #endif
 
 // _LIBCPP_VERSION is defined by the libc++ library from the LLVM project.
 #if !defined(GTEST_HAS_CXXABI_H_)
-# if defined(__GLIBCXX__) || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER))
-#  define GTEST_HAS_CXXABI_H_ 1
-# else
-#  define GTEST_HAS_CXXABI_H_ 0
-# endif
+#if defined(__GLIBCXX__) || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER))
+#define GTEST_HAS_CXXABI_H_ 1
+#else
+#define GTEST_HAS_CXXABI_H_ 0
+#endif
 #endif
 
 // A function level attribute to disable checking for use of uninitialized
 // memory when built with MemorySanitizer.
 #if defined(__clang__)
-# if __has_feature(memory_sanitizer)
-#  define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ \
-       __attribute__((no_sanitize_memory))
-# else
-#  define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
-# endif  // __has_feature(memory_sanitizer)
+#if __has_feature(memory_sanitizer)
+#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ __attribute__((no_sanitize_memory))
+#else
+#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
+#endif  // __has_feature(memory_sanitizer)
 #else
-# define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
+#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
 #endif  // __clang__
 
 // A function level attribute to disable AddressSanitizer instrumentation.
 #if defined(__clang__)
-# if __has_feature(address_sanitizer)
-#  define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ \
-       __attribute__((no_sanitize_address))
-# else
-#  define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
-# endif  // __has_feature(address_sanitizer)
+#if __has_feature(address_sanitizer)
+#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ \
+  __attribute__((no_sanitize_address))
 #else
-# define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+#endif  // __has_feature(address_sanitizer)
+#else
+#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
 #endif  // __clang__
 
 // A function level attribute to disable HWAddressSanitizer instrumentation.
 #if defined(__clang__)
-# if __has_feature(hwaddress_sanitizer)
-#  define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ \
-       __attribute__((no_sanitize("hwaddress")))
-# else
-#  define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
-# endif  // __has_feature(hwaddress_sanitizer)
+#if __has_feature(hwaddress_sanitizer)
+#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ \
+  __attribute__((no_sanitize("hwaddress")))
+#else
+#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+#endif  // __has_feature(hwaddress_sanitizer)
 #else
-# define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
 #endif  // __clang__
 
 // A function level attribute to disable ThreadSanitizer instrumentation.
 #if defined(__clang__)
-# if __has_feature(thread_sanitizer)
-#  define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ \
-       __attribute__((no_sanitize_thread))
-# else
-#  define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
-# endif  // __has_feature(thread_sanitizer)
+#if __has_feature(thread_sanitizer)
+#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ __attribute__((no_sanitize_thread))
 #else
-# define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
+#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
+#endif  // __has_feature(thread_sanitizer)
+#else
+#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
 #endif  // __clang__
 
 namespace testing {
@@ -845,28 +853,37 @@ namespace internal {
 // Secret object, which is what we want.
 class Secret;
 
-// The GTEST_COMPILE_ASSERT_ is a legacy macro used to verify that a compile
-// time expression is true (in new code, use static_assert instead). For
-// example, you could use it to verify the size of a static array:
-//
-//   GTEST_COMPILE_ASSERT_(GTEST_ARRAY_SIZE_(names) == NUM_NAMES,
-//                         names_incorrect_size);
-//
-// The second argument to the macro must be a valid C++ identifier. If the
-// expression is false, compiler will issue an error containing this identifier.
-#define GTEST_COMPILE_ASSERT_(expr, msg) static_assert(expr, #msg)
-
-// Evaluates to the number of elements in 'array'.
-#define GTEST_ARRAY_SIZE_(array) (sizeof(array) / sizeof(array[0]))
-
 // A helper for suppressing warnings on constant condition.  It just
 // returns 'condition'.
 GTEST_API_ bool IsTrue(bool condition);
 
 // Defines RE.
 
-#if GTEST_USES_PCRE
-// if used, PCRE is injected by custom/gtest-port.h
+#if GTEST_USES_RE2
+
+// This is almost `using RE = ::RE2`, except it is copy-constructible, and it
+// needs to disambiguate the `std::string`, `absl::string_view`, and `const
+// char*` constructors.
+class GTEST_API_ RE {
+ public:
+  RE(absl::string_view regex) : regex_(regex) {}                  // NOLINT
+  RE(const char* regex) : RE(absl::string_view(regex)) {}         // NOLINT
+  RE(const std::string& regex) : RE(absl::string_view(regex)) {}  // NOLINT
+  RE(const RE& other) : RE(other.pattern()) {}
+
+  const std::string& pattern() const { return regex_.pattern(); }
+
+  static bool FullMatch(absl::string_view str, const RE& re) {
+    return RE2::FullMatch(str, re.regex_);
+  }
+  static bool PartialMatch(absl::string_view str, const RE& re) {
+    return RE2::PartialMatch(str, re.regex_);
+  }
+
+ private:
+  RE2 regex_;
+};
+
 #elif GTEST_USES_POSIX_RE || GTEST_USES_SIMPLE_RE
 
 // A simple C++ wrapper for <regex.h>.  It uses the POSIX Extended
@@ -905,21 +922,19 @@ class GTEST_API_ RE {
   const char* pattern_;
   bool is_valid_;
 
-# if GTEST_USES_POSIX_RE
+#if GTEST_USES_POSIX_RE
 
   regex_t full_regex_;     // For FullMatch().
   regex_t partial_regex_;  // For PartialMatch().
 
-# else  // GTEST_USES_SIMPLE_RE
+#else  // GTEST_USES_SIMPLE_RE
 
   const char* full_pattern_;  // For FullMatch();
 
-# endif
-
-  GTEST_DISALLOW_ASSIGN_(RE);
+#endif
 };
 
-#endif  // GTEST_USES_PCRE
+#endif  // ::testing::internal::RE implementation
 
 // Formats a source file path and a line number as they would appear
 // in an error message from the compiler used to compile this code.
@@ -937,12 +952,7 @@ GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file,
 //   LogToStderr()  - directs all log messages to stderr.
 //   FlushInfoLog() - flushes informational log messages.
 
-enum GTestLogSeverity {
-  GTEST_INFO,
-  GTEST_WARNING,
-  GTEST_ERROR,
-  GTEST_FATAL
-};
+enum GTestLogSeverity { GTEST_INFO, GTEST_WARNING, GTEST_ERROR, GTEST_FATAL };
 
 // Formats log entry severity, provides a stream object for streaming the
 // log message, and terminates the message with a newline when going out of
@@ -959,14 +969,16 @@ class GTEST_API_ GTestLog {
  private:
   const GTestLogSeverity severity_;
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestLog);
+  GTestLog(const GTestLog&) = delete;
+  GTestLog& operator=(const GTestLog&) = delete;
 };
 
 #if !defined(GTEST_LOG_)
 
-# define GTEST_LOG_(severity) \
-    ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \
-                                  __FILE__, __LINE__).GetStream()
+#define GTEST_LOG_(severity)                                           \
+  ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \
+                                __FILE__, __LINE__)                    \
+      .GetStream()
 
 inline void LogToStderr() {}
 inline void FlushInfoLog() { fflush(nullptr); }
@@ -978,7 +990,7 @@ inline void FlushInfoLog() { fflush(nullptr); }
 //
 // GTEST_CHECK_ is an all-mode assert. It aborts the program if the condition
 // is not satisfied.
-//  Synopsys:
+//  Synopsis:
 //    GTEST_CHECK_(boolean_condition);
 //     or
 //    GTEST_CHECK_(boolean_condition) << "Additional message";
@@ -988,12 +1000,12 @@ inline void FlushInfoLog() { fflush(nullptr); }
 //    condition itself, plus additional message streamed into it, if any,
 //    and then it aborts the program. It aborts the program irrespective of
 //    whether it is built in the debug mode or not.
-# define GTEST_CHECK_(condition) \
-    GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-    if (::testing::internal::IsTrue(condition)) \
-      ; \
-    else \
-      GTEST_LOG_(FATAL) << "Condition " #condition " failed. "
+#define GTEST_CHECK_(condition)               \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_               \
+  if (::testing::internal::IsTrue(condition)) \
+    ;                                         \
+  else                                        \
+    GTEST_LOG_(FATAL) << "Condition " #condition " failed. "
 #endif  // !defined(GTEST_CHECK_)
 
 // An all-mode assert to verify that the given POSIX-style function
@@ -1002,9 +1014,8 @@ inline void FlushInfoLog() { fflush(nullptr); }
 // in {} if you need to use it as the only statement in an 'if'
 // branch.
 #define GTEST_CHECK_POSIX_SUCCESS_(posix_call) \
-  if (const int gtest_error = (posix_call)) \
-    GTEST_LOG_(FATAL) << #posix_call << "failed with error " \
-                      << gtest_error
+  if (const int gtest_error = (posix_call))    \
+  GTEST_LOG_(FATAL) << #posix_call << "failed with error " << gtest_error
 
 // Transforms "T" into "const T&" according to standard reference collapsing
 // rules (this is only needed as a backport for C++98 compilers that do not
@@ -1018,9 +1029,13 @@ inline void FlushInfoLog() { fflush(nullptr); }
 // Note that the non-const reference will not have "const" added. This is
 // standard, and necessary so that "T" can always bind to "const T&".
 template <typename T>
-struct ConstRef { typedef const T& type; };
+struct ConstRef {
+  typedef const T& type;
+};
 template <typename T>
-struct ConstRef<T&> { typedef T& type; };
+struct ConstRef<T&> {
+  typedef T& type;
+};
 
 // The argument T must depend on some template parameters.
 #define GTEST_REFERENCE_TO_CONST_(T) \
@@ -1033,7 +1048,7 @@ struct ConstRef<T&> { typedef T& type; };
 // const Foo*).  When you use ImplicitCast_, the compiler checks that
 // the cast is safe.  Such explicit ImplicitCast_s are necessary in
 // surprisingly many situations where C++ demands an exact type match
-// instead of an argument type convertable to a target type.
+// instead of an argument type convertible to a target type.
 //
 // The syntax for using ImplicitCast_ is the same as for static_cast:
 //
@@ -1046,8 +1061,10 @@ struct ConstRef<T&> { typedef T& type; };
 // This relatively ugly name is intentional. It prevents clashes with
 // similar functions users may have (e.g., implicit_cast). The internal
 // namespace alone is not enough because the function can be found by ADL.
-template<typename To>
-inline To ImplicitCast_(To x) { return x; }
+template <typename To>
+inline To ImplicitCast_(To x) {
+  return x;
+}
 
 // When you upcast (that is, cast a pointer from type Foo to type
 // SuperclassOfFoo), it's fine to use ImplicitCast_<>, since upcasts
@@ -1070,17 +1087,17 @@ inline To ImplicitCast_(To x) { return x; }
 // This relatively ugly name is intentional. It prevents clashes with
 // similar functions users may have (e.g., down_cast). The internal
 // namespace alone is not enough because the function can be found by ADL.
-template<typename To, typename From>  // use like this: DownCast_<T*>(foo);
-inline To DownCast_(From* f) {  // so we only accept pointers
+template <typename To, typename From>  // use like this: DownCast_<T*>(foo);
+inline To DownCast_(From* f) {         // so we only accept pointers
   // Ensures that To is a sub-type of From *.  This test is here only
   // for compile-time type checking, and has no overhead in an
   // optimized build at run-time, as it will be optimized away
   // completely.
   GTEST_INTENTIONAL_CONST_COND_PUSH_()
   if (false) {
-  GTEST_INTENTIONAL_CONST_COND_POP_()
-  const To to = nullptr;
-  ::testing::internal::ImplicitCast_<From*>(to);
+    GTEST_INTENTIONAL_CONST_COND_POP_()
+    const To to = nullptr;
+    ::testing::internal::ImplicitCast_<From*>(to);
   }
 
 #if GTEST_HAS_RTTI
@@ -1145,71 +1162,8 @@ void ClearInjectableArgvs();
 
 // Defines synchronization primitives.
 #if GTEST_IS_THREADSAFE
-# if GTEST_HAS_PTHREAD
-// Sleeps for (roughly) n milliseconds.  This function is only for testing
-// Google Test's own constructs.  Don't use it in user tests, either
-// directly or indirectly.
-inline void SleepMilliseconds(int n) {
-  const timespec time = {
-    0,                  // 0 seconds.
-    n * 1000L * 1000L,  // And n ms.
-  };
-  nanosleep(&time, nullptr);
-}
-# endif  // GTEST_HAS_PTHREAD
-
-# if GTEST_HAS_NOTIFICATION_
-// Notification has already been imported into the namespace.
-// Nothing to do here.
-
-# elif GTEST_HAS_PTHREAD
-// Allows a controller thread to pause execution of newly created
-// threads until notified.  Instances of this class must be created
-// and destroyed in the controller thread.
-//
-// This class is only for testing Google Test's own constructs. Do not
-// use it in user tests, either directly or indirectly.
-class Notification {
- public:
-  Notification() : notified_(false) {
-    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, nullptr));
-  }
-  ~Notification() {
-    pthread_mutex_destroy(&mutex_);
-  }
-
-  // Notifies all threads created with this notification to start. Must
-  // be called from the controller thread.
-  void Notify() {
-    pthread_mutex_lock(&mutex_);
-    notified_ = true;
-    pthread_mutex_unlock(&mutex_);
-  }
-
-  // Blocks until the controller thread notifies. Must be called from a test
-  // thread.
-  void WaitForNotification() {
-    for (;;) {
-      pthread_mutex_lock(&mutex_);
-      const bool notified = notified_;
-      pthread_mutex_unlock(&mutex_);
-      if (notified)
-        break;
-      SleepMilliseconds(10);
-    }
-  }
-
- private:
-  pthread_mutex_t mutex_;
-  bool notified_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification);
-};
-
-# elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
-
-GTEST_API_ void SleepMilliseconds(int n);
 
+#if GTEST_OS_WINDOWS
 // Provides leak-safe Windows kernel handle ownership.
 // Used in death tests and in threading support.
 class GTEST_API_ AutoHandle {
@@ -1236,8 +1190,18 @@ class GTEST_API_ AutoHandle {
 
   Handle handle_;
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(AutoHandle);
+  AutoHandle(const AutoHandle&) = delete;
+  AutoHandle& operator=(const AutoHandle&) = delete;
 };
+#endif
+
+#if GTEST_HAS_NOTIFICATION_
+// Notification has already been imported into the namespace.
+// Nothing to do here.
+
+#else
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
 
 // Allows a controller thread to pause execution of newly created
 // threads until notified.  Instances of this class must be created
@@ -1245,23 +1209,40 @@ class GTEST_API_ AutoHandle {
 //
 // This class is only for testing Google Test's own constructs. Do not
 // use it in user tests, either directly or indirectly.
+// TODO(b/203539622): Replace unconditionally with absl::Notification.
 class GTEST_API_ Notification {
  public:
-  Notification();
-  void Notify();
-  void WaitForNotification();
+  Notification() : notified_(false) {}
+  Notification(const Notification&) = delete;
+  Notification& operator=(const Notification&) = delete;
 
- private:
-  AutoHandle event_;
+  // Notifies all threads created with this notification to start. Must
+  // be called from the controller thread.
+  void Notify() {
+    std::lock_guard<std::mutex> lock(mu_);
+    notified_ = true;
+    cv_.notify_all();
+  }
+
+  // Blocks until the controller thread notifies. Must be called from a test
+  // thread.
+  void WaitForNotification() {
+    std::unique_lock<std::mutex> lock(mu_);
+    cv_.wait(lock, [this]() { return notified_; });
+  }
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification);
+ private:
+  std::mutex mu_;
+  std::condition_variable cv_;
+  bool notified_;
 };
-# endif  // GTEST_HAS_NOTIFICATION_
+GTEST_DISABLE_MSC_WARNINGS_POP_()  // 4251
+#endif  // GTEST_HAS_NOTIFICATION_
 
 // On MinGW, we can have both GTEST_OS_WINDOWS and GTEST_HAS_PTHREAD
 // defined, but we don't want to use MinGW's pthreads implementation, which
 // has conformance problems with some versions of the POSIX standard.
-# if GTEST_HAS_PTHREAD && !GTEST_OS_WINDOWS_MINGW
+#if GTEST_HAS_PTHREAD && !GTEST_OS_WINDOWS_MINGW
 
 // As a C-function, ThreadFuncWithCLinkage cannot be templated itself.
 // Consequently, it cannot select a correct instantiation of ThreadWithParam
@@ -1337,16 +1318,17 @@ class ThreadWithParam : public ThreadWithParamBase {
                    // finished.
   pthread_t thread_;  // The native thread object.
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam);
+  ThreadWithParam(const ThreadWithParam&) = delete;
+  ThreadWithParam& operator=(const ThreadWithParam&) = delete;
 };
-# endif  // !GTEST_OS_WINDOWS && GTEST_HAS_PTHREAD ||
-         // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
+#endif  // !GTEST_OS_WINDOWS && GTEST_HAS_PTHREAD ||
+        // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
 
-# if GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
+#if GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
 // Mutex and ThreadLocal have already been imported into the namespace.
 // Nothing to do here.
 
-# elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+#elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
 
 // Mutex implements mutex on Windows platforms.  It is used in conjunction
 // with class MutexLock:
@@ -1400,14 +1382,15 @@ class GTEST_API_ Mutex {
   long critical_section_init_phase_;  // NOLINT
   GTEST_CRITICAL_SECTION* critical_section_;
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex);
+  Mutex(const Mutex&) = delete;
+  Mutex& operator=(const Mutex&) = delete;
 };
 
-# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
-    extern ::testing::internal::Mutex mutex
+#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+  extern ::testing::internal::Mutex mutex
 
-# define GTEST_DEFINE_STATIC_MUTEX_(mutex) \
-    ::testing::internal::Mutex mutex(::testing::internal::Mutex::kStaticMutex)
+#define GTEST_DEFINE_STATIC_MUTEX_(mutex) \
+  ::testing::internal::Mutex mutex(::testing::internal::Mutex::kStaticMutex)
 
 // We cannot name this class MutexLock because the ctor declaration would
 // conflict with a macro named MutexLock, which is defined on some
@@ -1416,15 +1399,15 @@ class GTEST_API_ Mutex {
 // "MutexLock l(&mu)".  Hence the typedef trick below.
 class GTestMutexLock {
  public:
-  explicit GTestMutexLock(Mutex* mutex)
-      : mutex_(mutex) { mutex_->Lock(); }
+  explicit GTestMutexLock(Mutex* mutex) : mutex_(mutex) { mutex_->Lock(); }
 
   ~GTestMutexLock() { mutex_->Unlock(); }
 
  private:
   Mutex* const mutex_;
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock);
+  GTestMutexLock(const GTestMutexLock&) = delete;
+  GTestMutexLock& operator=(const GTestMutexLock&) = delete;
 };
 
 typedef GTestMutexLock MutexLock;
@@ -1451,7 +1434,8 @@ class ThreadLocalBase {
   virtual ~ThreadLocalBase() {}
 
  private:
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocalBase);
+  ThreadLocalBase(const ThreadLocalBase&) = delete;
+  ThreadLocalBase& operator=(const ThreadLocalBase&) = delete;
 };
 
 // Maps a thread to a set of ThreadLocals that have values instantiated on that
@@ -1480,7 +1464,7 @@ class GTEST_API_ ThreadWithParamBase {
     virtual void Run() = 0;
   };
 
-  ThreadWithParamBase(Runnable *runnable, Notification* thread_can_start);
+  ThreadWithParamBase(Runnable* runnable, Notification* thread_can_start);
   virtual ~ThreadWithParamBase();
 
  private:
@@ -1494,30 +1478,26 @@ class ThreadWithParam : public ThreadWithParamBase {
   typedef void UserThreadFunc(T);
 
   ThreadWithParam(UserThreadFunc* func, T param, Notification* thread_can_start)
-      : ThreadWithParamBase(new RunnableImpl(func, param), thread_can_start) {
-  }
+      : ThreadWithParamBase(new RunnableImpl(func, param), thread_can_start) {}
   virtual ~ThreadWithParam() {}
 
  private:
   class RunnableImpl : public Runnable {
    public:
-    RunnableImpl(UserThreadFunc* func, T param)
-        : func_(func),
-          param_(param) {
-    }
+    RunnableImpl(UserThreadFunc* func, T param) : func_(func), param_(param) {}
     virtual ~RunnableImpl() {}
-    virtual void Run() {
-      func_(param_);
-    }
+    virtual void Run() { func_(param_); }
 
    private:
     UserThreadFunc* const func_;
     const T param_;
 
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(RunnableImpl);
+    RunnableImpl(const RunnableImpl&) = delete;
+    RunnableImpl& operator=(const RunnableImpl&) = delete;
   };
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam);
+  ThreadWithParam(const ThreadWithParam&) = delete;
+  ThreadWithParam& operator=(const ThreadWithParam&) = delete;
 };
 
 // Implements thread-local storage on Windows systems.
@@ -1554,7 +1534,7 @@ class ThreadLocal : public ThreadLocalBase {
   explicit ThreadLocal(const T& value)
       : default_factory_(new InstanceValueHolderFactory(value)) {}
 
-  ~ThreadLocal() { ThreadLocalRegistry::OnThreadLocalDestroyed(this); }
+  ~ThreadLocal() override { ThreadLocalRegistry::OnThreadLocalDestroyed(this); }
 
   T* pointer() { return GetOrCreateValue(); }
   const T* pointer() const { return GetOrCreateValue(); }
@@ -1573,16 +1553,17 @@ class ThreadLocal : public ThreadLocalBase {
 
    private:
     T value_;
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder);
+    ValueHolder(const ValueHolder&) = delete;
+    ValueHolder& operator=(const ValueHolder&) = delete;
   };
 
-
   T* GetOrCreateValue() const {
     return static_cast<ValueHolder*>(
-        ThreadLocalRegistry::GetValueOnCurrentThread(this))->pointer();
+               ThreadLocalRegistry::GetValueOnCurrentThread(this))
+        ->pointer();
   }
 
-  virtual ThreadLocalValueHolderBase* NewValueForCurrentThread() const {
+  ThreadLocalValueHolderBase* NewValueForCurrentThread() const override {
     return default_factory_->MakeNewHolder();
   }
 
@@ -1593,37 +1574,43 @@ class ThreadLocal : public ThreadLocalBase {
     virtual ValueHolder* MakeNewHolder() const = 0;
 
    private:
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolderFactory);
+    ValueHolderFactory(const ValueHolderFactory&) = delete;
+    ValueHolderFactory& operator=(const ValueHolderFactory&) = delete;
   };
 
   class DefaultValueHolderFactory : public ValueHolderFactory {
    public:
     DefaultValueHolderFactory() {}
-    virtual ValueHolder* MakeNewHolder() const { return new ValueHolder(); }
+    ValueHolder* MakeNewHolder() const override { return new ValueHolder(); }
 
    private:
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultValueHolderFactory);
+    DefaultValueHolderFactory(const DefaultValueHolderFactory&) = delete;
+    DefaultValueHolderFactory& operator=(const DefaultValueHolderFactory&) =
+        delete;
   };
 
   class InstanceValueHolderFactory : public ValueHolderFactory {
    public:
     explicit InstanceValueHolderFactory(const T& value) : value_(value) {}
-    virtual ValueHolder* MakeNewHolder() const {
+    ValueHolder* MakeNewHolder() const override {
       return new ValueHolder(value_);
     }
 
    private:
     const T value_;  // The value for each thread.
 
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(InstanceValueHolderFactory);
+    InstanceValueHolderFactory(const InstanceValueHolderFactory&) = delete;
+    InstanceValueHolderFactory& operator=(const InstanceValueHolderFactory&) =
+        delete;
   };
 
   std::unique_ptr<ValueHolderFactory> default_factory_;
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal);
+  ThreadLocal(const ThreadLocal&) = delete;
+  ThreadLocal& operator=(const ThreadLocal&) = delete;
 };
 
-# elif GTEST_HAS_PTHREAD
+#elif GTEST_HAS_PTHREAD
 
 // MutexBase and Mutex implement mutex on pthreads-based platforms.
 class MutexBase {
@@ -1670,8 +1657,8 @@ class MutexBase {
 };
 
 // Forward-declares a static mutex.
-#  define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
-     extern ::testing::internal::MutexBase mutex
+#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+  extern ::testing::internal::MutexBase mutex
 
 // Defines and statically (i.e. at link time) initializes a static mutex.
 // The initialization list here does not explicitly initialize each field,
@@ -1690,12 +1677,11 @@ class Mutex : public MutexBase {
     GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, nullptr));
     has_owner_ = false;
   }
-  ~Mutex() {
-    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_));
-  }
+  ~Mutex() { GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_)); }
 
  private:
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex);
+  Mutex(const Mutex&) = delete;
+  Mutex& operator=(const Mutex&) = delete;
 };
 
 // We cannot name this class MutexLock because the ctor declaration would
@@ -1705,15 +1691,15 @@ class Mutex : public MutexBase {
 // "MutexLock l(&mu)".  Hence the typedef trick below.
 class GTestMutexLock {
  public:
-  explicit GTestMutexLock(MutexBase* mutex)
-      : mutex_(mutex) { mutex_->Lock(); }
+  explicit GTestMutexLock(MutexBase* mutex) : mutex_(mutex) { mutex_->Lock(); }
 
   ~GTestMutexLock() { mutex_->Unlock(); }
 
  private:
   MutexBase* const mutex_;
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock);
+  GTestMutexLock(const GTestMutexLock&) = delete;
+  GTestMutexLock& operator=(const GTestMutexLock&) = delete;
 };
 
 typedef GTestMutexLock MutexLock;
@@ -1770,7 +1756,8 @@ class GTEST_API_ ThreadLocal {
 
    private:
     T value_;
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder);
+    ValueHolder(const ValueHolder&) = delete;
+    ValueHolder& operator=(const ValueHolder&) = delete;
   };
 
   static pthread_key_t CreateKey() {
@@ -1802,39 +1789,45 @@ class GTEST_API_ ThreadLocal {
     virtual ValueHolder* MakeNewHolder() const = 0;
 
    private:
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolderFactory);
+    ValueHolderFactory(const ValueHolderFactory&) = delete;
+    ValueHolderFactory& operator=(const ValueHolderFactory&) = delete;
   };
 
   class DefaultValueHolderFactory : public ValueHolderFactory {
    public:
     DefaultValueHolderFactory() {}
-    virtual ValueHolder* MakeNewHolder() const { return new ValueHolder(); }
+    ValueHolder* MakeNewHolder() const override { return new ValueHolder(); }
 
    private:
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultValueHolderFactory);
+    DefaultValueHolderFactory(const DefaultValueHolderFactory&) = delete;
+    DefaultValueHolderFactory& operator=(const DefaultValueHolderFactory&) =
+        delete;
   };
 
   class InstanceValueHolderFactory : public ValueHolderFactory {
    public:
     explicit InstanceValueHolderFactory(const T& value) : value_(value) {}
-    virtual ValueHolder* MakeNewHolder() const {
+    ValueHolder* MakeNewHolder() const override {
       return new ValueHolder(value_);
     }
 
    private:
     const T value_;  // The value for each thread.
 
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(InstanceValueHolderFactory);
+    InstanceValueHolderFactory(const InstanceValueHolderFactory&) = delete;
+    InstanceValueHolderFactory& operator=(const InstanceValueHolderFactory&) =
+        delete;
   };
 
   // A key pthreads uses for looking up per-thread values.
   const pthread_key_t key_;
   std::unique_ptr<ValueHolderFactory> default_factory_;
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal);
+  ThreadLocal(const ThreadLocal&) = delete;
+  ThreadLocal& operator=(const ThreadLocal&) = delete;
 };
 
-# endif  // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
+#endif  // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
 
 #else  // GTEST_IS_THREADSAFE
 
@@ -1851,10 +1844,10 @@ class Mutex {
   void AssertHeld() const {}
 };
 
-# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
   extern ::testing::internal::Mutex mutex
 
-# define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex
+#define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex
 
 // We cannot name this class MutexLock because the ctor declaration would
 // conflict with a macro named MutexLock, which is defined on some
@@ -1877,6 +1870,7 @@ class GTEST_API_ ThreadLocal {
   const T* pointer() const { return &value_; }
   const T& get() const { return value_; }
   void set(const T& value) { value_ = value; }
+
  private:
   T value_;
 };
@@ -1887,18 +1881,12 @@ class GTEST_API_ ThreadLocal {
 // we cannot detect it.
 GTEST_API_ size_t GetThreadCount();
 
-template <bool B>
-using bool_constant = std::integral_constant<bool, B>;
-
 #if GTEST_OS_WINDOWS
-# define GTEST_PATH_SEP_ "\\"
-# define GTEST_HAS_ALT_PATH_SEP_ 1
-// The biggest signed integer type the compiler supports.
-typedef __int64 BiggestInt;
+#define GTEST_PATH_SEP_ "\\"
+#define GTEST_HAS_ALT_PATH_SEP_ 1
 #else
-# define GTEST_PATH_SEP_ "/"
-# define GTEST_HAS_ALT_PATH_SEP_ 0
-typedef long long BiggestInt;  // NOLINT
+#define GTEST_PATH_SEP_ "/"
+#define GTEST_HAS_ALT_PATH_SEP_ 0
 #endif  // GTEST_OS_WINDOWS
 
 // Utilities for char.
@@ -1929,6 +1917,19 @@ inline bool IsUpper(char ch) {
 inline bool IsXDigit(char ch) {
   return isxdigit(static_cast<unsigned char>(ch)) != 0;
 }
+#ifdef __cpp_char8_t
+inline bool IsXDigit(char8_t ch) {
+  return isxdigit(static_cast<unsigned char>(ch)) != 0;
+}
+#endif
+inline bool IsXDigit(char16_t ch) {
+  const unsigned char low_byte = static_cast<unsigned char>(ch);
+  return ch == low_byte && isxdigit(low_byte) != 0;
+}
+inline bool IsXDigit(char32_t ch) {
+  const unsigned char low_byte = static_cast<unsigned char>(ch);
+  return ch == low_byte && isxdigit(low_byte) != 0;
+}
 inline bool IsXDigit(wchar_t ch) {
   const unsigned char low_byte = static_cast<unsigned char>(ch);
   return ch == low_byte && isxdigit(low_byte) != 0;
@@ -1943,8 +1944,7 @@ inline char ToUpper(char ch) {
 
 inline std::string StripTrailingSpaces(std::string str) {
   std::string::iterator it = str.end();
-  while (it != str.begin() && IsSpace(*--it))
-    it = str.erase(it);
+  while (it != str.begin() && IsSpace(*--it)) it = str.erase(it);
   return str;
 }
 
@@ -1962,43 +1962,58 @@ namespace posix {
 
 typedef struct _stat StatStruct;
 
-# ifdef __BORLANDC__
-inline int IsATTY(int fd) { return isatty(fd); }
+#ifdef __BORLANDC__
+inline int DoIsATTY(int fd) { return isatty(fd); }
 inline int StrCaseCmp(const char* s1, const char* s2) {
   return stricmp(s1, s2);
 }
 inline char* StrDup(const char* src) { return strdup(src); }
-# else  // !__BORLANDC__
-#  if GTEST_OS_WINDOWS_MOBILE
-inline int IsATTY(int /* fd */) { return 0; }
-#  else
-inline int IsATTY(int fd) { return _isatty(fd); }
-#  endif  // GTEST_OS_WINDOWS_MOBILE
+#else  // !__BORLANDC__
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS || GTEST_OS_IOS || \
+    GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT || defined(ESP_PLATFORM)
+inline int DoIsATTY(int /* fd */) { return 0; }
+#else
+inline int DoIsATTY(int fd) { return _isatty(fd); }
+#endif  // GTEST_OS_WINDOWS_MOBILE
 inline int StrCaseCmp(const char* s1, const char* s2) {
   return _stricmp(s1, s2);
 }
 inline char* StrDup(const char* src) { return _strdup(src); }
-# endif  // __BORLANDC__
+#endif  // __BORLANDC__
 
-# if GTEST_OS_WINDOWS_MOBILE
+#if GTEST_OS_WINDOWS_MOBILE
 inline int FileNo(FILE* file) { return reinterpret_cast<int>(_fileno(file)); }
 // Stat(), RmDir(), and IsDir() are not needed on Windows CE at this
 // time and thus not defined there.
-# else
+#else
 inline int FileNo(FILE* file) { return _fileno(file); }
 inline int Stat(const char* path, StatStruct* buf) { return _stat(path, buf); }
 inline int RmDir(const char* dir) { return _rmdir(dir); }
-inline bool IsDir(const StatStruct& st) {
-  return (_S_IFDIR & st.st_mode) != 0;
+inline bool IsDir(const StatStruct& st) { return (_S_IFDIR & st.st_mode) != 0; }
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+#elif GTEST_OS_ESP8266
+typedef struct stat StatStruct;
+
+inline int FileNo(FILE* file) { return fileno(file); }
+inline int DoIsATTY(int fd) { return isatty(fd); }
+inline int Stat(const char* path, StatStruct* buf) {
+  // stat function not implemented on ESP8266
+  return 0;
+}
+inline int StrCaseCmp(const char* s1, const char* s2) {
+  return strcasecmp(s1, s2);
 }
-# endif  // GTEST_OS_WINDOWS_MOBILE
+inline char* StrDup(const char* src) { return strdup(src); }
+inline int RmDir(const char* dir) { return rmdir(dir); }
+inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); }
 
 #else
 
 typedef struct stat StatStruct;
 
 inline int FileNo(FILE* file) { return fileno(file); }
-inline int IsATTY(int fd) { return isatty(fd); }
+inline int DoIsATTY(int fd) { return isatty(fd); }
 inline int Stat(const char* path, StatStruct* buf) { return stat(path, buf); }
 inline int StrCaseCmp(const char* s1, const char* s2) {
   return strcasecmp(s1, s2);
@@ -2009,26 +2024,42 @@ inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); }
 
 #endif  // GTEST_OS_WINDOWS
 
+inline int IsATTY(int fd) {
+  // DoIsATTY might change errno (for example ENOTTY in case you redirect stdout
+  // to a file on Linux), which is unexpected, so save the previous value, and
+  // restore it after the call.
+  int savedErrno = errno;
+  int isAttyValue = DoIsATTY(fd);
+  errno = savedErrno;
+
+  return isAttyValue;
+}
+
 // Functions deprecated by MSVC 8.0.
 
 GTEST_DISABLE_MSC_DEPRECATED_PUSH_()
 
-inline const char* StrNCpy(char* dest, const char* src, size_t n) {
-  return strncpy(dest, src, n);
-}
-
 // ChDir(), FReopen(), FDOpen(), Read(), Write(), Close(), and
 // StrError() aren't needed on Windows CE at this time and thus not
 // defined there.
 
-#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \
+    !GTEST_OS_WINDOWS_RT && !GTEST_OS_ESP8266 && !GTEST_OS_XTENSA
 inline int ChDir(const char* dir) { return chdir(dir); }
 #endif
 inline FILE* FOpen(const char* path, const char* mode) {
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW
+  struct wchar_codecvt : public std::codecvt<wchar_t, char, std::mbstate_t> {};
+  std::wstring_convert<wchar_codecvt> converter;
+  std::wstring wide_path = converter.from_bytes(path);
+  std::wstring wide_mode = converter.from_bytes(mode);
+  return _wfopen(wide_path.c_str(), wide_mode.c_str());
+#else   // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW
   return fopen(path, mode);
+#endif  // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW
 }
 #if !GTEST_OS_WINDOWS_MOBILE
-inline FILE *FReopen(const char* path, const char* mode, FILE* stream) {
+inline FILE* FReopen(const char* path, const char* mode, FILE* stream) {
   return freopen(path, mode, stream);
 }
 inline FILE* FDOpen(int fd, const char* mode) { return fdopen(fd, mode); }
@@ -2045,8 +2076,9 @@ inline int Close(int fd) { return close(fd); }
 inline const char* StrError(int errnum) { return strerror(errnum); }
 #endif
 inline const char* GetEnv(const char* name) {
-#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT
-  // We are on Windows CE, which has no environment variables.
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \
+    GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_XTENSA
+  // We are on an embedded platform, which has no environment variables.
   static_cast<void>(name);  // To prevent 'unused argument' warning.
   return nullptr;
 #elif defined(__BORLANDC__) || defined(__SunOS_5_8) || defined(__SunOS_5_9)
@@ -2079,24 +2111,22 @@ GTEST_DISABLE_MSC_DEPRECATED_POP_()
 // snprintf is a variadic function.
 #if _MSC_VER && !GTEST_OS_WINDOWS_MOBILE
 // MSVC 2005 and above support variadic macros.
-# define GTEST_SNPRINTF_(buffer, size, format, ...) \
-     _snprintf_s(buffer, size, size, format, __VA_ARGS__)
+#define GTEST_SNPRINTF_(buffer, size, format, ...) \
+  _snprintf_s(buffer, size, size, format, __VA_ARGS__)
 #elif defined(_MSC_VER)
 // Windows CE does not define _snprintf_s
-# define GTEST_SNPRINTF_ _snprintf
+#define GTEST_SNPRINTF_ _snprintf
 #else
-# define GTEST_SNPRINTF_ snprintf
+#define GTEST_SNPRINTF_ snprintf
 #endif
 
-// The maximum number a BiggestInt can represent.  This definition
-// works no matter BiggestInt is represented in one's complement or
-// two's complement.
+// The biggest signed integer type the compiler supports.
 //
-// We cannot rely on numeric_limits in STL, as __int64 and long long
-// are not part of standard C++ and numeric_limits doesn't need to be
-// defined for them.
-const BiggestInt kMaxBiggestInt =
-    ~(static_cast<BiggestInt>(1) << (8*sizeof(BiggestInt) - 1));
+// long long is guaranteed to be at least 64-bits in C++11.
+using BiggestInt = long long;  // NOLINT
+
+// The maximum number a BiggestInt can represent.
+constexpr BiggestInt kMaxBiggestInt = (std::numeric_limits<BiggestInt>::max)();
 
 // This template class serves as a compile-time function from size to
 // type.  It maps a size in bytes to a primitive type with that
@@ -2121,87 +2151,122 @@ class TypeWithSize {
  public:
   // This prevents the user from using TypeWithSize<N> with incorrect
   // values of N.
-  typedef void UInt;
+  using UInt = void;
 };
 
 // The specialization for size 4.
 template <>
 class TypeWithSize<4> {
  public:
-  // unsigned int has size 4 in both gcc and MSVC.
-  //
-  // As base/basictypes.h doesn't compile on Windows, we cannot use
-  // uint32, uint64, and etc here.
-  typedef int Int;
-  typedef unsigned int UInt;
+  using Int = std::int32_t;
+  using UInt = std::uint32_t;
 };
 
 // The specialization for size 8.
 template <>
 class TypeWithSize<8> {
  public:
-#if GTEST_OS_WINDOWS
-  typedef __int64 Int;
-  typedef unsigned __int64 UInt;
-#else
-  typedef long long Int;  // NOLINT
-  typedef unsigned long long UInt;  // NOLINT
-#endif  // GTEST_OS_WINDOWS
+  using Int = std::int64_t;
+  using UInt = std::uint64_t;
 };
 
 // Integer types of known sizes.
-typedef TypeWithSize<4>::Int Int32;
-typedef TypeWithSize<4>::UInt UInt32;
-typedef TypeWithSize<8>::Int Int64;
-typedef TypeWithSize<8>::UInt UInt64;
-typedef TypeWithSize<8>::Int TimeInMillis;  // Represents time in milliseconds.
+using TimeInMillis = int64_t;  // Represents time in milliseconds.
 
 // Utilities for command line flags and environment variables.
 
 // Macro for referencing flags.
 #if !defined(GTEST_FLAG)
-# define GTEST_FLAG(name) FLAGS_gtest_##name
+#define GTEST_FLAG_NAME_(name) gtest_##name
+#define GTEST_FLAG(name) FLAGS_gtest_##name
 #endif  // !defined(GTEST_FLAG)
 
-#if !defined(GTEST_USE_OWN_FLAGFILE_FLAG_)
-# define GTEST_USE_OWN_FLAGFILE_FLAG_ 1
-#endif  // !defined(GTEST_USE_OWN_FLAGFILE_FLAG_)
+// Pick a command line flags implementation.
+#if GTEST_HAS_ABSL
 
-#if !defined(GTEST_DECLARE_bool_)
-# define GTEST_FLAG_SAVER_ ::testing::internal::GTestFlagSaver
+// Macros for defining flags.
+#define GTEST_DEFINE_bool_(name, default_val, doc) \
+  ABSL_FLAG(bool, GTEST_FLAG_NAME_(name), default_val, doc)
+#define GTEST_DEFINE_int32_(name, default_val, doc) \
+  ABSL_FLAG(int32_t, GTEST_FLAG_NAME_(name), default_val, doc)
+#define GTEST_DEFINE_string_(name, default_val, doc) \
+  ABSL_FLAG(std::string, GTEST_FLAG_NAME_(name), default_val, doc)
 
 // Macros for declaring flags.
-# define GTEST_DECLARE_bool_(name) GTEST_API_ extern bool GTEST_FLAG(name)
-# define GTEST_DECLARE_int32_(name) \
-    GTEST_API_ extern ::testing::internal::Int32 GTEST_FLAG(name)
-# define GTEST_DECLARE_string_(name) \
-    GTEST_API_ extern ::std::string GTEST_FLAG(name)
+#define GTEST_DECLARE_bool_(name) \
+  ABSL_DECLARE_FLAG(bool, GTEST_FLAG_NAME_(name))
+#define GTEST_DECLARE_int32_(name) \
+  ABSL_DECLARE_FLAG(int32_t, GTEST_FLAG_NAME_(name))
+#define GTEST_DECLARE_string_(name) \
+  ABSL_DECLARE_FLAG(std::string, GTEST_FLAG_NAME_(name))
+
+#define GTEST_FLAG_SAVER_ ::absl::FlagSaver
+
+#define GTEST_FLAG_GET(name) ::absl::GetFlag(GTEST_FLAG(name))
+#define GTEST_FLAG_SET(name, value) \
+  (void)(::absl::SetFlag(&GTEST_FLAG(name), value))
+#define GTEST_USE_OWN_FLAGFILE_FLAG_ 0
+
+#else  // GTEST_HAS_ABSL
 
 // Macros for defining flags.
-# define GTEST_DEFINE_bool_(name, default_val, doc) \
-    GTEST_API_ bool GTEST_FLAG(name) = (default_val)
-# define GTEST_DEFINE_int32_(name, default_val, doc) \
-    GTEST_API_ ::testing::internal::Int32 GTEST_FLAG(name) = (default_val)
-# define GTEST_DEFINE_string_(name, default_val, doc) \
-    GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val)
+#define GTEST_DEFINE_bool_(name, default_val, doc)  \
+  namespace testing {                               \
+  GTEST_API_ bool GTEST_FLAG(name) = (default_val); \
+  }                                                 \
+  static_assert(true, "no-op to require trailing semicolon")
+#define GTEST_DEFINE_int32_(name, default_val, doc)         \
+  namespace testing {                                       \
+  GTEST_API_ std::int32_t GTEST_FLAG(name) = (default_val); \
+  }                                                         \
+  static_assert(true, "no-op to require trailing semicolon")
+#define GTEST_DEFINE_string_(name, default_val, doc)         \
+  namespace testing {                                        \
+  GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val); \
+  }                                                          \
+  static_assert(true, "no-op to require trailing semicolon")
 
-#endif  // !defined(GTEST_DECLARE_bool_)
+// Macros for declaring flags.
+#define GTEST_DECLARE_bool_(name)          \
+  namespace testing {                      \
+  GTEST_API_ extern bool GTEST_FLAG(name); \
+  }                                        \
+  static_assert(true, "no-op to require trailing semicolon")
+#define GTEST_DECLARE_int32_(name)                 \
+  namespace testing {                              \
+  GTEST_API_ extern std::int32_t GTEST_FLAG(name); \
+  }                                                \
+  static_assert(true, "no-op to require trailing semicolon")
+#define GTEST_DECLARE_string_(name)                 \
+  namespace testing {                               \
+  GTEST_API_ extern ::std::string GTEST_FLAG(name); \
+  }                                                 \
+  static_assert(true, "no-op to require trailing semicolon")
+
+#define GTEST_FLAG_SAVER_ ::testing::internal::GTestFlagSaver
+
+#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
+#define GTEST_FLAG_SET(name, value) (void)(::testing::GTEST_FLAG(name) = value)
+#define GTEST_USE_OWN_FLAGFILE_FLAG_ 1
+
+#endif  // GTEST_HAS_ABSL
 
 // Thread annotations
 #if !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_)
-# define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks)
-# define GTEST_LOCK_EXCLUDED_(locks)
+#define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks)
+#define GTEST_LOCK_EXCLUDED_(locks)
 #endif  // !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_)
 
 // Parses 'str' for a 32-bit signed integer.  If successful, writes the result
 // to *value and returns true; otherwise leaves *value unchanged and returns
 // false.
-bool ParseInt32(const Message& src_text, const char* str, Int32* value);
+GTEST_API_ bool ParseInt32(const Message& src_text, const char* str,
+                           int32_t* value);
 
-// Parses a bool/Int32/string from the environment variable
+// Parses a bool/int32_t/string from the environment variable
 // corresponding to the given Google Test flag.
 bool BoolFromGTestEnv(const char* flag, bool default_val);
-GTEST_API_ Int32 Int32FromGTestEnv(const char* flag, Int32 default_val);
+GTEST_API_ int32_t Int32FromGTestEnv(const char* flag, int32_t default_val);
 std::string OutputFlagAlsoCheckEnvVar();
 const char* StringFromGTestEnv(const char* flag, const char* default_val);
 
@@ -2228,4 +2293,121 @@ const char* StringFromGTestEnv(const char* flag, const char* default_val);
 
 #endif  // !defined(GTEST_INTERNAL_DEPRECATED)
 
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
+#if GTEST_HAS_ABSL
+// Always use absl::any for UniversalPrinter<> specializations if googletest
+// is built with absl support.
+#define GTEST_INTERNAL_HAS_ANY 1
+#include "absl/types/any.h"
+namespace testing {
+namespace internal {
+using Any = ::absl::any;
+}  // namespace internal
+}  // namespace testing
+#else
+#ifdef __has_include
+#if __has_include(<any>) && __cplusplus >= 201703L
+// Otherwise for C++17 and higher use std::any for UniversalPrinter<>
+// specializations.
+#define GTEST_INTERNAL_HAS_ANY 1
+#include <any>
+namespace testing {
+namespace internal {
+using Any = ::std::any;
+}  // namespace internal
+}  // namespace testing
+// The case where absl is configured NOT to alias std::any is not
+// supported.
+#endif  // __has_include(<any>) && __cplusplus >= 201703L
+#endif  // __has_include
+#endif  // GTEST_HAS_ABSL
+
+#if GTEST_HAS_ABSL
+// Always use absl::optional for UniversalPrinter<> specializations if
+// googletest is built with absl support.
+#define GTEST_INTERNAL_HAS_OPTIONAL 1
+#include "absl/types/optional.h"
+namespace testing {
+namespace internal {
+template <typename T>
+using Optional = ::absl::optional<T>;
+inline ::absl::nullopt_t Nullopt() { return ::absl::nullopt; }
+}  // namespace internal
+}  // namespace testing
+#else
+#ifdef __has_include
+#if __has_include(<optional>) && __cplusplus >= 201703L
+// Otherwise for C++17 and higher use std::optional for UniversalPrinter<>
+// specializations.
+#define GTEST_INTERNAL_HAS_OPTIONAL 1
+#include <optional>
+namespace testing {
+namespace internal {
+template <typename T>
+using Optional = ::std::optional<T>;
+inline ::std::nullopt_t Nullopt() { return ::std::nullopt; }
+}  // namespace internal
+}  // namespace testing
+// The case where absl is configured NOT to alias std::optional is not
+// supported.
+#endif  // __has_include(<optional>) && __cplusplus >= 201703L
+#endif  // __has_include
+#endif  // GTEST_HAS_ABSL
+
+#if GTEST_HAS_ABSL
+// Always use absl::string_view for Matcher<> specializations if googletest
+// is built with absl support.
+#define GTEST_INTERNAL_HAS_STRING_VIEW 1
+#include "absl/strings/string_view.h"
+namespace testing {
+namespace internal {
+using StringView = ::absl::string_view;
+}  // namespace internal
+}  // namespace testing
+#else
+#ifdef __has_include
+#if __has_include(<string_view>) && __cplusplus >= 201703L
+// Otherwise for C++17 and higher use std::string_view for Matcher<>
+// specializations.
+#define GTEST_INTERNAL_HAS_STRING_VIEW 1
+#include <string_view>
+namespace testing {
+namespace internal {
+using StringView = ::std::string_view;
+}  // namespace internal
+}  // namespace testing
+// The case where absl is configured NOT to alias std::string_view is not
+// supported.
+#endif  // __has_include(<string_view>) && __cplusplus >= 201703L
+#endif  // __has_include
+#endif  // GTEST_HAS_ABSL
+
+#if GTEST_HAS_ABSL
+// Always use absl::variant for UniversalPrinter<> specializations if googletest
+// is built with absl support.
+#define GTEST_INTERNAL_HAS_VARIANT 1
+#include "absl/types/variant.h"
+namespace testing {
+namespace internal {
+template <typename... T>
+using Variant = ::absl::variant<T...>;
+}  // namespace internal
+}  // namespace testing
+#else
+#ifdef __has_include
+#if __has_include(<variant>) && __cplusplus >= 201703L
+// Otherwise for C++17 and higher use std::variant for UniversalPrinter<>
+// specializations.
+#define GTEST_INTERNAL_HAS_VARIANT 1
+#include <variant>
+namespace testing {
+namespace internal {
+template <typename... T>
+using Variant = ::std::variant<T...>;
+}  // namespace internal
+}  // namespace testing
+// The case where absl is configured NOT to alias std::variant is not supported.
+#endif  // __has_include(<variant>) && __cplusplus >= 201703L
+#endif  // __has_include
+#endif  // GTEST_HAS_ABSL
+
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
diff --git a/third_party/googletest/src/include/gtest/internal/gtest-string.h b/third_party/googletest/src/include/gtest/internal/gtest-string.h
index 82aaa63bf4..cca2e1f2ad 100644
--- a/third_party/googletest/src/include/gtest/internal/gtest-string.h
+++ b/third_party/googletest/src/include/gtest/internal/gtest-string.h
@@ -26,7 +26,7 @@
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+
 // The Google C++ Testing and Mocking Framework (Google Test)
 //
 // This header file declares the String class and functions used internally by
@@ -36,17 +36,21 @@
 // This header file is #included by gtest-internal.h.
 // It should not be #included by other files.
 
-// GOOGLETEST_CM0001 DO NOT DELETE
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
 
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
 
 #ifdef __BORLANDC__
 // string.h is not guaranteed to provide strcpy on C++ Builder.
-# include <mem.h>
+#include <mem.h>
 #endif
 
 #include <string.h>
+
+#include <cstdint>
 #include <string>
 
 #include "gtest/internal/gtest-port.h"
@@ -122,8 +126,7 @@ class GTEST_API_ String {
   // Unlike strcasecmp(), this function can handle NULL argument(s).
   // A NULL C string is considered different to any non-NULL C string,
   // including the empty string.
-  static bool CaseInsensitiveCStringEquals(const char* lhs,
-                                           const char* rhs);
+  static bool CaseInsensitiveCStringEquals(const char* lhs, const char* rhs);
 
   // Compares two wide C strings, ignoring case.  Returns true if and only if
   // they have the same content.
@@ -142,24 +145,27 @@ class GTEST_API_ String {
 
   // Returns true if and only if the given string ends with the given suffix,
   // ignoring case. Any string is considered to end with an empty suffix.
-  static bool EndsWithCaseInsensitive(
-      const std::string& str, const std::string& suffix);
+  static bool EndsWithCaseInsensitive(const std::string& str,
+                                      const std::string& suffix);
 
   // Formats an int value as "%02d".
   static std::string FormatIntWidth2(int value);  // "%02d" for width == 2
 
+  // Formats an int value to given width with leading zeros.
+  static std::string FormatIntWidthN(int value, int width);
+
   // Formats an int value as "%X".
   static std::string FormatHexInt(int value);
 
   // Formats an int value as "%X".
-  static std::string FormatHexUInt32(UInt32 value);
+  static std::string FormatHexUInt32(uint32_t value);
 
   // Formats a byte as "%02X".
   static std::string FormatByte(unsigned char value);
 
  private:
   String();  // Not meant to be instantiated.
-};  // class String
+};           // class String
 
 // Gets the content of the stringstream's buffer as an std::string.  Each '\0'
 // character in the buffer is replaced with "\\0".
@@ -168,4 +174,4 @@ GTEST_API_ std::string StringStreamToString(::std::stringstream* stream);
 }  // namespace internal
 }  // namespace testing
 
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
diff --git a/third_party/googletest/src/include/gtest/internal/gtest-type-util.h b/third_party/googletest/src/include/gtest/internal/gtest-type-util.h
index 3d7542d1fb..6bc02a7de3 100644
--- a/third_party/googletest/src/include/gtest/internal/gtest-type-util.h
+++ b/third_party/googletest/src/include/gtest/internal/gtest-type-util.h
@@ -1,7 +1,3 @@
-// This file was GENERATED by command:
-//     pump.py gtest-type-util.h.pump
-// DO NOT EDIT BY HAND!!!
-
 // Copyright 2008 Google Inc.
 // All Rights Reserved.
 //
@@ -32,27 +28,24 @@
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 // Type utilities needed for implementing typed and type-parameterized
-// tests.  This file is generated by a SCRIPT.  DO NOT EDIT BY HAND!
-//
-// Currently we support at most 50 types in a list, and at most 50
-// type-parameterized tests in one type-parameterized test suite.
-// Please contact googletestframework@googlegroups.com if you need
-// more.
+// tests.
 
-// GOOGLETEST_CM0001 DO NOT DELETE
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
 
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
 
 #include "gtest/internal/gtest-port.h"
 
 // #ifdef __GNUC__ is too general here.  It is possible to use gcc without using
 // libstdc++ (which is where cxxabi.h comes from).
-# if GTEST_HAS_CXXABI_H_
-#  include <cxxabi.h>
-# elif defined(__HP_aCC)
-#  include <acxx_demangle.h>
-# endif  // GTEST_HASH_CXXABI_H_
+#if GTEST_HAS_CXXABI_H_
+#include <cxxabi.h>
+#elif defined(__HP_aCC)
+#include <acxx_demangle.h>
+#endif  // GTEST_HASH_CXXABI_H_
 
 namespace testing {
 namespace internal {
@@ -73,1557 +66,46 @@ inline std::string CanonicalizeForStdLibVersioning(std::string s) {
   return s;
 }
 
-// GetTypeName<T>() returns a human-readable name of type T.
-// NB: This function is also used in Google Mock, so don't move it inside of
-// the typed-test-only section below.
-template <typename T>
-std::string GetTypeName() {
-# if GTEST_HAS_RTTI
-
-  const char* const name = typeid(T).name();
-#  if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC)
+#if GTEST_HAS_RTTI
+// GetTypeName(const std::type_info&) returns a human-readable name of type T.
+inline std::string GetTypeName(const std::type_info& type) {
+  const char* const name = type.name();
+#if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC)
   int status = 0;
   // gcc's implementation of typeid(T).name() mangles the type name,
   // so we have to demangle it.
-#   if GTEST_HAS_CXXABI_H_
+#if GTEST_HAS_CXXABI_H_
   using abi::__cxa_demangle;
-#   endif  // GTEST_HAS_CXXABI_H_
+#endif  // GTEST_HAS_CXXABI_H_
   char* const readable_name = __cxa_demangle(name, nullptr, nullptr, &status);
   const std::string name_str(status == 0 ? readable_name : name);
   free(readable_name);
   return CanonicalizeForStdLibVersioning(name_str);
-#  else
+#else
   return name;
-#  endif  // GTEST_HAS_CXXABI_H_ || __HP_aCC
-
-# else
+#endif  // GTEST_HAS_CXXABI_H_ || __HP_aCC
+}
+#endif  // GTEST_HAS_RTTI
 
+// GetTypeName<T>() returns a human-readable name of type T if and only if
+// RTTI is enabled, otherwise it returns a dummy type name.
+// NB: This function is also used in Google Mock, so don't move it inside of
+// the typed-test-only section below.
+template <typename T>
+std::string GetTypeName() {
+#if GTEST_HAS_RTTI
+  return GetTypeName(typeid(T));
+#else
   return "<type>";
-
-# endif  // GTEST_HAS_RTTI
+#endif  // GTEST_HAS_RTTI
 }
 
-#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
-
-// A unique type used as the default value for the arguments of class
-// template Types.  This allows us to simulate variadic templates
-// (e.g. Types<int>, Type<int, double>, and etc), which C++ doesn't
-// support directly.
+// A unique type indicating an empty node
 struct None {};
 
-// The following family of struct and struct templates are used to
-// represent type lists.  In particular, TypesN<T1, T2, ..., TN>
-// represents a type list with N types (T1, T2, ..., and TN) in it.
-// Except for Types0, every struct in the family has two member types:
-// Head for the first type in the list, and Tail for the rest of the
-// list.
-
-// The empty type list.
-struct Types0 {};
-
-// Type lists of length 1, 2, 3, and so on.
-
-template <typename T1>
-struct Types1 {
-  typedef T1 Head;
-  typedef Types0 Tail;
-};
-template <typename T1, typename T2>
-struct Types2 {
-  typedef T1 Head;
-  typedef Types1<T2> Tail;
-};
-
-template <typename T1, typename T2, typename T3>
-struct Types3 {
-  typedef T1 Head;
-  typedef Types2<T2, T3> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4>
-struct Types4 {
-  typedef T1 Head;
-  typedef Types3<T2, T3, T4> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5>
-struct Types5 {
-  typedef T1 Head;
-  typedef Types4<T2, T3, T4, T5> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6>
-struct Types6 {
-  typedef T1 Head;
-  typedef Types5<T2, T3, T4, T5, T6> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7>
-struct Types7 {
-  typedef T1 Head;
-  typedef Types6<T2, T3, T4, T5, T6, T7> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8>
-struct Types8 {
-  typedef T1 Head;
-  typedef Types7<T2, T3, T4, T5, T6, T7, T8> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9>
-struct Types9 {
-  typedef T1 Head;
-  typedef Types8<T2, T3, T4, T5, T6, T7, T8, T9> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10>
-struct Types10 {
-  typedef T1 Head;
-  typedef Types9<T2, T3, T4, T5, T6, T7, T8, T9, T10> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11>
-struct Types11 {
-  typedef T1 Head;
-  typedef Types10<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12>
-struct Types12 {
-  typedef T1 Head;
-  typedef Types11<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13>
-struct Types13 {
-  typedef T1 Head;
-  typedef Types12<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14>
-struct Types14 {
-  typedef T1 Head;
-  typedef Types13<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15>
-struct Types15 {
-  typedef T1 Head;
-  typedef Types14<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16>
-struct Types16 {
-  typedef T1 Head;
-  typedef Types15<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17>
-struct Types17 {
-  typedef T1 Head;
-  typedef Types16<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18>
-struct Types18 {
-  typedef T1 Head;
-  typedef Types17<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19>
-struct Types19 {
-  typedef T1 Head;
-  typedef Types18<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20>
-struct Types20 {
-  typedef T1 Head;
-  typedef Types19<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21>
-struct Types21 {
-  typedef T1 Head;
-  typedef Types20<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22>
-struct Types22 {
-  typedef T1 Head;
-  typedef Types21<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23>
-struct Types23 {
-  typedef T1 Head;
-  typedef Types22<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24>
-struct Types24 {
-  typedef T1 Head;
-  typedef Types23<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25>
-struct Types25 {
-  typedef T1 Head;
-  typedef Types24<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26>
-struct Types26 {
-  typedef T1 Head;
-  typedef Types25<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27>
-struct Types27 {
-  typedef T1 Head;
-  typedef Types26<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28>
-struct Types28 {
-  typedef T1 Head;
-  typedef Types27<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29>
-struct Types29 {
-  typedef T1 Head;
-  typedef Types28<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30>
-struct Types30 {
-  typedef T1 Head;
-  typedef Types29<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31>
-struct Types31 {
-  typedef T1 Head;
-  typedef Types30<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32>
-struct Types32 {
-  typedef T1 Head;
-  typedef Types31<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33>
-struct Types33 {
-  typedef T1 Head;
-  typedef Types32<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34>
-struct Types34 {
-  typedef T1 Head;
-  typedef Types33<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35>
-struct Types35 {
-  typedef T1 Head;
-  typedef Types34<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36>
-struct Types36 {
-  typedef T1 Head;
-  typedef Types35<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37>
-struct Types37 {
-  typedef T1 Head;
-  typedef Types36<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38>
-struct Types38 {
-  typedef T1 Head;
-  typedef Types37<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39>
-struct Types39 {
-  typedef T1 Head;
-  typedef Types38<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40>
-struct Types40 {
-  typedef T1 Head;
-  typedef Types39<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41>
-struct Types41 {
-  typedef T1 Head;
-  typedef Types40<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42>
-struct Types42 {
-  typedef T1 Head;
-  typedef Types41<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43>
-struct Types43 {
-  typedef T1 Head;
-  typedef Types42<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-      T43> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44>
-struct Types44 {
-  typedef T1 Head;
-  typedef Types43<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-      T44> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45>
-struct Types45 {
-  typedef T1 Head;
-  typedef Types44<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-      T44, T45> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46>
-struct Types46 {
-  typedef T1 Head;
-  typedef Types45<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-      T44, T45, T46> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47>
-struct Types47 {
-  typedef T1 Head;
-  typedef Types46<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-      T44, T45, T46, T47> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47, typename T48>
-struct Types48 {
-  typedef T1 Head;
-  typedef Types47<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-      T44, T45, T46, T47, T48> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47, typename T48, typename T49>
-struct Types49 {
-  typedef T1 Head;
-  typedef Types48<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-      T44, T45, T46, T47, T48, T49> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47, typename T48, typename T49, typename T50>
-struct Types50 {
-  typedef T1 Head;
-  typedef Types49<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-      T44, T45, T46, T47, T48, T49, T50> Tail;
-};
-
-
-}  // namespace internal
-
-// We don't want to require the users to write TypesN<...> directly,
-// as that would require them to count the length.  Types<...> is much
-// easier to write, but generates horrible messages when there is a
-// compiler error, as gcc insists on printing out each template
-// argument, even if it has the default value (this means Types<int>
-// will appear as Types<int, None, None, ..., None> in the compiler
-// errors).
-//
-// Our solution is to combine the best part of the two approaches: a
-// user would write Types<T1, ..., TN>, and Google Test will translate
-// that to TypesN<T1, ..., TN> internally to make error messages
-// readable.  The translation is done by the 'type' member of the
-// Types template.
-template <typename T1 = internal::None, typename T2 = internal::None,
-    typename T3 = internal::None, typename T4 = internal::None,
-    typename T5 = internal::None, typename T6 = internal::None,
-    typename T7 = internal::None, typename T8 = internal::None,
-    typename T9 = internal::None, typename T10 = internal::None,
-    typename T11 = internal::None, typename T12 = internal::None,
-    typename T13 = internal::None, typename T14 = internal::None,
-    typename T15 = internal::None, typename T16 = internal::None,
-    typename T17 = internal::None, typename T18 = internal::None,
-    typename T19 = internal::None, typename T20 = internal::None,
-    typename T21 = internal::None, typename T22 = internal::None,
-    typename T23 = internal::None, typename T24 = internal::None,
-    typename T25 = internal::None, typename T26 = internal::None,
-    typename T27 = internal::None, typename T28 = internal::None,
-    typename T29 = internal::None, typename T30 = internal::None,
-    typename T31 = internal::None, typename T32 = internal::None,
-    typename T33 = internal::None, typename T34 = internal::None,
-    typename T35 = internal::None, typename T36 = internal::None,
-    typename T37 = internal::None, typename T38 = internal::None,
-    typename T39 = internal::None, typename T40 = internal::None,
-    typename T41 = internal::None, typename T42 = internal::None,
-    typename T43 = internal::None, typename T44 = internal::None,
-    typename T45 = internal::None, typename T46 = internal::None,
-    typename T47 = internal::None, typename T48 = internal::None,
-    typename T49 = internal::None, typename T50 = internal::None>
-struct Types {
-  typedef internal::Types50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42, T43, T44, T45, T46, T47, T48, T49, T50> type;
-};
-
-template <>
-struct Types<internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types0 type;
-};
-template <typename T1>
-struct Types<T1, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types1<T1> type;
-};
-template <typename T1, typename T2>
-struct Types<T1, T2, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types2<T1, T2> type;
-};
-template <typename T1, typename T2, typename T3>
-struct Types<T1, T2, T3, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types3<T1, T2, T3> type;
-};
-template <typename T1, typename T2, typename T3, typename T4>
-struct Types<T1, T2, T3, T4, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types4<T1, T2, T3, T4> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5>
-struct Types<T1, T2, T3, T4, T5, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types5<T1, T2, T3, T4, T5> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6>
-struct Types<T1, T2, T3, T4, T5, T6, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types6<T1, T2, T3, T4, T5, T6> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7>
-struct Types<T1, T2, T3, T4, T5, T6, T7, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types7<T1, T2, T3, T4, T5, T6, T7> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types8<T1, T2, T3, T4, T5, T6, T7, T8> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types9<T1, T2, T3, T4, T5, T6, T7, T8, T9> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types26<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types34<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types36<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types37<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types38<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types39<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types40<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
-      T40> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types41<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types42<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types43<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42, T43> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types44<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42, T43, T44> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42, T43, T44, T45> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
-    T46, internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42, T43, T44, T45, T46> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
-    T46, T47, internal::None, internal::None, internal::None> {
-  typedef internal::Types47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42, T43, T44, T45, T46, T47> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47, typename T48>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
-    T46, T47, T48, internal::None, internal::None> {
-  typedef internal::Types48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42, T43, T44, T45, T46, T47, T48> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47, typename T48, typename T49>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
-    T46, T47, T48, T49, internal::None> {
-  typedef internal::Types49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42, T43, T44, T45, T46, T47, T48, T49> type;
-};
-
-namespace internal {
-
-# define GTEST_TEMPLATE_ template <typename T> class
+#define GTEST_TEMPLATE_ \
+  template <typename T> \
+  class
 
 // The template "selector" struct TemplateSel<Tmpl> is used to
 // represent Tmpl, which must be a class template with one type
@@ -1641,1695 +123,64 @@ struct TemplateSel {
   };
 };
 
-# define GTEST_BIND_(TmplSel, T) \
-  TmplSel::template Bind<T>::type
-
-// A unique struct template used as the default value for the
-// arguments of class template Templates.  This allows us to simulate
-// variadic templates (e.g. Templates<int>, Templates<int, double>,
-// and etc), which C++ doesn't support directly.
-template <typename T>
-struct NoneT {};
-
-// The following family of struct and struct templates are used to
-// represent template lists.  In particular, TemplatesN<T1, T2, ...,
-// TN> represents a list of N templates (T1, T2, ..., and TN).  Except
-// for Templates0, every struct in the family has two member types:
-// Head for the selector of the first template in the list, and Tail
-// for the rest of the list.
-
-// The empty template list.
-struct Templates0 {};
-
-// Template lists of length 1, 2, 3, and so on.
-
-template <GTEST_TEMPLATE_ T1>
-struct Templates1 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates0 Tail;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2>
-struct Templates2 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates1<T2> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3>
-struct Templates3 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates2<T2, T3> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4>
-struct Templates4 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates3<T2, T3, T4> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5>
-struct Templates5 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates4<T2, T3, T4, T5> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6>
-struct Templates6 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates5<T2, T3, T4, T5, T6> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7>
-struct Templates7 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates6<T2, T3, T4, T5, T6, T7> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8>
-struct Templates8 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates7<T2, T3, T4, T5, T6, T7, T8> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9>
-struct Templates9 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates8<T2, T3, T4, T5, T6, T7, T8, T9> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10>
-struct Templates10 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates9<T2, T3, T4, T5, T6, T7, T8, T9, T10> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11>
-struct Templates11 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates10<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12>
-struct Templates12 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates11<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13>
-struct Templates13 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates12<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14>
-struct Templates14 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates13<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15>
-struct Templates15 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates14<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16>
-struct Templates16 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates15<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17>
-struct Templates17 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates16<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18>
-struct Templates18 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates17<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19>
-struct Templates19 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates18<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20>
-struct Templates20 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates19<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21>
-struct Templates21 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates20<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22>
-struct Templates22 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates21<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23>
-struct Templates23 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates22<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24>
-struct Templates24 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates23<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25>
-struct Templates25 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates24<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26>
-struct Templates26 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates25<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27>
-struct Templates27 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates26<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28>
-struct Templates28 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates27<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29>
-struct Templates29 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates28<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30>
-struct Templates30 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates29<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31>
-struct Templates31 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates30<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32>
-struct Templates32 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates31<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33>
-struct Templates33 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates32<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34>
-struct Templates34 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates33<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35>
-struct Templates35 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates34<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36>
-struct Templates36 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates35<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37>
-struct Templates37 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates36<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38>
-struct Templates38 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates37<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39>
-struct Templates39 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates38<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40>
-struct Templates40 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates39<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41>
-struct Templates41 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates40<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42>
-struct Templates42 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates41<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43>
-struct Templates43 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates42<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-      T43> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44>
-struct Templates44 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates43<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-      T43, T44> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45>
-struct Templates45 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates44<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-      T43, T44, T45> Tail;
-};
+#define GTEST_BIND_(TmplSel, T) TmplSel::template Bind<T>::type
 
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
-    GTEST_TEMPLATE_ T46>
-struct Templates46 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates45<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-      T43, T44, T45, T46> Tail;
+template <GTEST_TEMPLATE_ Head_, GTEST_TEMPLATE_... Tail_>
+struct Templates {
+  using Head = TemplateSel<Head_>;
+  using Tail = Templates<Tail_...>;
 };
 
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
-    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47>
-struct Templates47 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates46<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-      T43, T44, T45, T46, T47> Tail;
+template <GTEST_TEMPLATE_ Head_>
+struct Templates<Head_> {
+  using Head = TemplateSel<Head_>;
+  using Tail = None;
 };
 
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
-    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48>
-struct Templates48 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates47<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-      T43, T44, T45, T46, T47, T48> Tail;
+// Tuple-like type lists
+template <typename Head_, typename... Tail_>
+struct Types {
+  using Head = Head_;
+  using Tail = Types<Tail_...>;
 };
 
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
-    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48,
-    GTEST_TEMPLATE_ T49>
-struct Templates49 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates48<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-      T43, T44, T45, T46, T47, T48, T49> Tail;
+template <typename Head_>
+struct Types<Head_> {
+  using Head = Head_;
+  using Tail = None;
 };
 
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
-    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48,
-    GTEST_TEMPLATE_ T49, GTEST_TEMPLATE_ T50>
-struct Templates50 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates49<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-      T43, T44, T45, T46, T47, T48, T49, T50> Tail;
+// Helper metafunctions to tell apart a single type from types
+// generated by ::testing::Types
+template <typename... Ts>
+struct ProxyTypeList {
+  using type = Types<Ts...>;
 };
 
+template <typename>
+struct is_proxy_type_list : std::false_type {};
 
-// We don't want to require the users to write TemplatesN<...> directly,
-// as that would require them to count the length.  Templates<...> is much
-// easier to write, but generates horrible messages when there is a
-// compiler error, as gcc insists on printing out each template
-// argument, even if it has the default value (this means Templates<list>
-// will appear as Templates<list, NoneT, NoneT, ..., NoneT> in the compiler
-// errors).
-//
-// Our solution is to combine the best part of the two approaches: a
-// user would write Templates<T1, ..., TN>, and Google Test will translate
-// that to TemplatesN<T1, ..., TN> internally to make error messages
-// readable.  The translation is done by the 'type' member of the
-// Templates template.
-template <GTEST_TEMPLATE_ T1 = NoneT, GTEST_TEMPLATE_ T2 = NoneT,
-    GTEST_TEMPLATE_ T3 = NoneT, GTEST_TEMPLATE_ T4 = NoneT,
-    GTEST_TEMPLATE_ T5 = NoneT, GTEST_TEMPLATE_ T6 = NoneT,
-    GTEST_TEMPLATE_ T7 = NoneT, GTEST_TEMPLATE_ T8 = NoneT,
-    GTEST_TEMPLATE_ T9 = NoneT, GTEST_TEMPLATE_ T10 = NoneT,
-    GTEST_TEMPLATE_ T11 = NoneT, GTEST_TEMPLATE_ T12 = NoneT,
-    GTEST_TEMPLATE_ T13 = NoneT, GTEST_TEMPLATE_ T14 = NoneT,
-    GTEST_TEMPLATE_ T15 = NoneT, GTEST_TEMPLATE_ T16 = NoneT,
-    GTEST_TEMPLATE_ T17 = NoneT, GTEST_TEMPLATE_ T18 = NoneT,
-    GTEST_TEMPLATE_ T19 = NoneT, GTEST_TEMPLATE_ T20 = NoneT,
-    GTEST_TEMPLATE_ T21 = NoneT, GTEST_TEMPLATE_ T22 = NoneT,
-    GTEST_TEMPLATE_ T23 = NoneT, GTEST_TEMPLATE_ T24 = NoneT,
-    GTEST_TEMPLATE_ T25 = NoneT, GTEST_TEMPLATE_ T26 = NoneT,
-    GTEST_TEMPLATE_ T27 = NoneT, GTEST_TEMPLATE_ T28 = NoneT,
-    GTEST_TEMPLATE_ T29 = NoneT, GTEST_TEMPLATE_ T30 = NoneT,
-    GTEST_TEMPLATE_ T31 = NoneT, GTEST_TEMPLATE_ T32 = NoneT,
-    GTEST_TEMPLATE_ T33 = NoneT, GTEST_TEMPLATE_ T34 = NoneT,
-    GTEST_TEMPLATE_ T35 = NoneT, GTEST_TEMPLATE_ T36 = NoneT,
-    GTEST_TEMPLATE_ T37 = NoneT, GTEST_TEMPLATE_ T38 = NoneT,
-    GTEST_TEMPLATE_ T39 = NoneT, GTEST_TEMPLATE_ T40 = NoneT,
-    GTEST_TEMPLATE_ T41 = NoneT, GTEST_TEMPLATE_ T42 = NoneT,
-    GTEST_TEMPLATE_ T43 = NoneT, GTEST_TEMPLATE_ T44 = NoneT,
-    GTEST_TEMPLATE_ T45 = NoneT, GTEST_TEMPLATE_ T46 = NoneT,
-    GTEST_TEMPLATE_ T47 = NoneT, GTEST_TEMPLATE_ T48 = NoneT,
-    GTEST_TEMPLATE_ T49 = NoneT, GTEST_TEMPLATE_ T50 = NoneT>
-struct Templates {
-  typedef Templates50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42, T43, T44, T45, T46, T47, T48, T49, T50> type;
-};
-
-template <>
-struct Templates<NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT> {
-  typedef Templates0 type;
-};
-template <GTEST_TEMPLATE_ T1>
-struct Templates<T1, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT> {
-  typedef Templates1<T1> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2>
-struct Templates<T1, T2, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT> {
-  typedef Templates2<T1, T2> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3>
-struct Templates<T1, T2, T3, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates3<T1, T2, T3> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4>
-struct Templates<T1, T2, T3, T4, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates4<T1, T2, T3, T4> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5>
-struct Templates<T1, T2, T3, T4, T5, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates5<T1, T2, T3, T4, T5> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6>
-struct Templates<T1, T2, T3, T4, T5, T6, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates6<T1, T2, T3, T4, T5, T6> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates7<T1, T2, T3, T4, T5, T6, T7> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates8<T1, T2, T3, T4, T5, T6, T7, T8> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates9<T1, T2, T3, T4, T5, T6, T7, T8, T9> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT> {
-  typedef Templates22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT> {
-  typedef Templates23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT> {
-  typedef Templates24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT> {
-  typedef Templates25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT> {
-  typedef Templates26<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT> {
-  typedef Templates27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT> {
-  typedef Templates28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT> {
-  typedef Templates29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates34<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates36<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates37<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates38<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates39<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates40<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates41<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates42<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates43<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42, T43> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates44<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42, T43, T44> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
-    T45, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42, T43, T44, T45> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
-    GTEST_TEMPLATE_ T46>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
-    T45, T46, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42, T43, T44, T45, T46> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
-    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
-    T45, T46, T47, NoneT, NoneT, NoneT> {
-  typedef Templates47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42, T43, T44, T45, T46, T47> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
-    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
-    T45, T46, T47, T48, NoneT, NoneT> {
-  typedef Templates48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42, T43, T44, T45, T46, T47, T48> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
-    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48,
-    GTEST_TEMPLATE_ T49>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
-    T45, T46, T47, T48, T49, NoneT> {
-  typedef Templates49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42, T43, T44, T45, T46, T47, T48, T49> type;
-};
-
-// The TypeList template makes it possible to use either a single type
-// or a Types<...> list in TYPED_TEST_SUITE() and
-// INSTANTIATE_TYPED_TEST_SUITE_P().
+template <typename... Ts>
+struct is_proxy_type_list<ProxyTypeList<Ts...>> : std::true_type {};
 
+// Generator which conditionally creates type lists.
+// It recognizes if a requested type list should be created
+// and prevents creating a new type list nested within another one.
 template <typename T>
-struct TypeList {
-  typedef Types1<T> type;
-};
+struct GenerateTypeList {
+ private:
+  using proxy = typename std::conditional<is_proxy_type_list<T>::value, T,
+                                          ProxyTypeList<T>>::type;
 
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47, typename T48, typename T49, typename T50>
-struct TypeList<Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-    T44, T45, T46, T47, T48, T49, T50> > {
-  typedef typename Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42, T43, T44, T45, T46, T47, T48, T49, T50>::type type;
+ public:
+  using type = typename proxy::type;
 };
 
-#endif  // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
-
 }  // namespace internal
+
+template <typename... Ts>
+using Types = internal::ProxyTypeList<Ts...>;
+
 }  // namespace testing
 
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
diff --git a/third_party/googletest/src/include/gtest/internal/gtest-type-util.h.pump b/third_party/googletest/src/include/gtest/internal/gtest-type-util.h.pump
deleted file mode 100644
index 5e31b7b320..0000000000
--- a/third_party/googletest/src/include/gtest/internal/gtest-type-util.h.pump
+++ /dev/null
@@ -1,302 +0,0 @@
-$$ -*- mode: c++; -*-
-$var n = 50  $$ Maximum length of type lists we want to support.
-// Copyright 2008 Google Inc.
-// All Rights Reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-// Type utilities needed for implementing typed and type-parameterized
-// tests.  This file is generated by a SCRIPT.  DO NOT EDIT BY HAND!
-//
-// Currently we support at most $n types in a list, and at most $n
-// type-parameterized tests in one type-parameterized test suite.
-// Please contact googletestframework@googlegroups.com if you need
-// more.
-
-// GOOGLETEST_CM0001 DO NOT DELETE
-
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
-
-#include "gtest/internal/gtest-port.h"
-
-// #ifdef __GNUC__ is too general here.  It is possible to use gcc without using
-// libstdc++ (which is where cxxabi.h comes from).
-# if GTEST_HAS_CXXABI_H_
-#  include <cxxabi.h>
-# elif defined(__HP_aCC)
-#  include <acxx_demangle.h>
-# endif  // GTEST_HASH_CXXABI_H_
-
-namespace testing {
-namespace internal {
-
-// Canonicalizes a given name with respect to the Standard C++ Library.
-// This handles removing the inline namespace within `std` that is
-// used by various standard libraries (e.g., `std::__1`).  Names outside
-// of namespace std are returned unmodified.
-inline std::string CanonicalizeForStdLibVersioning(std::string s) {
-  static const char prefix[] = "std::__";
-  if (s.compare(0, strlen(prefix), prefix) == 0) {
-    std::string::size_type end = s.find("::", strlen(prefix));
-    if (end != s.npos) {
-      // Erase everything between the initial `std` and the second `::`.
-      s.erase(strlen("std"), end - strlen("std"));
-    }
-  }
-  return s;
-}
-
-// GetTypeName<T>() returns a human-readable name of type T.
-// NB: This function is also used in Google Mock, so don't move it inside of
-// the typed-test-only section below.
-template <typename T>
-std::string GetTypeName() {
-# if GTEST_HAS_RTTI
-
-  const char* const name = typeid(T).name();
-#  if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC)
-  int status = 0;
-  // gcc's implementation of typeid(T).name() mangles the type name,
-  // so we have to demangle it.
-#   if GTEST_HAS_CXXABI_H_
-  using abi::__cxa_demangle;
-#   endif  // GTEST_HAS_CXXABI_H_
-  char* const readable_name = __cxa_demangle(name, nullptr, nullptr, &status);
-  const std::string name_str(status == 0 ? readable_name : name);
-  free(readable_name);
-  return CanonicalizeForStdLibVersioning(name_str);
-#  else
-  return name;
-#  endif  // GTEST_HAS_CXXABI_H_ || __HP_aCC
-
-# else
-
-  return "<type>";
-
-# endif  // GTEST_HAS_RTTI
-}
-
-#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
-
-// A unique type used as the default value for the arguments of class
-// template Types.  This allows us to simulate variadic templates
-// (e.g. Types<int>, Type<int, double>, and etc), which C++ doesn't
-// support directly.
-struct None {};
-
-// The following family of struct and struct templates are used to
-// represent type lists.  In particular, TypesN<T1, T2, ..., TN>
-// represents a type list with N types (T1, T2, ..., and TN) in it.
-// Except for Types0, every struct in the family has two member types:
-// Head for the first type in the list, and Tail for the rest of the
-// list.
-
-// The empty type list.
-struct Types0 {};
-
-// Type lists of length 1, 2, 3, and so on.
-
-template <typename T1>
-struct Types1 {
-  typedef T1 Head;
-  typedef Types0 Tail;
-};
-
-$range i 2..n
-
-$for i [[
-$range j 1..i
-$range k 2..i
-template <$for j, [[typename T$j]]>
-struct Types$i {
-  typedef T1 Head;
-  typedef Types$(i-1)<$for k, [[T$k]]> Tail;
-};
-
-
-]]
-
-}  // namespace internal
-
-// We don't want to require the users to write TypesN<...> directly,
-// as that would require them to count the length.  Types<...> is much
-// easier to write, but generates horrible messages when there is a
-// compiler error, as gcc insists on printing out each template
-// argument, even if it has the default value (this means Types<int>
-// will appear as Types<int, None, None, ..., None> in the compiler
-// errors).
-//
-// Our solution is to combine the best part of the two approaches: a
-// user would write Types<T1, ..., TN>, and Google Test will translate
-// that to TypesN<T1, ..., TN> internally to make error messages
-// readable.  The translation is done by the 'type' member of the
-// Types template.
-
-$range i 1..n
-template <$for i, [[typename T$i = internal::None]]>
-struct Types {
-  typedef internal::Types$n<$for i, [[T$i]]> type;
-};
-
-template <>
-struct Types<$for i, [[internal::None]]> {
-  typedef internal::Types0 type;
-};
-
-$range i 1..n-1
-$for i [[
-$range j 1..i
-$range k i+1..n
-template <$for j, [[typename T$j]]>
-struct Types<$for j, [[T$j]]$for k[[, internal::None]]> {
-  typedef internal::Types$i<$for j, [[T$j]]> type;
-};
-
-]]
-
-namespace internal {
-
-# define GTEST_TEMPLATE_ template <typename T> class
-
-// The template "selector" struct TemplateSel<Tmpl> is used to
-// represent Tmpl, which must be a class template with one type
-// parameter, as a type.  TemplateSel<Tmpl>::Bind<T>::type is defined
-// as the type Tmpl<T>.  This allows us to actually instantiate the
-// template "selected" by TemplateSel<Tmpl>.
-//
-// This trick is necessary for simulating typedef for class templates,
-// which C++ doesn't support directly.
-template <GTEST_TEMPLATE_ Tmpl>
-struct TemplateSel {
-  template <typename T>
-  struct Bind {
-    typedef Tmpl<T> type;
-  };
-};
-
-# define GTEST_BIND_(TmplSel, T) \
-  TmplSel::template Bind<T>::type
-
-// A unique struct template used as the default value for the
-// arguments of class template Templates.  This allows us to simulate
-// variadic templates (e.g. Templates<int>, Templates<int, double>,
-// and etc), which C++ doesn't support directly.
-template <typename T>
-struct NoneT {};
-
-// The following family of struct and struct templates are used to
-// represent template lists.  In particular, TemplatesN<T1, T2, ...,
-// TN> represents a list of N templates (T1, T2, ..., and TN).  Except
-// for Templates0, every struct in the family has two member types:
-// Head for the selector of the first template in the list, and Tail
-// for the rest of the list.
-
-// The empty template list.
-struct Templates0 {};
-
-// Template lists of length 1, 2, 3, and so on.
-
-template <GTEST_TEMPLATE_ T1>
-struct Templates1 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates0 Tail;
-};
-
-$range i 2..n
-
-$for i [[
-$range j 1..i
-$range k 2..i
-template <$for j, [[GTEST_TEMPLATE_ T$j]]>
-struct Templates$i {
-  typedef TemplateSel<T1> Head;
-  typedef Templates$(i-1)<$for k, [[T$k]]> Tail;
-};
-
-
-]]
-
-// We don't want to require the users to write TemplatesN<...> directly,
-// as that would require them to count the length.  Templates<...> is much
-// easier to write, but generates horrible messages when there is a
-// compiler error, as gcc insists on printing out each template
-// argument, even if it has the default value (this means Templates<list>
-// will appear as Templates<list, NoneT, NoneT, ..., NoneT> in the compiler
-// errors).
-//
-// Our solution is to combine the best part of the two approaches: a
-// user would write Templates<T1, ..., TN>, and Google Test will translate
-// that to TemplatesN<T1, ..., TN> internally to make error messages
-// readable.  The translation is done by the 'type' member of the
-// Templates template.
-
-$range i 1..n
-template <$for i, [[GTEST_TEMPLATE_ T$i = NoneT]]>
-struct Templates {
-  typedef Templates$n<$for i, [[T$i]]> type;
-};
-
-template <>
-struct Templates<$for i, [[NoneT]]> {
-  typedef Templates0 type;
-};
-
-$range i 1..n-1
-$for i [[
-$range j 1..i
-$range k i+1..n
-template <$for j, [[GTEST_TEMPLATE_ T$j]]>
-struct Templates<$for j, [[T$j]]$for k[[, NoneT]]> {
-  typedef Templates$i<$for j, [[T$j]]> type;
-};
-
-]]
-
-// The TypeList template makes it possible to use either a single type
-// or a Types<...> list in TYPED_TEST_SUITE() and
-// INSTANTIATE_TYPED_TEST_SUITE_P().
-
-template <typename T>
-struct TypeList {
-  typedef Types1<T> type;
-};
-
-
-$range i 1..n
-template <$for i, [[typename T$i]]>
-struct TypeList<Types<$for i, [[T$i]]> > {
-  typedef typename Types<$for i, [[T$i]]>::type type;
-};
-
-#endif  // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
-
-}  // namespace internal
-}  // namespace testing
-
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
diff --git a/third_party/googletest/src/src/gtest-all.cc b/third_party/googletest/src/src/gtest-all.cc
index ad292905cf..2a70ed88c7 100644
--- a/third_party/googletest/src/src/gtest-all.cc
+++ b/third_party/googletest/src/src/gtest-all.cc
@@ -38,7 +38,7 @@
 #include "gtest/gtest.h"
 
 // The following lines pull in the real gtest *.cc files.
-#include "src/gtest.cc"
+#include "src/gtest-assertion-result.cc"
 #include "src/gtest-death-test.cc"
 #include "src/gtest-filepath.cc"
 #include "src/gtest-matchers.cc"
@@ -46,3 +46,4 @@
 #include "src/gtest-printers.cc"
 #include "src/gtest-test-part.cc"
 #include "src/gtest-typed-test.cc"
+#include "src/gtest.cc"
diff --git a/third_party/googletest/src/src/gtest-assertion-result.cc b/third_party/googletest/src/src/gtest-assertion-result.cc
new file mode 100644
index 0000000000..f1c0b10dc9
--- /dev/null
+++ b/third_party/googletest/src/src/gtest-assertion-result.cc
@@ -0,0 +1,77 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This file defines the AssertionResult type.
+
+#include "gtest/gtest-assertion-result.h"
+
+#include <string>
+#include <utility>
+
+#include "gtest/gtest-message.h"
+
+namespace testing {
+
+// AssertionResult constructors.
+// Used in EXPECT_TRUE/FALSE(assertion_result).
+AssertionResult::AssertionResult(const AssertionResult& other)
+    : success_(other.success_),
+      message_(other.message_.get() != nullptr
+                   ? new ::std::string(*other.message_)
+                   : static_cast< ::std::string*>(nullptr)) {}
+
+// Swaps two AssertionResults.
+void AssertionResult::swap(AssertionResult& other) {
+  using std::swap;
+  swap(success_, other.success_);
+  swap(message_, other.message_);
+}
+
+// Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
+AssertionResult AssertionResult::operator!() const {
+  AssertionResult negation(!success_);
+  if (message_.get() != nullptr) negation << *message_;
+  return negation;
+}
+
+// Makes a successful assertion result.
+AssertionResult AssertionSuccess() { return AssertionResult(true); }
+
+// Makes a failed assertion result.
+AssertionResult AssertionFailure() { return AssertionResult(false); }
+
+// Makes a failed assertion result with the given failure message.
+// Deprecated; use AssertionFailure() << message.
+AssertionResult AssertionFailure(const Message& message) {
+  return AssertionFailure() << message;
+}
+
+}  // namespace testing
diff --git a/third_party/googletest/src/src/gtest-death-test.cc b/third_party/googletest/src/src/gtest-death-test.cc
index da09a1cfc2..e6abc6278a 100644
--- a/third_party/googletest/src/src/gtest-death-test.cc
+++ b/third_party/googletest/src/src/gtest-death-test.cc
@@ -32,51 +32,52 @@
 
 #include "gtest/gtest-death-test.h"
 
+#include <functional>
 #include <utility>
 
-#include "gtest/internal/gtest-port.h"
 #include "gtest/internal/custom/gtest.h"
+#include "gtest/internal/gtest-port.h"
 
 #if GTEST_HAS_DEATH_TEST
 
-# if GTEST_OS_MAC
-#  include <crt_externs.h>
-# endif  // GTEST_OS_MAC
-
-# include <errno.h>
-# include <fcntl.h>
-# include <limits.h>
-
-# if GTEST_OS_LINUX
-#  include <signal.h>
-# endif  // GTEST_OS_LINUX
-
-# include <stdarg.h>
-
-# if GTEST_OS_WINDOWS
-#  include <windows.h>
-# else
-#  include <sys/mman.h>
-#  include <sys/wait.h>
-# endif  // GTEST_OS_WINDOWS
-
-# if GTEST_OS_QNX
-#  include <spawn.h>
-# endif  // GTEST_OS_QNX
-
-# if GTEST_OS_FUCHSIA
-#  include <lib/fdio/fd.h>
-#  include <lib/fdio/io.h>
-#  include <lib/fdio/spawn.h>
-#  include <lib/zx/channel.h>
-#  include <lib/zx/port.h>
-#  include <lib/zx/process.h>
-#  include <lib/zx/socket.h>
-#  include <zircon/processargs.h>
-#  include <zircon/syscalls.h>
-#  include <zircon/syscalls/policy.h>
-#  include <zircon/syscalls/port.h>
-# endif  // GTEST_OS_FUCHSIA
+#if GTEST_OS_MAC
+#include <crt_externs.h>
+#endif  // GTEST_OS_MAC
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+
+#if GTEST_OS_LINUX
+#include <signal.h>
+#endif  // GTEST_OS_LINUX
+
+#include <stdarg.h>
+
+#if GTEST_OS_WINDOWS
+#include <windows.h>
+#else
+#include <sys/mman.h>
+#include <sys/wait.h>
+#endif  // GTEST_OS_WINDOWS
+
+#if GTEST_OS_QNX
+#include <spawn.h>
+#endif  // GTEST_OS_QNX
+
+#if GTEST_OS_FUCHSIA
+#include <lib/fdio/fd.h>
+#include <lib/fdio/io.h>
+#include <lib/fdio/spawn.h>
+#include <lib/zx/channel.h>
+#include <lib/zx/port.h>
+#include <lib/zx/process.h>
+#include <lib/zx/socket.h>
+#include <zircon/processargs.h>
+#include <zircon/syscalls.h>
+#include <zircon/syscalls/policy.h>
+#include <zircon/syscalls/port.h>
+#endif  // GTEST_OS_FUCHSIA
 
 #endif  // GTEST_HAS_DEATH_TEST
 
@@ -95,9 +96,12 @@ namespace testing {
 // used internally at Google, is "threadsafe".
 static const char kDefaultDeathTestStyle[] = GTEST_DEFAULT_DEATH_TEST_STYLE;
 
+}  // namespace testing
+
 GTEST_DEFINE_string_(
     death_test_style,
-    internal::StringFromGTestEnv("death_test_style", kDefaultDeathTestStyle),
+    testing::internal::StringFromGTestEnv("death_test_style",
+                                          testing::kDefaultDeathTestStyle),
     "Indicates how to run a death test in a forked child process: "
     "\"threadsafe\" (child process re-executes the test binary "
     "from the beginning, running only the specific death test) or "
@@ -106,7 +110,7 @@ GTEST_DEFINE_string_(
 
 GTEST_DEFINE_bool_(
     death_test_use_fork,
-    internal::BoolFromGTestEnv("death_test_use_fork", false),
+    testing::internal::BoolFromGTestEnv("death_test_use_fork", false),
     "Instructs to use fork()/_exit() instead of clone() in death tests. "
     "Ignored and always uses fork() on POSIX systems where clone() is not "
     "implemented. Useful when running under valgrind or similar tools if "
@@ -116,7 +120,6 @@ GTEST_DEFINE_bool_(
     "work in 99% of the cases. Once valgrind is fixed, this flag will "
     "most likely be removed.");
 
-namespace internal {
 GTEST_DEFINE_string_(
     internal_run_death_test, "",
     "Indicates the file, line number, temporal index of "
@@ -125,7 +128,8 @@ GTEST_DEFINE_string_(
     "the '|' characters.  This flag is specified if and only if the "
     "current process is a sub-process launched for running a thread-safe "
     "death test.  FOR INTERNAL USE ONLY.");
-}  // namespace internal
+
+namespace testing {
 
 #if GTEST_HAS_DEATH_TEST
 
@@ -133,9 +137,9 @@ namespace internal {
 
 // Valid only for fast death tests. Indicates the code is running in the
 // child process of a fast style death test.
-# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
 static bool g_in_fast_death_test_child = false;
-# endif
+#endif
 
 // Returns a Boolean value indicating whether the caller is currently
 // executing in the context of the death test child process.  Tools such as
@@ -143,16 +147,16 @@ static bool g_in_fast_death_test_child = false;
 // tests.  IMPORTANT: This is an internal utility.  Using it may break the
 // implementation of death tests.  User code MUST NOT use it.
 bool InDeathTestChild() {
-# if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
 
   // On Windows and Fuchsia, death tests are thread-safe regardless of the value
   // of the death_test_style flag.
-  return !GTEST_FLAG(internal_run_death_test).empty();
+  return !GTEST_FLAG_GET(internal_run_death_test).empty();
 
-# else
+#else
 
-  if (GTEST_FLAG(death_test_style) == "threadsafe")
-    return !GTEST_FLAG(internal_run_death_test).empty();
+  if (GTEST_FLAG_GET(death_test_style) == "threadsafe")
+    return !GTEST_FLAG_GET(internal_run_death_test).empty();
   else
     return g_in_fast_death_test_child;
 #endif
@@ -161,40 +165,38 @@ bool InDeathTestChild() {
 }  // namespace internal
 
 // ExitedWithCode constructor.
-ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) {
-}
+ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) {}
 
 // ExitedWithCode function-call operator.
 bool ExitedWithCode::operator()(int exit_status) const {
-# if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
 
   return exit_status == exit_code_;
 
-# else
+#else
 
   return WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == exit_code_;
 
-# endif  // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+#endif  // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
 }
 
-# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
 // KilledBySignal constructor.
-KilledBySignal::KilledBySignal(int signum) : signum_(signum) {
-}
+KilledBySignal::KilledBySignal(int signum) : signum_(signum) {}
 
 // KilledBySignal function-call operator.
 bool KilledBySignal::operator()(int exit_status) const {
-#  if defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_)
+#if defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_)
   {
     bool result;
     if (GTEST_KILLED_BY_SIGNAL_OVERRIDE_(signum_, exit_status, &result)) {
       return result;
     }
   }
-#  endif  // defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_)
+#endif  // defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_)
   return WIFSIGNALED(exit_status) && WTERMSIG(exit_status) == signum_;
 }
-# endif  // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+#endif  // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
 
 namespace internal {
 
@@ -205,23 +207,23 @@ namespace internal {
 static std::string ExitSummary(int exit_code) {
   Message m;
 
-# if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
 
   m << "Exited with exit status " << exit_code;
 
-# else
+#else
 
   if (WIFEXITED(exit_code)) {
     m << "Exited with exit status " << WEXITSTATUS(exit_code);
   } else if (WIFSIGNALED(exit_code)) {
     m << "Terminated by signal " << WTERMSIG(exit_code);
   }
-#  ifdef WCOREDUMP
+#ifdef WCOREDUMP
   if (WCOREDUMP(exit_code)) {
     m << " (core dumped)";
   }
-#  endif
-# endif  // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+#endif
+#endif  // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
 
   return m.GetString();
 }
@@ -232,7 +234,7 @@ bool ExitedUnsuccessfully(int exit_status) {
   return !ExitedWithCode(0)(exit_status);
 }
 
-# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
 // Generates a textual failure message when a death test finds more than
 // one thread running, or cannot determine the number of threads, prior
 // to executing the given statement.  It is the responsibility of the
@@ -247,13 +249,13 @@ static std::string DeathTestThreadWarning(size_t thread_count) {
     msg << "detected " << thread_count << " threads.";
   }
   msg << " See "
-         "https://github.com/google/googletest/blob/master/googletest/docs/"
+         "https://github.com/google/googletest/blob/master/docs/"
          "advanced.md#death-tests-and-threads"
       << " for more explanation and suggested solutions, especially if"
       << " this is the last message you see before your test times out.";
   return msg.GetString();
 }
-# endif  // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+#endif  // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
 
 // Flag characters for reporting a death test that did not die.
 static const char kDeathTestLived = 'L';
@@ -303,14 +305,14 @@ static void DeathTestAbort(const std::string& message) {
 
 // A replacement for CHECK that calls DeathTestAbort if the assertion
 // fails.
-# define GTEST_DEATH_TEST_CHECK_(expression) \
-  do { \
-    if (!::testing::internal::IsTrue(expression)) { \
-      DeathTestAbort( \
-          ::std::string("CHECK failed: File ") + __FILE__ +  ", line " \
-          + ::testing::internal::StreamableToString(__LINE__) + ": " \
-          + #expression); \
-    } \
+#define GTEST_DEATH_TEST_CHECK_(expression)                              \
+  do {                                                                   \
+    if (!::testing::internal::IsTrue(expression)) {                      \
+      DeathTestAbort(::std::string("CHECK failed: File ") + __FILE__ +   \
+                     ", line " +                                         \
+                     ::testing::internal::StreamableToString(__LINE__) + \
+                     ": " + #expression);                                \
+    }                                                                    \
   } while (::testing::internal::AlwaysFalse())
 
 // This macro is similar to GTEST_DEATH_TEST_CHECK_, but it is meant for
@@ -320,23 +322,23 @@ static void DeathTestAbort(const std::string& message) {
 // evaluates the expression as long as it evaluates to -1 and sets
 // errno to EINTR.  If the expression evaluates to -1 but errno is
 // something other than EINTR, DeathTestAbort is called.
-# define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \
-  do { \
-    int gtest_retval; \
-    do { \
-      gtest_retval = (expression); \
-    } while (gtest_retval == -1 && errno == EINTR); \
-    if (gtest_retval == -1) { \
-      DeathTestAbort( \
-          ::std::string("CHECK failed: File ") + __FILE__ + ", line " \
-          + ::testing::internal::StreamableToString(__LINE__) + ": " \
-          + #expression + " != -1"); \
-    } \
+#define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression)                      \
+  do {                                                                   \
+    int gtest_retval;                                                    \
+    do {                                                                 \
+      gtest_retval = (expression);                                       \
+    } while (gtest_retval == -1 && errno == EINTR);                      \
+    if (gtest_retval == -1) {                                            \
+      DeathTestAbort(::std::string("CHECK failed: File ") + __FILE__ +   \
+                     ", line " +                                         \
+                     ::testing::internal::StreamableToString(__LINE__) + \
+                     ": " + #expression + " != -1");                     \
+    }                                                                    \
   } while (::testing::internal::AlwaysFalse())
 
 // Returns the message describing the last system error in errno.
 std::string GetLastErrnoDescription() {
-    return errno == 0 ? "" : posix::StrError(errno);
+  return errno == 0 ? "" : posix::StrError(errno);
 }
 
 // This is called from a death test parent process to read a failure
@@ -369,8 +371,9 @@ static void FailFromInternalError(int fd) {
 DeathTest::DeathTest() {
   TestInfo* const info = GetUnitTestImpl()->current_test_info();
   if (info == nullptr) {
-    DeathTestAbort("Cannot run a death test outside of a TEST or "
-                   "TEST_F construct");
+    DeathTestAbort(
+        "Cannot run a death test outside of a TEST or "
+        "TEST_F construct");
   }
 }
 
@@ -499,9 +502,7 @@ void DeathTestImpl::ReadAndInterpretStatusByte() {
   set_read_fd(-1);
 }
 
-std::string DeathTestImpl::GetErrorLogs() {
-  return GetCapturedStderr();
-}
+std::string DeathTestImpl::GetErrorLogs() { return GetCapturedStderr(); }
 
 // Signals that the death test code which should have exited, didn't.
 // Should be called only in a death test child process.
@@ -511,9 +512,9 @@ void DeathTestImpl::Abort(AbortReason reason) {
   // The parent process considers the death test to be a failure if
   // it finds any data in our pipe.  So, here we write a single flag byte
   // to the pipe, then exit.
-  const char status_ch =
-      reason == TEST_DID_NOT_DIE ? kDeathTestLived :
-      reason == TEST_THREW_EXCEPTION ? kDeathTestThrew : kDeathTestReturned;
+  const char status_ch = reason == TEST_DID_NOT_DIE       ? kDeathTestLived
+                         : reason == TEST_THREW_EXCEPTION ? kDeathTestThrew
+                                                          : kDeathTestReturned;
 
   GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Write(write_fd(), &status_ch, 1));
   // We are leaking the descriptor here because on some platforms (i.e.,
@@ -532,7 +533,7 @@ void DeathTestImpl::Abort(AbortReason reason) {
 // much easier.
 static ::std::string FormatDeathTestOutput(const ::std::string& output) {
   ::std::string ret;
-  for (size_t at = 0; ; ) {
+  for (size_t at = 0;;) {
     const size_t line_end = output.find('\n', at);
     ret += "[  DEATH   ] ";
     if (line_end == ::std::string::npos) {
@@ -567,8 +568,7 @@ static ::std::string FormatDeathTestOutput(const ::std::string& output) {
 // the first failing condition, in the order given above, is the one that is
 // reported. Also sets the last death test message string.
 bool DeathTestImpl::Passed(bool status_ok) {
-  if (!spawned())
-    return false;
+  if (!spawned()) return false;
 
   const std::string error_message = GetErrorLogs();
 
@@ -579,15 +579,18 @@ bool DeathTestImpl::Passed(bool status_ok) {
   switch (outcome()) {
     case LIVED:
       buffer << "    Result: failed to die.\n"
-             << " Error msg:\n" << FormatDeathTestOutput(error_message);
+             << " Error msg:\n"
+             << FormatDeathTestOutput(error_message);
       break;
     case THREW:
       buffer << "    Result: threw an exception.\n"
-             << " Error msg:\n" << FormatDeathTestOutput(error_message);
+             << " Error msg:\n"
+             << FormatDeathTestOutput(error_message);
       break;
     case RETURNED:
       buffer << "    Result: illegal return in test statement.\n"
-             << " Error msg:\n" << FormatDeathTestOutput(error_message);
+             << " Error msg:\n"
+             << FormatDeathTestOutput(error_message);
       break;
     case DIED:
       if (status_ok) {
@@ -604,7 +607,8 @@ bool DeathTestImpl::Passed(bool status_ok) {
       } else {
         buffer << "    Result: died but not with expected exit code:\n"
                << "            " << ExitSummary(status()) << "\n"
-               << "Actual msg:\n" << FormatDeathTestOutput(error_message);
+               << "Actual msg:\n"
+               << FormatDeathTestOutput(error_message);
       }
       break;
     case IN_PROGRESS:
@@ -617,7 +621,7 @@ bool DeathTestImpl::Passed(bool status_ok) {
   return success;
 }
 
-# if GTEST_OS_WINDOWS
+#if GTEST_OS_WINDOWS
 // WindowsDeathTest implements death tests on Windows. Due to the
 // specifics of starting new processes on Windows, death tests there are
 // always threadsafe, and Google Test considers the
@@ -678,14 +682,12 @@ class WindowsDeathTest : public DeathTestImpl {
 // status, or 0 if no child process exists.  As a side effect, sets the
 // outcome data member.
 int WindowsDeathTest::Wait() {
-  if (!spawned())
-    return 0;
+  if (!spawned()) return 0;
 
   // Wait until the child either signals that it has acquired the write end
   // of the pipe or it dies.
-  const HANDLE wait_handles[2] = { child_handle_.Get(), event_handle_.Get() };
-  switch (::WaitForMultipleObjects(2,
-                                   wait_handles,
+  const HANDLE wait_handles[2] = {child_handle_.Get(), event_handle_.Get()};
+  switch (::WaitForMultipleObjects(2, wait_handles,
                                    FALSE,  // Waits for any of the handles.
                                    INFINITE)) {
     case WAIT_OBJECT_0:
@@ -706,9 +708,8 @@ int WindowsDeathTest::Wait() {
   // returns immediately if the child has already exited, regardless of
   // whether previous calls to WaitForMultipleObjects synchronized on this
   // handle or not.
-  GTEST_DEATH_TEST_CHECK_(
-      WAIT_OBJECT_0 == ::WaitForSingleObject(child_handle_.Get(),
-                                             INFINITE));
+  GTEST_DEATH_TEST_CHECK_(WAIT_OBJECT_0 ==
+                          ::WaitForSingleObject(child_handle_.Get(), INFINITE));
   DWORD status_code;
   GTEST_DEATH_TEST_CHECK_(
       ::GetExitCodeProcess(child_handle_.Get(), &status_code) != FALSE);
@@ -741,12 +742,12 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() {
   SECURITY_ATTRIBUTES handles_are_inheritable = {sizeof(SECURITY_ATTRIBUTES),
                                                  nullptr, TRUE};
   HANDLE read_handle, write_handle;
-  GTEST_DEATH_TEST_CHECK_(
-      ::CreatePipe(&read_handle, &write_handle, &handles_are_inheritable,
-                   0)  // Default buffer size.
-      != FALSE);
-  set_read_fd(::_open_osfhandle(reinterpret_cast<intptr_t>(read_handle),
-                                O_RDONLY));
+  GTEST_DEATH_TEST_CHECK_(::CreatePipe(&read_handle, &write_handle,
+                                       &handles_are_inheritable,
+                                       0)  // Default buffer size.
+                          != FALSE);
+  set_read_fd(
+      ::_open_osfhandle(reinterpret_cast<intptr_t>(read_handle), O_RDONLY));
   write_handle_.Reset(write_handle);
   event_handle_.Reset(::CreateEvent(
       &handles_are_inheritable,
@@ -755,27 +756,26 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() {
       nullptr));  // The even is unnamed.
   GTEST_DEATH_TEST_CHECK_(event_handle_.Get() != nullptr);
   const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
-                                  kFilterFlag + "=" + info->test_suite_name() +
-                                  "." + info->name();
+                                  "filter=" + info->test_suite_name() + "." +
+                                  info->name();
   const std::string internal_flag =
-      std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag +
-      "=" + file_ + "|" + StreamableToString(line_) + "|" +
-      StreamableToString(death_test_index) + "|" +
+      std::string("--") + GTEST_FLAG_PREFIX_ +
+      "internal_run_death_test=" + file_ + "|" + StreamableToString(line_) +
+      "|" + StreamableToString(death_test_index) + "|" +
       StreamableToString(static_cast<unsigned int>(::GetCurrentProcessId())) +
       // size_t has the same width as pointers on both 32-bit and 64-bit
       // Windows platforms.
       // See http://msdn.microsoft.com/en-us/library/tcxf1dw6.aspx.
-      "|" + StreamableToString(reinterpret_cast<size_t>(write_handle)) +
-      "|" + StreamableToString(reinterpret_cast<size_t>(event_handle_.Get()));
+      "|" + StreamableToString(reinterpret_cast<size_t>(write_handle)) + "|" +
+      StreamableToString(reinterpret_cast<size_t>(event_handle_.Get()));
 
   char executable_path[_MAX_PATH + 1];  // NOLINT
   GTEST_DEATH_TEST_CHECK_(_MAX_PATH + 1 != ::GetModuleFileNameA(nullptr,
                                                                 executable_path,
                                                                 _MAX_PATH));
 
-  std::string command_line =
-      std::string(::GetCommandLineA()) + " " + filter_flag + " \"" +
-      internal_flag + "\"";
+  std::string command_line = std::string(::GetCommandLineA()) + " " +
+                             filter_flag + " \"" + internal_flag + "\"";
 
   DeathTest::set_last_death_test_message("");
 
@@ -795,8 +795,8 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() {
   GTEST_DEATH_TEST_CHECK_(
       ::CreateProcessA(
           executable_path, const_cast<char*>(command_line.c_str()),
-          nullptr,  // Retuned process handle is not inheritable.
-          nullptr,  // Retuned thread handle is not inheritable.
+          nullptr,  // Returned process handle is not inheritable.
+          nullptr,  // Returned thread handle is not inheritable.
           TRUE,  // Child inherits all inheritable handles (for write_handle_).
           0x0,   // Default creation flags.
           nullptr,  // Inherit the parent's environment.
@@ -808,7 +808,7 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() {
   return OVERSEE_TEST;
 }
 
-# elif GTEST_OS_FUCHSIA
+#elif GTEST_OS_FUCHSIA
 
 class FuchsiaDeathTest : public DeathTestImpl {
  public:
@@ -854,18 +854,13 @@ class Arguments {
   template <typename Str>
   void AddArguments(const ::std::vector<Str>& arguments) {
     for (typename ::std::vector<Str>::const_iterator i = arguments.begin();
-         i != arguments.end();
-         ++i) {
+         i != arguments.end(); ++i) {
       args_.insert(args_.end() - 1, posix::StrDup(i->c_str()));
     }
   }
-  char* const* Argv() {
-    return &args_[0];
-  }
+  char* const* Argv() { return &args_[0]; }
 
-  int size() {
-    return args_.size() - 1;
-  }
+  int size() { return static_cast<int>(args_.size()) - 1; }
 
  private:
   std::vector<char*> args_;
@@ -879,8 +874,7 @@ int FuchsiaDeathTest::Wait() {
   const int kSocketKey = 1;
   const int kExceptionKey = 2;
 
-  if (!spawned())
-    return 0;
+  if (!spawned()) return 0;
 
   // Create a port to wait for socket/task/exception events.
   zx_status_t status_zx;
@@ -889,19 +883,18 @@ int FuchsiaDeathTest::Wait() {
   GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
 
   // Register to wait for the child process to terminate.
-  status_zx = child_process_.wait_async(
-      port, kProcessKey, ZX_PROCESS_TERMINATED, ZX_WAIT_ASYNC_ONCE);
+  status_zx =
+      child_process_.wait_async(port, kProcessKey, ZX_PROCESS_TERMINATED, 0);
   GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
 
   // Register to wait for the socket to be readable or closed.
   status_zx = stderr_socket_.wait_async(
-      port, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED,
-      ZX_WAIT_ASYNC_ONCE);
+      port, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED, 0);
   GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
 
   // Register to wait for an exception.
-  status_zx = exception_channel_.wait_async(
-      port, kExceptionKey, ZX_CHANNEL_READABLE, ZX_WAIT_ASYNC_ONCE);
+  status_zx = exception_channel_.wait_async(port, kExceptionKey,
+                                            ZX_CHANNEL_READABLE, 0);
   GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
 
   bool process_terminated = false;
@@ -931,9 +924,9 @@ int FuchsiaDeathTest::Wait() {
           size_t old_length = captured_stderr_.length();
           size_t bytes_read = 0;
           captured_stderr_.resize(old_length + kBufferSize);
-          status_zx = stderr_socket_.read(
-              0, &captured_stderr_.front() + old_length, kBufferSize,
-              &bytes_read);
+          status_zx =
+              stderr_socket_.read(0, &captured_stderr_.front() + old_length,
+                                  kBufferSize, &bytes_read);
           captured_stderr_.resize(old_length + bytes_read);
         } while (status_zx == ZX_OK);
         if (status_zx == ZX_ERR_PEER_CLOSED) {
@@ -941,8 +934,7 @@ int FuchsiaDeathTest::Wait() {
         } else {
           GTEST_DEATH_TEST_CHECK_(status_zx == ZX_ERR_SHOULD_WAIT);
           status_zx = stderr_socket_.wait_async(
-              port, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED,
-              ZX_WAIT_ASYNC_ONCE);
+              port, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED, 0);
           GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
         }
       } else {
@@ -955,12 +947,12 @@ int FuchsiaDeathTest::Wait() {
   ReadAndInterpretStatusByte();
 
   zx_info_process_t buffer;
-  status_zx = child_process_.get_info(
-      ZX_INFO_PROCESS, &buffer, sizeof(buffer), nullptr, nullptr);
+  status_zx = child_process_.get_info(ZX_INFO_PROCESS, &buffer, sizeof(buffer),
+                                      nullptr, nullptr);
   GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
 
-  GTEST_DEATH_TEST_CHECK_(buffer.exited);
-  set_status(buffer.return_code);
+  GTEST_DEATH_TEST_CHECK_(buffer.flags & ZX_INFO_PROCESS_FLAG_EXITED);
+  set_status(static_cast<int>(buffer.return_code));
   return status();
 }
 
@@ -988,13 +980,12 @@ DeathTest::TestRole FuchsiaDeathTest::AssumeRole() {
 
   // Build the child process command line.
   const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
-                                  kFilterFlag + "=" + info->test_suite_name() +
-                                  "." + info->name();
-  const std::string internal_flag =
-      std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "="
-      + file_ + "|"
-      + StreamableToString(line_) + "|"
-      + StreamableToString(death_test_index);
+                                  "filter=" + info->test_suite_name() + "." +
+                                  info->name();
+  const std::string internal_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
+                                    kInternalRunDeathTestFlag + "=" + file_ +
+                                    "|" + StreamableToString(line_) + "|" +
+                                    StreamableToString(death_test_index);
   Arguments args;
   args.AddArguments(GetInjectableArgvs());
   args.AddArgument(filter_flag.c_str());
@@ -1017,8 +1008,7 @@ DeathTest::TestRole FuchsiaDeathTest::AssumeRole() {
 
   // Create a socket pair will be used to receive the child process' stderr.
   zx::socket stderr_producer_socket;
-  status =
-      zx::socket::create(0, &stderr_producer_socket, &stderr_socket_);
+  status = zx::socket::create(0, &stderr_producer_socket, &stderr_socket_);
   GTEST_DEATH_TEST_CHECK_(status >= 0);
   int stderr_producer_fd = -1;
   status =
@@ -1035,35 +1025,32 @@ DeathTest::TestRole FuchsiaDeathTest::AssumeRole() {
 
   // Create a child job.
   zx_handle_t child_job = ZX_HANDLE_INVALID;
-  status = zx_job_create(zx_job_default(), 0, & child_job);
+  status = zx_job_create(zx_job_default(), 0, &child_job);
   GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
   zx_policy_basic_t policy;
   policy.condition = ZX_POL_NEW_ANY;
   policy.policy = ZX_POL_ACTION_ALLOW;
-  status = zx_job_set_policy(
-      child_job, ZX_JOB_POL_RELATIVE, ZX_JOB_POL_BASIC, &policy, 1);
+  status = zx_job_set_policy(child_job, ZX_JOB_POL_RELATIVE, ZX_JOB_POL_BASIC,
+                             &policy, 1);
   GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
 
   // Create an exception channel attached to the |child_job|, to allow
   // us to suppress the system default exception handler from firing.
-  status =
-      zx_task_create_exception_channel(
-          child_job, 0, exception_channel_.reset_and_get_address());
+  status = zx_task_create_exception_channel(
+      child_job, 0, exception_channel_.reset_and_get_address());
   GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
 
   // Spawn the child process.
-  status = fdio_spawn_etc(
-      child_job, FDIO_SPAWN_CLONE_ALL, args.Argv()[0], args.Argv(), nullptr,
-      2, spawn_actions, child_process_.reset_and_get_address(), nullptr);
+  status = fdio_spawn_etc(child_job, FDIO_SPAWN_CLONE_ALL, args.Argv()[0],
+                          args.Argv(), nullptr, 2, spawn_actions,
+                          child_process_.reset_and_get_address(), nullptr);
   GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
 
   set_spawned(true);
   return OVERSEE_TEST;
 }
 
-std::string FuchsiaDeathTest::GetErrorLogs() {
-  return captured_stderr_;
-}
+std::string FuchsiaDeathTest::GetErrorLogs() { return captured_stderr_; }
 
 #else  // We are neither on Windows, nor on Fuchsia.
 
@@ -1094,8 +1081,7 @@ ForkingDeathTest::ForkingDeathTest(const char* a_statement,
 // status, or 0 if no child process exists.  As a side effect, sets the
 // outcome data member.
 int ForkingDeathTest::Wait() {
-  if (!spawned())
-    return 0;
+  if (!spawned()) return 0;
 
   ReadAndInterpretStatusByte();
 
@@ -1174,11 +1160,11 @@ class ExecDeathTest : public ForkingDeathTest {
  private:
   static ::std::vector<std::string> GetArgvsForDeathTestChildProcess() {
     ::std::vector<std::string> args = GetInjectableArgvs();
-#  if defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
+#if defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
     ::std::vector<std::string> extra_args =
         GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_();
     args.insert(args.end(), extra_args.begin(), extra_args.end());
-#  endif  // defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
+#endif  // defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
     return args;
   }
   // The name of the file in which the death test is located.
@@ -1205,14 +1191,11 @@ class Arguments {
   template <typename Str>
   void AddArguments(const ::std::vector<Str>& arguments) {
     for (typename ::std::vector<Str>::const_iterator i = arguments.begin();
-         i != arguments.end();
-         ++i) {
+         i != arguments.end(); ++i) {
       args_.insert(args_.end() - 1, posix::StrDup(i->c_str()));
     }
   }
-  char* const* Argv() {
-    return &args_[0];
-  }
+  char* const* Argv() { return &args_[0]; }
 
  private:
   std::vector<char*> args_;
@@ -1225,21 +1208,9 @@ struct ExecDeathTestArgs {
   int close_fd;       // File descriptor to close; the read end of a pipe
 };
 
-#  if GTEST_OS_MAC
-inline char** GetEnviron() {
-  // When Google Test is built as a framework on MacOS X, the environ variable
-  // is unavailable. Apple's documentation (man environ) recommends using
-  // _NSGetEnviron() instead.
-  return *_NSGetEnviron();
-}
-#  else
-// Some POSIX platforms expect you to declare environ. extern "C" makes
-// it reside in the global namespace.
+#if GTEST_OS_QNX
 extern "C" char** environ;
-inline char** GetEnviron() { return environ; }
-#  endif  // GTEST_OS_MAC
-
-#  if !GTEST_OS_QNX
+#else   // GTEST_OS_QNX
 // The main function for a threadsafe-style death test child process.
 // This function is called in a clone()-ed process and thus must avoid
 // any potentially unsafe operations like malloc or libc functions.
@@ -1254,25 +1225,24 @@ static int ExecDeathTestChildMain(void* child_arg) {
       UnitTest::GetInstance()->original_working_dir();
   // We can safely call chdir() as it's a direct system call.
   if (chdir(original_dir) != 0) {
-    DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " +
-                   GetLastErrnoDescription());
+    DeathTestAbort(std::string("chdir(\"") + original_dir +
+                   "\") failed: " + GetLastErrnoDescription());
     return EXIT_FAILURE;
   }
 
-  // We can safely call execve() as it's a direct system call.  We
+  // We can safely call execv() as it's almost a direct system call. We
   // cannot use execvp() as it's a libc function and thus potentially
-  // unsafe.  Since execve() doesn't search the PATH, the user must
+  // unsafe.  Since execv() doesn't search the PATH, the user must
   // invoke the test program via a valid path that contains at least
   // one path separator.
-  execve(args->argv[0], args->argv, GetEnviron());
-  DeathTestAbort(std::string("execve(") + args->argv[0] + ", ...) in " +
-                 original_dir + " failed: " +
-                 GetLastErrnoDescription());
+  execv(args->argv[0], args->argv);
+  DeathTestAbort(std::string("execv(") + args->argv[0] + ", ...) in " +
+                 original_dir + " failed: " + GetLastErrnoDescription());
   return EXIT_FAILURE;
 }
-#  endif  // !GTEST_OS_QNX
+#endif  // GTEST_OS_QNX
 
-#  if GTEST_HAS_CLONE
+#if GTEST_HAS_CLONE
 // Two utility routines that together determine the direction the stack
 // grows.
 // This could be accomplished more elegantly by a single recursive
@@ -1284,24 +1254,29 @@ static int ExecDeathTestChildMain(void* child_arg) {
 // correct answer.
 static void StackLowerThanAddress(const void* ptr,
                                   bool* result) GTEST_NO_INLINE_;
+// Make sure sanitizers do not tamper with the stack here.
+// Ideally, we want to use `__builtin_frame_address` instead of a local variable
+// address with sanitizer disabled, but it does not work when the
+// compiler optimizes the stack frame out, which happens on PowerPC targets.
 // HWAddressSanitizer add a random tag to the MSB of the local variable address,
 // making comparison result unpredictable.
+GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
 GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
 static void StackLowerThanAddress(const void* ptr, bool* result) {
-  int dummy;
-  *result = (&dummy < ptr);
+  int dummy = 0;
+  *result = std::less<const void*>()(&dummy, ptr);
 }
 
 // Make sure AddressSanitizer does not tamper with the stack here.
 GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
 GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
 static bool StackGrowsDown() {
-  int dummy;
+  int dummy = 0;
   bool result;
   StackLowerThanAddress(&dummy, &result);
   return result;
 }
-#  endif  // GTEST_HAS_CLONE
+#endif  // GTEST_HAS_CLONE
 
 // Spawns a child process with the same executable as the current process in
 // a thread-safe manner and instructs it to run the death test.  The
@@ -1311,10 +1286,10 @@ static bool StackGrowsDown() {
 // spawn(2) there instead.  The function dies with an error message if
 // anything goes wrong.
 static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) {
-  ExecDeathTestArgs args = { argv, close_fd };
+  ExecDeathTestArgs args = {argv, close_fd};
   pid_t child_pid = -1;
 
-#  if GTEST_OS_QNX
+#if GTEST_OS_QNX
   // Obtains the current directory and sets it to be closed in the child
   // process.
   const int cwd_fd = open(".", O_RDONLY);
@@ -1327,26 +1302,25 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) {
       UnitTest::GetInstance()->original_working_dir();
   // We can safely call chdir() as it's a direct system call.
   if (chdir(original_dir) != 0) {
-    DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " +
-                   GetLastErrnoDescription());
+    DeathTestAbort(std::string("chdir(\"") + original_dir +
+                   "\") failed: " + GetLastErrnoDescription());
     return EXIT_FAILURE;
   }
 
   int fd_flags;
   // Set close_fd to be closed after spawn.
   GTEST_DEATH_TEST_CHECK_SYSCALL_(fd_flags = fcntl(close_fd, F_GETFD));
-  GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(close_fd, F_SETFD,
-                                        fd_flags | FD_CLOEXEC));
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(
+      fcntl(close_fd, F_SETFD, fd_flags | FD_CLOEXEC));
   struct inheritance inherit = {0};
   // spawn is a system call.
-  child_pid =
-      spawn(args.argv[0], 0, nullptr, &inherit, args.argv, GetEnviron());
+  child_pid = spawn(args.argv[0], 0, nullptr, &inherit, args.argv, environ);
   // Restores the current working directory.
   GTEST_DEATH_TEST_CHECK_(fchdir(cwd_fd) != -1);
   GTEST_DEATH_TEST_CHECK_SYSCALL_(close(cwd_fd));
 
-#  else   // GTEST_OS_QNX
-#   if GTEST_OS_LINUX
+#else  // GTEST_OS_QNX
+#if GTEST_OS_LINUX
   // When a SIGPROF signal is received while fork() or clone() are executing,
   // the process may hang. To avoid this, we ignore SIGPROF here and re-enable
   // it after the call to fork()/clone() is complete.
@@ -1355,16 +1329,16 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) {
   memset(&ignore_sigprof_action, 0, sizeof(ignore_sigprof_action));
   sigemptyset(&ignore_sigprof_action.sa_mask);
   ignore_sigprof_action.sa_handler = SIG_IGN;
-  GTEST_DEATH_TEST_CHECK_SYSCALL_(sigaction(
-      SIGPROF, &ignore_sigprof_action, &saved_sigprof_action));
-#   endif  // GTEST_OS_LINUX
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(
+      sigaction(SIGPROF, &ignore_sigprof_action, &saved_sigprof_action));
+#endif  // GTEST_OS_LINUX
 
-#   if GTEST_HAS_CLONE
-  const bool use_fork = GTEST_FLAG(death_test_use_fork);
+#if GTEST_HAS_CLONE
+  const bool use_fork = GTEST_FLAG_GET(death_test_use_fork);
 
   if (!use_fork) {
     static const bool stack_grows_down = StackGrowsDown();
-    const auto stack_size = static_cast<size_t>(getpagesize());
+    const auto stack_size = static_cast<size_t>(getpagesize() * 2);
     // MMAP_ANONYMOUS is not defined on Mac, so we use MAP_ANON instead.
     void* const stack = mmap(nullptr, stack_size, PROT_READ | PROT_WRITE,
                              MAP_ANON | MAP_PRIVATE, -1, 0);
@@ -1379,7 +1353,7 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) {
     const size_t kMaxStackAlignment = 64;
     void* const stack_top =
         static_cast<char*>(stack) +
-            (stack_grows_down ? stack_size - kMaxStackAlignment : 0);
+        (stack_grows_down ? stack_size - kMaxStackAlignment : 0);
     GTEST_DEATH_TEST_CHECK_(
         static_cast<size_t>(stack_size) > kMaxStackAlignment &&
         reinterpret_cast<uintptr_t>(stack_top) % kMaxStackAlignment == 0);
@@ -1388,19 +1362,19 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) {
 
     GTEST_DEATH_TEST_CHECK_(munmap(stack, stack_size) != -1);
   }
-#   else
+#else
   const bool use_fork = true;
-#   endif  // GTEST_HAS_CLONE
+#endif  // GTEST_HAS_CLONE
 
   if (use_fork && (child_pid = fork()) == 0) {
-      ExecDeathTestChildMain(&args);
-      _exit(0);
+    ExecDeathTestChildMain(&args);
+    _exit(0);
   }
-#  endif  // GTEST_OS_QNX
-#  if GTEST_OS_LINUX
+#endif  // GTEST_OS_QNX
+#if GTEST_OS_LINUX
   GTEST_DEATH_TEST_CHECK_SYSCALL_(
       sigaction(SIGPROF, &saved_sigprof_action, nullptr));
-#  endif  // GTEST_OS_LINUX
+#endif  // GTEST_OS_LINUX
 
   GTEST_DEATH_TEST_CHECK_(child_pid != -1);
   return child_pid;
@@ -1429,13 +1403,13 @@ DeathTest::TestRole ExecDeathTest::AssumeRole() {
   GTEST_DEATH_TEST_CHECK_(fcntl(pipe_fd[1], F_SETFD, 0) != -1);
 
   const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
-                                  kFilterFlag + "=" + info->test_suite_name() +
-                                  "." + info->name();
-  const std::string internal_flag =
-      std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "="
-      + file_ + "|" + StreamableToString(line_) + "|"
-      + StreamableToString(death_test_index) + "|"
-      + StreamableToString(pipe_fd[1]);
+                                  "filter=" + info->test_suite_name() + "." +
+                                  info->name();
+  const std::string internal_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
+                                    "internal_run_death_test=" + file_ + "|" +
+                                    StreamableToString(line_) + "|" +
+                                    StreamableToString(death_test_index) + "|" +
+                                    StreamableToString(pipe_fd[1]);
   Arguments args;
   args.AddArguments(GetArgvsForDeathTestChildProcess());
   args.AddArgument(filter_flag.c_str());
@@ -1456,7 +1430,7 @@ DeathTest::TestRole ExecDeathTest::AssumeRole() {
   return OVERSEE_TEST;
 }
 
-# endif  // !GTEST_OS_WINDOWS
+#endif  // !GTEST_OS_WINDOWS
 
 // Creates a concrete DeathTest-derived class that depends on the
 // --gtest_death_test_style flag, and sets the pointer pointed to
@@ -1470,15 +1444,15 @@ bool DefaultDeathTestFactory::Create(const char* statement,
   UnitTestImpl* const impl = GetUnitTestImpl();
   const InternalRunDeathTestFlag* const flag =
       impl->internal_run_death_test_flag();
-  const int death_test_index = impl->current_test_info()
-      ->increment_death_test_count();
+  const int death_test_index =
+      impl->current_test_info()->increment_death_test_count();
 
   if (flag != nullptr) {
     if (death_test_index > flag->index()) {
       DeathTest::set_last_death_test_message(
-          "Death test count (" + StreamableToString(death_test_index)
-          + ") somehow exceeded expected maximum ("
-          + StreamableToString(flag->index()) + ")");
+          "Death test count (" + StreamableToString(death_test_index) +
+          ") somehow exceeded expected maximum (" +
+          StreamableToString(flag->index()) + ")");
       return false;
     }
 
@@ -1489,50 +1463,50 @@ bool DefaultDeathTestFactory::Create(const char* statement,
     }
   }
 
-# if GTEST_OS_WINDOWS
+#if GTEST_OS_WINDOWS
 
-  if (GTEST_FLAG(death_test_style) == "threadsafe" ||
-      GTEST_FLAG(death_test_style) == "fast") {
+  if (GTEST_FLAG_GET(death_test_style) == "threadsafe" ||
+      GTEST_FLAG_GET(death_test_style) == "fast") {
     *test = new WindowsDeathTest(statement, std::move(matcher), file, line);
   }
 
-# elif GTEST_OS_FUCHSIA
+#elif GTEST_OS_FUCHSIA
 
-  if (GTEST_FLAG(death_test_style) == "threadsafe" ||
-      GTEST_FLAG(death_test_style) == "fast") {
+  if (GTEST_FLAG_GET(death_test_style) == "threadsafe" ||
+      GTEST_FLAG_GET(death_test_style) == "fast") {
     *test = new FuchsiaDeathTest(statement, std::move(matcher), file, line);
   }
 
-# else
+#else
 
-  if (GTEST_FLAG(death_test_style) == "threadsafe") {
+  if (GTEST_FLAG_GET(death_test_style) == "threadsafe") {
     *test = new ExecDeathTest(statement, std::move(matcher), file, line);
-  } else if (GTEST_FLAG(death_test_style) == "fast") {
+  } else if (GTEST_FLAG_GET(death_test_style) == "fast") {
     *test = new NoExecDeathTest(statement, std::move(matcher));
   }
 
-# endif  // GTEST_OS_WINDOWS
+#endif  // GTEST_OS_WINDOWS
 
   else {  // NOLINT - this is more readable than unbalanced brackets inside #if.
-    DeathTest::set_last_death_test_message(
-        "Unknown death test style \"" + GTEST_FLAG(death_test_style)
-        + "\" encountered");
+    DeathTest::set_last_death_test_message("Unknown death test style \"" +
+                                           GTEST_FLAG_GET(death_test_style) +
+                                           "\" encountered");
     return false;
   }
 
   return true;
 }
 
-# if GTEST_OS_WINDOWS
+#if GTEST_OS_WINDOWS
 // Recreates the pipe and event handles from the provided parameters,
 // signals the event, and returns a file descriptor wrapped around the pipe
 // handle. This function is called in the child process only.
 static int GetStatusFileDescriptor(unsigned int parent_process_id,
-                            size_t write_handle_as_size_t,
-                            size_t event_handle_as_size_t) {
+                                   size_t write_handle_as_size_t,
+                                   size_t event_handle_as_size_t) {
   AutoHandle parent_process_handle(::OpenProcess(PROCESS_DUP_HANDLE,
-                                                   FALSE,  // Non-inheritable.
-                                                   parent_process_id));
+                                                 FALSE,  // Non-inheritable.
+                                                 parent_process_id));
   if (parent_process_handle.Get() == INVALID_HANDLE_VALUE) {
     DeathTestAbort("Unable to open parent process " +
                    StreamableToString(parent_process_id));
@@ -1540,8 +1514,7 @@ static int GetStatusFileDescriptor(unsigned int parent_process_id,
 
   GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t));
 
-  const HANDLE write_handle =
-      reinterpret_cast<HANDLE>(write_handle_as_size_t);
+  const HANDLE write_handle = reinterpret_cast<HANDLE>(write_handle_as_size_t);
   HANDLE dup_write_handle;
 
   // The newly initialized handle is accessible only in the parent
@@ -1563,9 +1536,7 @@ static int GetStatusFileDescriptor(unsigned int parent_process_id,
   HANDLE dup_event_handle;
 
   if (!::DuplicateHandle(parent_process_handle.Get(), event_handle,
-                         ::GetCurrentProcess(), &dup_event_handle,
-                         0x0,
-                         FALSE,
+                         ::GetCurrentProcess(), &dup_event_handle, 0x0, FALSE,
                          DUPLICATE_SAME_ACCESS)) {
     DeathTestAbort("Unable to duplicate the event handle " +
                    StreamableToString(event_handle_as_size_t) +
@@ -1587,61 +1558,57 @@ static int GetStatusFileDescriptor(unsigned int parent_process_id,
 
   return write_fd;
 }
-# endif  // GTEST_OS_WINDOWS
+#endif  // GTEST_OS_WINDOWS
 
 // Returns a newly created InternalRunDeathTestFlag object with fields
 // initialized from the GTEST_FLAG(internal_run_death_test) flag if
 // the flag is specified; otherwise returns NULL.
 InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag() {
-  if (GTEST_FLAG(internal_run_death_test) == "") return nullptr;
+  if (GTEST_FLAG_GET(internal_run_death_test) == "") return nullptr;
 
   // GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we
   // can use it here.
   int line = -1;
   int index = -1;
   ::std::vector< ::std::string> fields;
-  SplitString(GTEST_FLAG(internal_run_death_test).c_str(), '|', &fields);
+  SplitString(GTEST_FLAG_GET(internal_run_death_test), '|', &fields);
   int write_fd = -1;
 
-# if GTEST_OS_WINDOWS
+#if GTEST_OS_WINDOWS
 
   unsigned int parent_process_id = 0;
   size_t write_handle_as_size_t = 0;
   size_t event_handle_as_size_t = 0;
 
-  if (fields.size() != 6
-      || !ParseNaturalNumber(fields[1], &line)
-      || !ParseNaturalNumber(fields[2], &index)
-      || !ParseNaturalNumber(fields[3], &parent_process_id)
-      || !ParseNaturalNumber(fields[4], &write_handle_as_size_t)
-      || !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) {
+  if (fields.size() != 6 || !ParseNaturalNumber(fields[1], &line) ||
+      !ParseNaturalNumber(fields[2], &index) ||
+      !ParseNaturalNumber(fields[3], &parent_process_id) ||
+      !ParseNaturalNumber(fields[4], &write_handle_as_size_t) ||
+      !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) {
     DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
-                   GTEST_FLAG(internal_run_death_test));
+                   GTEST_FLAG_GET(internal_run_death_test));
   }
-  write_fd = GetStatusFileDescriptor(parent_process_id,
-                                     write_handle_as_size_t,
+  write_fd = GetStatusFileDescriptor(parent_process_id, write_handle_as_size_t,
                                      event_handle_as_size_t);
 
-# elif GTEST_OS_FUCHSIA
+#elif GTEST_OS_FUCHSIA
 
-  if (fields.size() != 3
-      || !ParseNaturalNumber(fields[1], &line)
-      || !ParseNaturalNumber(fields[2], &index)) {
-    DeathTestAbort("Bad --gtest_internal_run_death_test flag: "
-        + GTEST_FLAG(internal_run_death_test));
+  if (fields.size() != 3 || !ParseNaturalNumber(fields[1], &line) ||
+      !ParseNaturalNumber(fields[2], &index)) {
+    DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
+                   GTEST_FLAG_GET(internal_run_death_test));
   }
 
-# else
+#else
 
-  if (fields.size() != 4
-      || !ParseNaturalNumber(fields[1], &line)
-      || !ParseNaturalNumber(fields[2], &index)
-      || !ParseNaturalNumber(fields[3], &write_fd)) {
-    DeathTestAbort("Bad --gtest_internal_run_death_test flag: "
-        + GTEST_FLAG(internal_run_death_test));
+  if (fields.size() != 4 || !ParseNaturalNumber(fields[1], &line) ||
+      !ParseNaturalNumber(fields[2], &index) ||
+      !ParseNaturalNumber(fields[3], &write_fd)) {
+    DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
+                   GTEST_FLAG_GET(internal_run_death_test));
   }
 
-# endif  // GTEST_OS_WINDOWS
+#endif  // GTEST_OS_WINDOWS
 
   return new InternalRunDeathTestFlag(fields[0], line, index, write_fd);
 }
diff --git a/third_party/googletest/src/src/gtest-filepath.cc b/third_party/googletest/src/src/gtest-filepath.cc
index bd7b99ff03..f6ee90cdb7 100644
--- a/third_party/googletest/src/src/gtest-filepath.cc
+++ b/third_party/googletest/src/src/gtest-filepath.cc
@@ -30,29 +30,31 @@
 #include "gtest/internal/gtest-filepath.h"
 
 #include <stdlib.h>
-#include "gtest/internal/gtest-port.h"
+
 #include "gtest/gtest-message.h"
+#include "gtest/internal/gtest-port.h"
 
 #if GTEST_OS_WINDOWS_MOBILE
-# include <windows.h>
+#include <windows.h>
 #elif GTEST_OS_WINDOWS
-# include <direct.h>
-# include <io.h>
+#include <direct.h>
+#include <io.h>
 #else
-# include <limits.h>
-# include <climits>  // Some Linux distributions define PATH_MAX here.
-#endif  // GTEST_OS_WINDOWS_MOBILE
+#include <limits.h>
+
+#include <climits>  // Some Linux distributions define PATH_MAX here.
+#endif              // GTEST_OS_WINDOWS_MOBILE
 
 #include "gtest/internal/gtest-string.h"
 
 #if GTEST_OS_WINDOWS
-# define GTEST_PATH_MAX_ _MAX_PATH
+#define GTEST_PATH_MAX_ _MAX_PATH
 #elif defined(PATH_MAX)
-# define GTEST_PATH_MAX_ PATH_MAX
+#define GTEST_PATH_MAX_ PATH_MAX
 #elif defined(_XOPEN_PATH_MAX)
-# define GTEST_PATH_MAX_ _XOPEN_PATH_MAX
+#define GTEST_PATH_MAX_ _XOPEN_PATH_MAX
 #else
-# define GTEST_PATH_MAX_ _POSIX_PATH_MAX
+#define GTEST_PATH_MAX_ _POSIX_PATH_MAX
 #endif  // GTEST_OS_WINDOWS
 
 namespace testing {
@@ -66,16 +68,16 @@ namespace internal {
 const char kPathSeparator = '\\';
 const char kAlternatePathSeparator = '/';
 const char kAlternatePathSeparatorString[] = "/";
-# if GTEST_OS_WINDOWS_MOBILE
+#if GTEST_OS_WINDOWS_MOBILE
 // Windows CE doesn't have a current directory. You should not use
 // the current directory in tests on Windows CE, but this at least
 // provides a reasonable fallback.
 const char kCurrentDirectoryString[] = "\\";
 // Windows CE doesn't define INVALID_FILE_ATTRIBUTES
 const DWORD kInvalidFileAttributes = 0xffffffff;
-# else
+#else
 const char kCurrentDirectoryString[] = ".\\";
-# endif  // GTEST_OS_WINDOWS_MOBILE
+#endif  // GTEST_OS_WINDOWS_MOBILE
 #else
 const char kPathSeparator = '/';
 const char kCurrentDirectoryString[] = "./";
@@ -92,23 +94,24 @@ static bool IsPathSeparator(char c) {
 
 // Returns the current working directory, or "" if unsuccessful.
 FilePath FilePath::GetCurrentDir() {
-#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \
-    GTEST_OS_WINDOWS_RT || ARDUINO || defined(ESP_PLATFORM)
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE ||         \
+    GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_ESP32 || \
+    GTEST_OS_XTENSA
   // These platforms do not have a current directory, so we just return
   // something reasonable.
   return FilePath(kCurrentDirectoryString);
 #elif GTEST_OS_WINDOWS
-  char cwd[GTEST_PATH_MAX_ + 1] = { '\0' };
+  char cwd[GTEST_PATH_MAX_ + 1] = {'\0'};
   return FilePath(_getcwd(cwd, sizeof(cwd)) == nullptr ? "" : cwd);
 #else
-  char cwd[GTEST_PATH_MAX_ + 1] = { '\0' };
+  char cwd[GTEST_PATH_MAX_ + 1] = {'\0'};
   char* result = getcwd(cwd, sizeof(cwd));
-# if GTEST_OS_NACL
+#if GTEST_OS_NACL
   // getcwd will likely fail in NaCl due to the sandbox, so return something
   // reasonable. The user may have provided a shim implementation for getcwd,
   // however, so fallback only when failure is detected.
   return FilePath(result == nullptr ? kCurrentDirectoryString : cwd);
-# endif  // GTEST_OS_NACL
+#endif  // GTEST_OS_NACL
   return FilePath(result == nullptr ? "" : cwd);
 #endif  // GTEST_OS_WINDOWS_MOBILE
 }
@@ -120,8 +123,8 @@ FilePath FilePath::GetCurrentDir() {
 FilePath FilePath::RemoveExtension(const char* extension) const {
   const std::string dot_extension = std::string(".") + extension;
   if (String::EndsWithCaseInsensitive(pathname_, dot_extension)) {
-    return FilePath(pathname_.substr(
-        0, pathname_.length() - dot_extension.length()));
+    return FilePath(
+        pathname_.substr(0, pathname_.length() - dot_extension.length()));
   }
   return *this;
 }
@@ -177,15 +180,14 @@ FilePath FilePath::RemoveFileName() const {
 // than zero (e.g., 12), returns "dir/test_12.xml".
 // On Windows platform, uses \ as the separator rather than /.
 FilePath FilePath::MakeFileName(const FilePath& directory,
-                                const FilePath& base_name,
-                                int number,
+                                const FilePath& base_name, int number,
                                 const char* extension) {
   std::string file;
   if (number == 0) {
     file = base_name.string() + "." + extension;
   } else {
-    file = base_name.string() + "_" + StreamableToString(number)
-        + "." + extension;
+    file =
+        base_name.string() + "_" + StreamableToString(number) + "." + extension;
   }
   return ConcatPaths(directory, FilePath(file));
 }
@@ -194,8 +196,7 @@ FilePath FilePath::MakeFileName(const FilePath& directory,
 // On Windows, uses \ as the separator rather than /.
 FilePath FilePath::ConcatPaths(const FilePath& directory,
                                const FilePath& relative_path) {
-  if (directory.IsEmpty())
-    return relative_path;
+  if (directory.IsEmpty()) return relative_path;
   const FilePath dir(directory.RemoveTrailingPathSeparator());
   return FilePath(dir.string() + kPathSeparator + relative_path.string());
 }
@@ -206,10 +207,10 @@ bool FilePath::FileOrDirectoryExists() const {
 #if GTEST_OS_WINDOWS_MOBILE
   LPCWSTR unicode = String::AnsiToUtf16(pathname_.c_str());
   const DWORD attributes = GetFileAttributes(unicode);
-  delete [] unicode;
+  delete[] unicode;
   return attributes != kInvalidFileAttributes;
 #else
-  posix::StatStruct file_stat;
+  posix::StatStruct file_stat{};
   return posix::Stat(pathname_.c_str(), &file_stat) == 0;
 #endif  // GTEST_OS_WINDOWS_MOBILE
 }
@@ -221,8 +222,8 @@ bool FilePath::DirectoryExists() const {
 #if GTEST_OS_WINDOWS
   // Don't strip off trailing separator if path is a root directory on
   // Windows (like "C:\\").
-  const FilePath& path(IsRootDirectory() ? *this :
-                                           RemoveTrailingPathSeparator());
+  const FilePath& path(IsRootDirectory() ? *this
+                                         : RemoveTrailingPathSeparator());
 #else
   const FilePath& path(*this);
 #endif
@@ -230,15 +231,15 @@ bool FilePath::DirectoryExists() const {
 #if GTEST_OS_WINDOWS_MOBILE
   LPCWSTR unicode = String::AnsiToUtf16(path.c_str());
   const DWORD attributes = GetFileAttributes(unicode);
-  delete [] unicode;
+  delete[] unicode;
   if ((attributes != kInvalidFileAttributes) &&
       (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
     result = true;
   }
 #else
-  posix::StatStruct file_stat;
-  result = posix::Stat(path.c_str(), &file_stat) == 0 &&
-      posix::IsDir(file_stat);
+  posix::StatStruct file_stat{};
+  result =
+      posix::Stat(path.c_str(), &file_stat) == 0 && posix::IsDir(file_stat);
 #endif  // GTEST_OS_WINDOWS_MOBILE
 
   return result;
@@ -259,10 +260,9 @@ bool FilePath::IsAbsolutePath() const {
   const char* const name = pathname_.c_str();
 #if GTEST_OS_WINDOWS
   return pathname_.length() >= 3 &&
-     ((name[0] >= 'a' && name[0] <= 'z') ||
-      (name[0] >= 'A' && name[0] <= 'Z')) &&
-     name[1] == ':' &&
-     IsPathSeparator(name[2]);
+         ((name[0] >= 'a' && name[0] <= 'z') ||
+          (name[0] >= 'A' && name[0] <= 'Z')) &&
+         name[1] == ':' && IsPathSeparator(name[2]);
 #else
   return IsPathSeparator(name[0]);
 #endif
@@ -320,9 +320,12 @@ bool FilePath::CreateFolder() const {
   FilePath removed_sep(this->RemoveTrailingPathSeparator());
   LPCWSTR unicode = String::AnsiToUtf16(removed_sep.c_str());
   int result = CreateDirectory(unicode, nullptr) ? 0 : -1;
-  delete [] unicode;
+  delete[] unicode;
 #elif GTEST_OS_WINDOWS
   int result = _mkdir(pathname_.c_str());
+#elif GTEST_OS_ESP8266 || GTEST_OS_XTENSA
+  // do nothing
+  int result = 0;
 #else
   int result = mkdir(pathname_.c_str(), 0777);
 #endif  // GTEST_OS_WINDOWS_MOBILE
@@ -337,42 +340,27 @@ bool FilePath::CreateFolder() const {
 // name, otherwise return the name string unmodified.
 // On Windows platform, uses \ as the separator, other platforms use /.
 FilePath FilePath::RemoveTrailingPathSeparator() const {
-  return IsDirectory()
-      ? FilePath(pathname_.substr(0, pathname_.length() - 1))
-      : *this;
+  return IsDirectory() ? FilePath(pathname_.substr(0, pathname_.length() - 1))
+                       : *this;
 }
 
 // Removes any redundant separators that might be in the pathname.
 // For example, "bar///foo" becomes "bar/foo". Does not eliminate other
 // redundancies that might be in a pathname involving "." or "..".
 void FilePath::Normalize() {
-  if (pathname_.c_str() == nullptr) {
-    pathname_ = "";
-    return;
-  }
-  const char* src = pathname_.c_str();
-  char* const dest = new char[pathname_.length() + 1];
-  char* dest_ptr = dest;
-  memset(dest_ptr, 0, pathname_.length() + 1);
-
-  while (*src != '\0') {
-    *dest_ptr = *src;
-    if (!IsPathSeparator(*src)) {
-      src++;
+  auto out = pathname_.begin();
+
+  for (const char character : pathname_) {
+    if (!IsPathSeparator(character)) {
+      *(out++) = character;
+    } else if (out == pathname_.begin() || *std::prev(out) != kPathSeparator) {
+      *(out++) = kPathSeparator;
     } else {
-#if GTEST_HAS_ALT_PATH_SEP_
-      if (*dest_ptr == kAlternatePathSeparator) {
-        *dest_ptr = kPathSeparator;
-      }
-#endif
-      while (IsPathSeparator(*src))
-        src++;
+      continue;
     }
-    dest_ptr++;
   }
-  *dest_ptr = '\0';
-  pathname_ = dest;
-  delete[] dest;
+
+  pathname_.erase(out, pathname_.end());
 }
 
 }  // namespace internal
diff --git a/third_party/googletest/src/src/gtest-internal-inl.h b/third_party/googletest/src/src/gtest-internal-inl.h
index 8ed70daab0..0b9e929c68 100644
--- a/third_party/googletest/src/src/gtest-internal-inl.h
+++ b/third_party/googletest/src/src/gtest-internal-inl.h
@@ -31,17 +31,18 @@
 // This file contains purely Google Test's internal implementation.  Please
 // DO NOT #INCLUDE IT IN A USER PROGRAM.
 
-#ifndef GTEST_SRC_GTEST_INTERNAL_INL_H_
-#define GTEST_SRC_GTEST_INTERNAL_INL_H_
+#ifndef GOOGLETEST_SRC_GTEST_INTERNAL_INL_H_
+#define GOOGLETEST_SRC_GTEST_INTERNAL_INL_H_
 
 #ifndef _WIN32_WCE
-# include <errno.h>
+#include <errno.h>
 #endif  // !_WIN32_WCE
 #include <stddef.h>
 #include <stdlib.h>  // For strtoll/_strtoul64/malloc/free.
 #include <string.h>  // For memmove.
 
 #include <algorithm>
+#include <cstdint>
 #include <memory>
 #include <string>
 #include <vector>
@@ -49,22 +50,20 @@
 #include "gtest/internal/gtest-port.h"
 
 #if GTEST_CAN_STREAM_RESULTS_
-# include <arpa/inet.h>  // NOLINT
-# include <netdb.h>  // NOLINT
+#include <arpa/inet.h>  // NOLINT
+#include <netdb.h>      // NOLINT
 #endif
 
 #if GTEST_OS_WINDOWS
-# include <windows.h>  // NOLINT
-#endif  // GTEST_OS_WINDOWS
+#include <windows.h>  // NOLINT
+#endif                // GTEST_OS_WINDOWS
 
-#include "gtest/gtest.h"
 #include "gtest/gtest-spi.h"
+#include "gtest/gtest.h"
 
 GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
 /* class A needs to have dll-interface to be used by clients of class B */)
 
-namespace testing {
-
 // Declares the flags.
 //
 // We don't want the users to modify this flag in the code, but want
@@ -72,30 +71,13 @@ namespace testing {
 // declare it here as opposed to in gtest.h.
 GTEST_DECLARE_bool_(death_test_use_fork);
 
+namespace testing {
 namespace internal {
 
 // The value of GetTestTypeId() as seen from within the Google Test
 // library.  This is solely for testing GetTestTypeId().
 GTEST_API_ extern const TypeId kTestTypeIdInGoogleTest;
 
-// Names of the flags (needed for parsing Google Test flags).
-const char kAlsoRunDisabledTestsFlag[] = "also_run_disabled_tests";
-const char kBreakOnFailureFlag[] = "break_on_failure";
-const char kCatchExceptionsFlag[] = "catch_exceptions";
-const char kColorFlag[] = "color";
-const char kFilterFlag[] = "filter";
-const char kListTestsFlag[] = "list_tests";
-const char kOutputFlag[] = "output";
-const char kPrintTimeFlag[] = "print_time";
-const char kPrintUTF8Flag[] = "print_utf8";
-const char kRandomSeedFlag[] = "random_seed";
-const char kRepeatFlag[] = "repeat";
-const char kShuffleFlag[] = "shuffle";
-const char kStackTraceDepthFlag[] = "stack_trace_depth";
-const char kStreamResultToFlag[] = "stream_result_to";
-const char kThrowOnFailureFlag[] = "throw_on_failure";
-const char kFlagfileFlag[] = "flagfile";
-
 // A valid random seed must be in [1, kMaxRandomSeed].
 const int kMaxRandomSeed = 99999;
 
@@ -122,21 +104,21 @@ GTEST_API_ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms);
 //
 // On success, stores the value of the flag in *value, and returns
 // true.  On failure, returns false without changing *value.
-GTEST_API_ bool ParseInt32Flag(
-    const char* str, const char* flag, Int32* value);
+GTEST_API_ bool ParseFlag(const char* str, const char* flag, int32_t* value);
 
 // Returns a random seed in range [1, kMaxRandomSeed] based on the
 // given --gtest_random_seed flag value.
-inline int GetRandomSeedFromFlag(Int32 random_seed_flag) {
-  const unsigned int raw_seed = (random_seed_flag == 0) ?
-      static_cast<unsigned int>(GetTimeInMillis()) :
-      static_cast<unsigned int>(random_seed_flag);
+inline int GetRandomSeedFromFlag(int32_t random_seed_flag) {
+  const unsigned int raw_seed =
+      (random_seed_flag == 0) ? static_cast<unsigned int>(GetTimeInMillis())
+                              : static_cast<unsigned int>(random_seed_flag);
 
   // Normalizes the actual seed to range [1, kMaxRandomSeed] such that
   // it's easy to type.
   const int normalized_seed =
       static_cast<int>((raw_seed - 1U) %
-                       static_cast<unsigned int>(kMaxRandomSeed)) + 1;
+                       static_cast<unsigned int>(kMaxRandomSeed)) +
+      1;
   return normalized_seed;
 }
 
@@ -157,46 +139,54 @@ class GTestFlagSaver {
  public:
   // The c'tor.
   GTestFlagSaver() {
-    also_run_disabled_tests_ = GTEST_FLAG(also_run_disabled_tests);
-    break_on_failure_ = GTEST_FLAG(break_on_failure);
-    catch_exceptions_ = GTEST_FLAG(catch_exceptions);
-    color_ = GTEST_FLAG(color);
-    death_test_style_ = GTEST_FLAG(death_test_style);
-    death_test_use_fork_ = GTEST_FLAG(death_test_use_fork);
-    filter_ = GTEST_FLAG(filter);
-    internal_run_death_test_ = GTEST_FLAG(internal_run_death_test);
-    list_tests_ = GTEST_FLAG(list_tests);
-    output_ = GTEST_FLAG(output);
-    print_time_ = GTEST_FLAG(print_time);
-    print_utf8_ = GTEST_FLAG(print_utf8);
-    random_seed_ = GTEST_FLAG(random_seed);
-    repeat_ = GTEST_FLAG(repeat);
-    shuffle_ = GTEST_FLAG(shuffle);
-    stack_trace_depth_ = GTEST_FLAG(stack_trace_depth);
-    stream_result_to_ = GTEST_FLAG(stream_result_to);
-    throw_on_failure_ = GTEST_FLAG(throw_on_failure);
+    also_run_disabled_tests_ = GTEST_FLAG_GET(also_run_disabled_tests);
+    break_on_failure_ = GTEST_FLAG_GET(break_on_failure);
+    catch_exceptions_ = GTEST_FLAG_GET(catch_exceptions);
+    color_ = GTEST_FLAG_GET(color);
+    death_test_style_ = GTEST_FLAG_GET(death_test_style);
+    death_test_use_fork_ = GTEST_FLAG_GET(death_test_use_fork);
+    fail_fast_ = GTEST_FLAG_GET(fail_fast);
+    filter_ = GTEST_FLAG_GET(filter);
+    internal_run_death_test_ = GTEST_FLAG_GET(internal_run_death_test);
+    list_tests_ = GTEST_FLAG_GET(list_tests);
+    output_ = GTEST_FLAG_GET(output);
+    brief_ = GTEST_FLAG_GET(brief);
+    print_time_ = GTEST_FLAG_GET(print_time);
+    print_utf8_ = GTEST_FLAG_GET(print_utf8);
+    random_seed_ = GTEST_FLAG_GET(random_seed);
+    repeat_ = GTEST_FLAG_GET(repeat);
+    recreate_environments_when_repeating_ =
+        GTEST_FLAG_GET(recreate_environments_when_repeating);
+    shuffle_ = GTEST_FLAG_GET(shuffle);
+    stack_trace_depth_ = GTEST_FLAG_GET(stack_trace_depth);
+    stream_result_to_ = GTEST_FLAG_GET(stream_result_to);
+    throw_on_failure_ = GTEST_FLAG_GET(throw_on_failure);
   }
 
   // The d'tor is not virtual.  DO NOT INHERIT FROM THIS CLASS.
   ~GTestFlagSaver() {
-    GTEST_FLAG(also_run_disabled_tests) = also_run_disabled_tests_;
-    GTEST_FLAG(break_on_failure) = break_on_failure_;
-    GTEST_FLAG(catch_exceptions) = catch_exceptions_;
-    GTEST_FLAG(color) = color_;
-    GTEST_FLAG(death_test_style) = death_test_style_;
-    GTEST_FLAG(death_test_use_fork) = death_test_use_fork_;
-    GTEST_FLAG(filter) = filter_;
-    GTEST_FLAG(internal_run_death_test) = internal_run_death_test_;
-    GTEST_FLAG(list_tests) = list_tests_;
-    GTEST_FLAG(output) = output_;
-    GTEST_FLAG(print_time) = print_time_;
-    GTEST_FLAG(print_utf8) = print_utf8_;
-    GTEST_FLAG(random_seed) = random_seed_;
-    GTEST_FLAG(repeat) = repeat_;
-    GTEST_FLAG(shuffle) = shuffle_;
-    GTEST_FLAG(stack_trace_depth) = stack_trace_depth_;
-    GTEST_FLAG(stream_result_to) = stream_result_to_;
-    GTEST_FLAG(throw_on_failure) = throw_on_failure_;
+    GTEST_FLAG_SET(also_run_disabled_tests, also_run_disabled_tests_);
+    GTEST_FLAG_SET(break_on_failure, break_on_failure_);
+    GTEST_FLAG_SET(catch_exceptions, catch_exceptions_);
+    GTEST_FLAG_SET(color, color_);
+    GTEST_FLAG_SET(death_test_style, death_test_style_);
+    GTEST_FLAG_SET(death_test_use_fork, death_test_use_fork_);
+    GTEST_FLAG_SET(filter, filter_);
+    GTEST_FLAG_SET(fail_fast, fail_fast_);
+    GTEST_FLAG_SET(internal_run_death_test, internal_run_death_test_);
+    GTEST_FLAG_SET(list_tests, list_tests_);
+    GTEST_FLAG_SET(output, output_);
+    GTEST_FLAG_SET(brief, brief_);
+    GTEST_FLAG_SET(print_time, print_time_);
+    GTEST_FLAG_SET(print_utf8, print_utf8_);
+    GTEST_FLAG_SET(random_seed, random_seed_);
+    GTEST_FLAG_SET(repeat, repeat_);
+    GTEST_FLAG_SET(recreate_environments_when_repeating,
+                   recreate_environments_when_repeating_);
+    GTEST_FLAG_SET(shuffle, shuffle_);
+    GTEST_FLAG_SET(stack_trace_depth, stack_trace_depth_);
+    GTEST_FLAG_SET(stream_result_to, stream_result_to_);
+    GTEST_FLAG_SET(throw_on_failure, throw_on_failure_);
   }
 
  private:
@@ -207,16 +197,19 @@ class GTestFlagSaver {
   std::string color_;
   std::string death_test_style_;
   bool death_test_use_fork_;
+  bool fail_fast_;
   std::string filter_;
   std::string internal_run_death_test_;
   bool list_tests_;
   std::string output_;
+  bool brief_;
   bool print_time_;
   bool print_utf8_;
-  internal::Int32 random_seed_;
-  internal::Int32 repeat_;
+  int32_t random_seed_;
+  int32_t repeat_;
+  bool recreate_environments_when_repeating_;
   bool shuffle_;
-  internal::Int32 stack_trace_depth_;
+  int32_t stack_trace_depth_;
   std::string stream_result_to_;
   bool throw_on_failure_;
 } GTEST_ATTRIBUTE_UNUSED_;
@@ -227,7 +220,7 @@ class GTestFlagSaver {
 // If the code_point is not a valid Unicode code point
 // (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted
 // to "(Invalid Unicode 0xXXXXXXXX)".
-GTEST_API_ std::string CodePointToUtf8(UInt32 code_point);
+GTEST_API_ std::string CodePointToUtf8(uint32_t code_point);
 
 // Converts a wide string to a narrow string in UTF-8 encoding.
 // The wide string is assumed to have the following encoding:
@@ -260,17 +253,17 @@ GTEST_API_ bool ShouldShard(const char* total_shards_str,
                             const char* shard_index_str,
                             bool in_subprocess_for_death_test);
 
-// Parses the environment variable var as an Int32. If it is unset,
-// returns default_val. If it is not an Int32, prints an error and
+// Parses the environment variable var as a 32-bit integer. If it is unset,
+// returns default_val. If it is not a 32-bit integer, prints an error and
 // and aborts.
-GTEST_API_ Int32 Int32FromEnvOrDie(const char* env_var, Int32 default_val);
+GTEST_API_ int32_t Int32FromEnvOrDie(const char* env_var, int32_t default_val);
 
 // Given the total number of shards, the shard index, and the test id,
 // returns true if and only if the test should be run on this shard. The test id
 // is some arbitrary but unique non-negative integer assigned to each test
 // method. Assumes that 0 <= shard_index < total_shards.
-GTEST_API_ bool ShouldRunTestOnShard(
-    int total_shards, int shard_index, int test_id);
+GTEST_API_ bool ShouldRunTestOnShard(int total_shards, int shard_index,
+                                     int test_id);
 
 // STL container utilities.
 
@@ -281,9 +274,8 @@ inline int CountIf(const Container& c, Predicate predicate) {
   // Implemented as an explicit loop since std::count_if() in libCstd on
   // Solaris has a non-standard signature.
   int count = 0;
-  for (typename Container::const_iterator it = c.begin(); it != c.end(); ++it) {
-    if (predicate(*it))
-      ++count;
+  for (auto it = c.begin(); it != c.end(); ++it) {
+    if (predicate(*it)) ++count;
   }
   return count;
 }
@@ -323,7 +315,7 @@ void ShuffleRange(internal::Random* random, int begin, int end,
     const int last_in_range = begin + range_width - 1;
     const int selected =
         begin +
-        static_cast<int>(random->Generate(static_cast<UInt32>(range_width)));
+        static_cast<int>(random->Generate(static_cast<uint32_t>(range_width)));
     std::swap((*v)[static_cast<size_t>(selected)],
               (*v)[static_cast<size_t>(last_in_range)]);
   }
@@ -385,13 +377,6 @@ class GTEST_API_ UnitTestOptions {
 
   // Functions for processing the gtest_filter flag.
 
-  // Returns true if and only if the wildcard pattern matches the string.
-  // The first ':' or '\0' character in pattern marks the end of it.
-  //
-  // This recursive algorithm isn't very efficient, but is clear and
-  // works well enough for matching test names, which are short.
-  static bool PatternMatchesString(const char *pattern, const char *str);
-
   // Returns true if and only if the user-specified filter matches the test
   // suite name and the test name.
   static bool FilterMatchesTest(const std::string& test_suite_name,
@@ -439,7 +424,9 @@ class OsStackTraceGetterInterface {
   static const char* const kElidedFramesMarker;
 
  private:
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetterInterface);
+  OsStackTraceGetterInterface(const OsStackTraceGetterInterface&) = delete;
+  OsStackTraceGetterInterface& operator=(const OsStackTraceGetterInterface&) =
+      delete;
 };
 
 // A working implementation of the OsStackTraceGetterInterface interface.
@@ -461,7 +448,8 @@ class OsStackTraceGetter : public OsStackTraceGetterInterface {
   void* caller_frame_ = nullptr;
 #endif  // GTEST_HAS_ABSL
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetter);
+  OsStackTraceGetter(const OsStackTraceGetter&) = delete;
+  OsStackTraceGetter& operator=(const OsStackTraceGetter&) = delete;
 };
 
 // Information about a Google Test trace point.
@@ -474,7 +462,7 @@ struct TraceInfo {
 // This is the default global test part result reporter used in UnitTestImpl.
 // This class should only be used by UnitTestImpl.
 class DefaultGlobalTestPartResultReporter
-  : public TestPartResultReporterInterface {
+    : public TestPartResultReporterInterface {
  public:
   explicit DefaultGlobalTestPartResultReporter(UnitTestImpl* unit_test);
   // Implements the TestPartResultReporterInterface. Reports the test part
@@ -484,7 +472,10 @@ class DefaultGlobalTestPartResultReporter
  private:
   UnitTestImpl* const unit_test_;
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultGlobalTestPartResultReporter);
+  DefaultGlobalTestPartResultReporter(
+      const DefaultGlobalTestPartResultReporter&) = delete;
+  DefaultGlobalTestPartResultReporter& operator=(
+      const DefaultGlobalTestPartResultReporter&) = delete;
 };
 
 // This is the default per thread test part result reporter used in
@@ -500,7 +491,10 @@ class DefaultPerThreadTestPartResultReporter
  private:
   UnitTestImpl* const unit_test_;
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultPerThreadTestPartResultReporter);
+  DefaultPerThreadTestPartResultReporter(
+      const DefaultPerThreadTestPartResultReporter&) = delete;
+  DefaultPerThreadTestPartResultReporter& operator=(
+      const DefaultPerThreadTestPartResultReporter&) = delete;
 };
 
 // The private implementation of the UnitTest class.  We don't protect
@@ -638,7 +632,8 @@ class GTEST_API_ UnitTestImpl {
   // For example, if Foo() calls Bar(), which in turn calls
   // CurrentOsStackTraceExceptTop(1), Foo() will be included in the
   // trace but Bar() and CurrentOsStackTraceExceptTop() won't.
-  std::string CurrentOsStackTraceExceptTop(int skip_count) GTEST_NO_INLINE_;
+  std::string CurrentOsStackTraceExceptTop(int skip_count)
+      GTEST_NO_INLINE_ GTEST_NO_TAIL_CALL_;
 
   // Finds and returns a TestSuite with the given name.  If one doesn't
   // exist, creates one and returns it.
@@ -646,10 +641,10 @@ class GTEST_API_ UnitTestImpl {
   // Arguments:
   //
   //   test_suite_name: name of the test suite
-  //   type_param:     the name of the test's type parameter, or NULL if
-  //                   this is not a typed or a type-parameterized test.
-  //   set_up_tc:      pointer to the function that sets up the test suite
-  //   tear_down_tc:   pointer to the function that tears down the test suite
+  //   type_param:      the name of the test's type parameter, or NULL if
+  //                    this is not a typed or a type-parameterized test.
+  //   set_up_tc:       pointer to the function that sets up the test suite
+  //   tear_down_tc:    pointer to the function that tears down the test suite
   TestSuite* GetTestSuite(const char* test_suite_name, const char* type_param,
                           internal::SetUpTestSuiteFunc set_up_tc,
                           internal::TearDownTestSuiteFunc tear_down_tc);
@@ -673,6 +668,7 @@ class GTEST_API_ UnitTestImpl {
   void AddTestInfo(internal::SetUpTestSuiteFunc set_up_tc,
                    internal::TearDownTestSuiteFunc tear_down_tc,
                    TestInfo* test_info) {
+#if GTEST_HAS_DEATH_TEST
     // In order to support thread-safe death tests, we need to
     // remember the original working directory when the test program
     // was first invoked.  We cannot do this in RUN_ALL_TESTS(), as
@@ -685,6 +681,7 @@ class GTEST_API_ UnitTestImpl {
       GTEST_CHECK_(!original_working_dir_.IsEmpty())
           << "Failed to get the current working directory.";
     }
+#endif  // GTEST_HAS_DEATH_TEST
 
     GetTestSuite(test_info->test_suite_name(), test_info->type_param(),
                  set_up_tc, tear_down_tc)
@@ -697,6 +694,17 @@ class GTEST_API_ UnitTestImpl {
     return parameterized_test_registry_;
   }
 
+  std::set<std::string>* ignored_parameterized_test_suites() {
+    return &ignored_parameterized_test_suites_;
+  }
+
+  // Returns TypeParameterizedTestSuiteRegistry object used to keep track of
+  // type-parameterized tests and instantiations of them.
+  internal::TypeParameterizedTestSuiteRegistry&
+  type_parameterized_test_registry() {
+    return type_parameterized_test_registry_;
+  }
+
   // Sets the TestSuite object for the test that's currently running.
   void set_current_test_suite(TestSuite* a_current_test_suite) {
     current_test_suite_ = a_current_test_suite;
@@ -729,9 +737,7 @@ class GTEST_API_ UnitTestImpl {
   }
 
   // Clears the results of ad-hoc test assertions.
-  void ClearAdHocTestResult() {
-    ad_hoc_test_result_.Clear();
-  }
+  void ClearAdHocTestResult() { ad_hoc_test_result_.Clear(); }
 
   // Adds a TestProperty to the current TestResult object when invoked in a
   // context of a test or a test suite, or to the global property set. If the
@@ -739,10 +745,7 @@ class GTEST_API_ UnitTestImpl {
   // updated.
   void RecordProperty(const TestProperty& test_property);
 
-  enum ReactionToSharding {
-    HONOR_SHARDING_PROTOCOL,
-    IGNORE_SHARDING_PROTOCOL
-  };
+  enum ReactionToSharding { HONOR_SHARDING_PROTOCOL, IGNORE_SHARDING_PROTOCOL };
 
   // Matches the full name of each test against the user-specified
   // filter to decide whether the test should run, then records the
@@ -873,6 +876,12 @@ class GTEST_API_ UnitTestImpl {
   // ParameterizedTestRegistry object used to register value-parameterized
   // tests.
   internal::ParameterizedTestSuiteRegistry parameterized_test_registry_;
+  internal::TypeParameterizedTestSuiteRegistry
+      type_parameterized_test_registry_;
+
+  // The set holding the name of parameterized
+  // test suites that may go uninstantiated.
+  std::set<std::string> ignored_parameterized_test_suites_;
 
   // Indicates whether RegisterParameterizedTests() has been called already.
   bool parameterized_tests_registered_;
@@ -942,7 +951,8 @@ class GTEST_API_ UnitTestImpl {
   // starts.
   bool catch_exceptions_;
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTestImpl);
+  UnitTestImpl(const UnitTestImpl&) = delete;
+  UnitTestImpl& operator=(const UnitTestImpl&) = delete;
 };  // class UnitTestImpl
 
 // Convenience function for accessing the global UnitTest
@@ -965,8 +975,9 @@ GTEST_API_ bool IsValidEscape(char ch);
 GTEST_API_ bool AtomMatchesChar(bool escaped, char pattern, char ch);
 GTEST_API_ bool ValidateRegex(const char* regex);
 GTEST_API_ bool MatchRegexAtHead(const char* regex, const char* str);
-GTEST_API_ bool MatchRepetitionAndRegexAtHead(
-    bool escaped, char ch, char repeat, const char* regex, const char* str);
+GTEST_API_ bool MatchRepetitionAndRegexAtHead(bool escaped, char ch,
+                                              char repeat, const char* regex,
+                                              const char* str);
 GTEST_API_ bool MatchRegexAnywhere(const char* regex, const char* str);
 
 #endif  // GTEST_USES_SIMPLE_RE
@@ -999,20 +1010,9 @@ bool ParseNaturalNumber(const ::std::string& str, Integer* number) {
   char* end;
   // BiggestConvertible is the largest integer type that system-provided
   // string-to-number conversion routines can return.
+  using BiggestConvertible = unsigned long long;  // NOLINT
 
-# if GTEST_OS_WINDOWS && !defined(__GNUC__)
-
-  // MSVC and C++ Builder define __int64 instead of the standard long long.
-  typedef unsigned __int64 BiggestConvertible;
-  const BiggestConvertible parsed = _strtoui64(str.c_str(), &end, 10);
-
-# else
-
-  typedef unsigned long long BiggestConvertible;  // NOLINT
-  const BiggestConvertible parsed = strtoull(str.c_str(), &end, 10);
-
-# endif  // GTEST_OS_WINDOWS && !defined(__GNUC__)
-
+  const BiggestConvertible parsed = strtoull(str.c_str(), &end, 10);  // NOLINT
   const bool parse_success = *end == '\0' && errno == 0;
 
   GTEST_CHECK_(sizeof(Integer) <= sizeof(parsed));
@@ -1079,8 +1079,7 @@ class StreamingListener : public EmptyTestEventListener {
     }
 
     ~SocketWriter() override {
-      if (sockfd_ != -1)
-        CloseConnection();
+      if (sockfd_ != -1) CloseConnection();
     }
 
     // Sends a string to the socket.
@@ -1090,9 +1089,8 @@ class StreamingListener : public EmptyTestEventListener {
 
       const auto len = static_cast<size_t>(message.length());
       if (write(sockfd_, message.c_str(), len) != static_cast<ssize_t>(len)) {
-        GTEST_LOG_(WARNING)
-            << "stream_result_to: failed to stream to "
-            << host_name_ << ":" << port_num_;
+        GTEST_LOG_(WARNING) << "stream_result_to: failed to stream to "
+                            << host_name_ << ":" << port_num_;
       }
     }
 
@@ -1113,7 +1111,8 @@ class StreamingListener : public EmptyTestEventListener {
     const std::string host_name_;
     const std::string port_num_;
 
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(SocketWriter);
+    SocketWriter(const SocketWriter&) = delete;
+    SocketWriter& operator=(const SocketWriter&) = delete;
   };  // class SocketWriter
 
   // Escapes '=', '&', '%', and '\n' characters in str as "%xx".
@@ -1125,7 +1124,9 @@ class StreamingListener : public EmptyTestEventListener {
   }
 
   explicit StreamingListener(AbstractSocketWriter* socket_writer)
-      : socket_writer_(socket_writer) { Start(); }
+      : socket_writer_(socket_writer) {
+    Start();
+  }
 
   void OnTestProgramStart(const UnitTest& /* unit_test */) override {
     SendLn("event=TestProgramStart");
@@ -1148,22 +1149,22 @@ class StreamingListener : public EmptyTestEventListener {
 
   void OnTestIterationEnd(const UnitTest& unit_test,
                           int /* iteration */) override {
-    SendLn("event=TestIterationEnd&passed=" +
-           FormatBool(unit_test.Passed()) + "&elapsed_time=" +
-           StreamableToString(unit_test.elapsed_time()) + "ms");
+    SendLn("event=TestIterationEnd&passed=" + FormatBool(unit_test.Passed()) +
+           "&elapsed_time=" + StreamableToString(unit_test.elapsed_time()) +
+           "ms");
   }
 
   // Note that "event=TestCaseStart" is a wire format and has to remain
-  // "case" for compatibilty
-  void OnTestCaseStart(const TestCase& test_case) override {
-    SendLn(std::string("event=TestCaseStart&name=") + test_case.name());
+  // "case" for compatibility
+  void OnTestSuiteStart(const TestSuite& test_suite) override {
+    SendLn(std::string("event=TestCaseStart&name=") + test_suite.name());
   }
 
   // Note that "event=TestCaseEnd" is a wire format and has to remain
-  // "case" for compatibilty
-  void OnTestCaseEnd(const TestCase& test_case) override {
-    SendLn("event=TestCaseEnd&passed=" + FormatBool(test_case.Passed()) +
-           "&elapsed_time=" + StreamableToString(test_case.elapsed_time()) +
+  // "case" for compatibility
+  void OnTestSuiteEnd(const TestSuite& test_suite) override {
+    SendLn("event=TestCaseEnd&passed=" + FormatBool(test_suite.Passed()) +
+           "&elapsed_time=" + StreamableToString(test_suite.elapsed_time()) +
            "ms");
   }
 
@@ -1173,8 +1174,7 @@ class StreamingListener : public EmptyTestEventListener {
 
   void OnTestEnd(const TestInfo& test_info) override {
     SendLn("event=TestEnd&passed=" +
-           FormatBool((test_info.result())->Passed()) +
-           "&elapsed_time=" +
+           FormatBool((test_info.result())->Passed()) + "&elapsed_time=" +
            StreamableToString((test_info.result())->elapsed_time()) + "ms");
   }
 
@@ -1198,7 +1198,8 @@ class StreamingListener : public EmptyTestEventListener {
 
   const std::unique_ptr<AbstractSocketWriter> socket_writer_;
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamingListener);
+  StreamingListener(const StreamingListener&) = delete;
+  StreamingListener& operator=(const StreamingListener&) = delete;
 };  // class StreamingListener
 
 #endif  // GTEST_CAN_STREAM_RESULTS_
@@ -1208,4 +1209,4 @@ class StreamingListener : public EmptyTestEventListener {
 
 GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
 
-#endif  // GTEST_SRC_GTEST_INTERNAL_INL_H_
+#endif  // GOOGLETEST_SRC_GTEST_INTERNAL_INL_H_
diff --git a/third_party/googletest/src/src/gtest-matchers.cc b/third_party/googletest/src/src/gtest-matchers.cc
index 7d2fb6851e..7e3bcc0cff 100644
--- a/third_party/googletest/src/src/gtest-matchers.cc
+++ b/third_party/googletest/src/src/gtest-matchers.cc
@@ -32,12 +32,13 @@
 // This file implements just enough of the matcher interface to allow
 // EXPECT_DEATH and friends to accept a matcher argument.
 
-#include "gtest/internal/gtest-internal.h"
-#include "gtest/internal/gtest-port.h"
 #include "gtest/gtest-matchers.h"
 
 #include <string>
 
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-port.h"
+
 namespace testing {
 
 // Constructs a matcher that matches a const std::string& whose value is
@@ -58,40 +59,40 @@ Matcher<std::string>::Matcher(const std::string& s) { *this = Eq(s); }
 // s.
 Matcher<std::string>::Matcher(const char* s) { *this = Eq(std::string(s)); }
 
-#if GTEST_HAS_ABSL
-// Constructs a matcher that matches a const absl::string_view& whose value is
+#if GTEST_INTERNAL_HAS_STRING_VIEW
+// Constructs a matcher that matches a const StringView& whose value is
 // equal to s.
-Matcher<const absl::string_view&>::Matcher(const std::string& s) {
+Matcher<const internal::StringView&>::Matcher(const std::string& s) {
   *this = Eq(s);
 }
 
-// Constructs a matcher that matches a const absl::string_view& whose value is
+// Constructs a matcher that matches a const StringView& whose value is
 // equal to s.
-Matcher<const absl::string_view&>::Matcher(const char* s) {
+Matcher<const internal::StringView&>::Matcher(const char* s) {
   *this = Eq(std::string(s));
 }
 
-// Constructs a matcher that matches a const absl::string_view& whose value is
+// Constructs a matcher that matches a const StringView& whose value is
 // equal to s.
-Matcher<const absl::string_view&>::Matcher(absl::string_view s) {
+Matcher<const internal::StringView&>::Matcher(internal::StringView s) {
   *this = Eq(std::string(s));
 }
 
-// Constructs a matcher that matches a absl::string_view whose value is equal to
+// Constructs a matcher that matches a StringView whose value is equal to
 // s.
-Matcher<absl::string_view>::Matcher(const std::string& s) { *this = Eq(s); }
+Matcher<internal::StringView>::Matcher(const std::string& s) { *this = Eq(s); }
 
-// Constructs a matcher that matches a absl::string_view whose value is equal to
+// Constructs a matcher that matches a StringView whose value is equal to
 // s.
-Matcher<absl::string_view>::Matcher(const char* s) {
+Matcher<internal::StringView>::Matcher(const char* s) {
   *this = Eq(std::string(s));
 }
 
-// Constructs a matcher that matches a absl::string_view whose value is equal to
+// Constructs a matcher that matches a StringView whose value is equal to
 // s.
-Matcher<absl::string_view>::Matcher(absl::string_view s) {
+Matcher<internal::StringView>::Matcher(internal::StringView s) {
   *this = Eq(std::string(s));
 }
-#endif  // GTEST_HAS_ABSL
+#endif  // GTEST_INTERNAL_HAS_STRING_VIEW
 
 }  // namespace testing
diff --git a/third_party/googletest/src/src/gtest-port.cc b/third_party/googletest/src/src/gtest-port.cc
index fc5ba6becc..d797fe4d58 100644
--- a/third_party/googletest/src/src/gtest-port.cc
+++ b/third_party/googletest/src/src/gtest-port.cc
@@ -27,60 +27,62 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-
 #include "gtest/internal/gtest-port.h"
 
 #include <limits.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+
+#include <cstdint>
 #include <fstream>
 #include <memory>
 
 #if GTEST_OS_WINDOWS
-# include <windows.h>
-# include <io.h>
-# include <sys/stat.h>
-# include <map>  // Used in ThreadLocal.
-# ifdef _MSC_VER
-#  include <crtdbg.h>
-# endif  // _MSC_VER
+#include <io.h>
+#include <sys/stat.h>
+#include <windows.h>
+
+#include <map>  // Used in ThreadLocal.
+#ifdef _MSC_VER
+#include <crtdbg.h>
+#endif  // _MSC_VER
 #else
-# include <unistd.h>
+#include <unistd.h>
 #endif  // GTEST_OS_WINDOWS
 
 #if GTEST_OS_MAC
-# include <mach/mach_init.h>
-# include <mach/task.h>
-# include <mach/vm_map.h>
+#include <mach/mach_init.h>
+#include <mach/task.h>
+#include <mach/vm_map.h>
 #endif  // GTEST_OS_MAC
 
 #if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD || \
     GTEST_OS_NETBSD || GTEST_OS_OPENBSD
-# include <sys/sysctl.h>
-# if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD
-#  include <sys/user.h>
-# endif
+#include <sys/sysctl.h>
+#if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD
+#include <sys/user.h>
+#endif
 #endif
 
 #if GTEST_OS_QNX
-# include <devctl.h>
-# include <fcntl.h>
-# include <sys/procfs.h>
+#include <devctl.h>
+#include <fcntl.h>
+#include <sys/procfs.h>
 #endif  // GTEST_OS_QNX
 
 #if GTEST_OS_AIX
-# include <procinfo.h>
-# include <sys/types.h>
+#include <procinfo.h>
+#include <sys/types.h>
 #endif  // GTEST_OS_AIX
 
 #if GTEST_OS_FUCHSIA
-# include <zircon/process.h>
-# include <zircon/syscalls.h>
+#include <zircon/process.h>
+#include <zircon/syscalls.h>
 #endif  // GTEST_OS_FUCHSIA
 
-#include "gtest/gtest-spi.h"
 #include "gtest/gtest-message.h"
+#include "gtest/gtest-spi.h"
 #include "gtest/internal/gtest-internal.h"
 #include "gtest/internal/gtest-string.h"
 #include "src/gtest-internal-inl.h"
@@ -88,16 +90,7 @@
 namespace testing {
 namespace internal {
 
-#if defined(_MSC_VER) || defined(__BORLANDC__)
-// MSVC and C++Builder do not provide a definition of STDERR_FILENO.
-const int kStdOutFileno = 1;
-const int kStdErrFileno = 2;
-#else
-const int kStdOutFileno = STDOUT_FILENO;
-const int kStdErrFileno = STDERR_FILENO;
-#endif  // _MSC_VER
-
-#if GTEST_OS_LINUX
+#if GTEST_OS_LINUX || GTEST_OS_GNU_HURD
 
 namespace {
 template <typename T>
@@ -130,8 +123,7 @@ size_t GetThreadCount() {
   if (status == KERN_SUCCESS) {
     // task_threads allocates resources in thread_list and we need to free them
     // to avoid leaks.
-    vm_deallocate(task,
-                  reinterpret_cast<vm_address_t>(thread_list),
+    vm_deallocate(task, reinterpret_cast<vm_address_t>(thread_list),
                   sizeof(thread_t) * thread_count);
     return static_cast<size_t>(thread_count);
   } else {
@@ -140,7 +132,7 @@ size_t GetThreadCount() {
 }
 
 #elif GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD || \
-      GTEST_OS_NETBSD
+    GTEST_OS_NETBSD
 
 #if GTEST_OS_NETBSD
 #undef KERN_PROC
@@ -183,12 +175,12 @@ size_t GetThreadCount() {
 // we cannot detect it.
 size_t GetThreadCount() {
   int mib[] = {
-    CTL_KERN,
-    KERN_PROC,
-    KERN_PROC_PID | KERN_PROC_SHOW_THREADS,
-    getpid(),
-    sizeof(struct kinfo_proc),
-    0,
+      CTL_KERN,
+      KERN_PROC,
+      KERN_PROC_PID | KERN_PROC_SHOW_THREADS,
+      getpid(),
+      sizeof(struct kinfo_proc),
+      0,
   };
   u_int miblen = sizeof(mib) / sizeof(mib[0]);
 
@@ -197,7 +189,8 @@ size_t GetThreadCount() {
   if (sysctl(mib, miblen, NULL, &size, NULL, 0)) {
     return 0;
   }
-  mib[5] = size / mib[4];
+
+  mib[5] = static_cast<int>(size / static_cast<size_t>(mib[4]));
 
   // populate array of structs
   struct kinfo_proc info[mib[5]];
@@ -206,10 +199,9 @@ size_t GetThreadCount() {
   }
 
   // exclude empty members
-  int nthreads = 0;
-  for (int i = 0; i < size / mib[4]; i++) {
-    if (info[i].p_tid != -1)
-      nthreads++;
+  size_t nthreads = 0;
+  for (size_t i = 0; i < size / static_cast<size_t>(mib[4]); i++) {
+    if (info[i].p_tid != -1) nthreads++;
   }
   return nthreads;
 }
@@ -252,13 +244,9 @@ size_t GetThreadCount() {
 size_t GetThreadCount() {
   int dummy_buffer;
   size_t avail;
-  zx_status_t status = zx_object_get_info(
-      zx_process_self(),
-      ZX_INFO_PROCESS_THREADS,
-      &dummy_buffer,
-      0,
-      nullptr,
-      &avail);
+  zx_status_t status =
+      zx_object_get_info(zx_process_self(), ZX_INFO_PROCESS_THREADS,
+                         &dummy_buffer, 0, nullptr, &avail);
   if (status == ZX_OK) {
     return avail;
   } else {
@@ -278,27 +266,15 @@ size_t GetThreadCount() {
 
 #if GTEST_IS_THREADSAFE && GTEST_OS_WINDOWS
 
-void SleepMilliseconds(int n) {
-  ::Sleep(static_cast<DWORD>(n));
-}
-
-AutoHandle::AutoHandle()
-    : handle_(INVALID_HANDLE_VALUE) {}
+AutoHandle::AutoHandle() : handle_(INVALID_HANDLE_VALUE) {}
 
-AutoHandle::AutoHandle(Handle handle)
-    : handle_(handle) {}
+AutoHandle::AutoHandle(Handle handle) : handle_(handle) {}
 
-AutoHandle::~AutoHandle() {
-  Reset();
-}
+AutoHandle::~AutoHandle() { Reset(); }
 
-AutoHandle::Handle AutoHandle::Get() const {
-  return handle_;
-}
+AutoHandle::Handle AutoHandle::Get() const { return handle_; }
 
-void AutoHandle::Reset() {
-  Reset(INVALID_HANDLE_VALUE);
-}
+void AutoHandle::Reset() { Reset(INVALID_HANDLE_VALUE); }
 
 void AutoHandle::Reset(HANDLE handle) {
   // Resetting with the same handle we already own is invalid.
@@ -310,7 +286,7 @@ void AutoHandle::Reset(HANDLE handle) {
   } else {
     GTEST_CHECK_(!IsCloseable())
         << "Resetting a valid handle to itself is likely a programmer error "
-            "and thus not allowed.";
+           "and thus not allowed.";
   }
 }
 
@@ -320,23 +296,6 @@ bool AutoHandle::IsCloseable() const {
   return handle_ != nullptr && handle_ != INVALID_HANDLE_VALUE;
 }
 
-Notification::Notification()
-    : event_(::CreateEvent(nullptr,     // Default security attributes.
-                           TRUE,        // Do not reset automatically.
-                           FALSE,       // Initially unset.
-                           nullptr)) {  // Anonymous event.
-  GTEST_CHECK_(event_.Get() != nullptr);
-}
-
-void Notification::Notify() {
-  GTEST_CHECK_(::SetEvent(event_.Get()) != FALSE);
-}
-
-void Notification::WaitForNotification() {
-  GTEST_CHECK_(
-      ::WaitForSingleObject(event_.Get(), INFINITE) == WAIT_OBJECT_0);
-}
-
 Mutex::Mutex()
     : owner_thread_id_(0),
       type_(kDynamic),
@@ -389,25 +348,25 @@ namespace {
 //    MemoryIsNotDeallocated memory_is_not_deallocated;
 //    critical_section_ = new CRITICAL_SECTION;
 //
-class MemoryIsNotDeallocated
-{
+class MemoryIsNotDeallocated {
  public:
   MemoryIsNotDeallocated() : old_crtdbg_flag_(0) {
     old_crtdbg_flag_ = _CrtSetDbgFlag(_CRTDBG_REPORT_FLAG);
     // Set heap allocation block type to _IGNORE_BLOCK so that MS debug CRT
     // doesn't report mem leak if there's no matching deallocation.
-    _CrtSetDbgFlag(old_crtdbg_flag_ & ~_CRTDBG_ALLOC_MEM_DF);
+    (void)_CrtSetDbgFlag(old_crtdbg_flag_ & ~_CRTDBG_ALLOC_MEM_DF);
   }
 
   ~MemoryIsNotDeallocated() {
     // Restore the original _CRTDBG_ALLOC_MEM_DF flag
-    _CrtSetDbgFlag(old_crtdbg_flag_);
+    (void)_CrtSetDbgFlag(old_crtdbg_flag_);
   }
 
  private:
   int old_crtdbg_flag_;
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(MemoryIsNotDeallocated);
+  MemoryIsNotDeallocated(const MemoryIsNotDeallocated&) = delete;
+  MemoryIsNotDeallocated& operator=(const MemoryIsNotDeallocated&) = delete;
 };
 #endif  // _MSC_VER
 
@@ -433,15 +392,13 @@ void Mutex::ThreadSafeLazyInit() {
         ::InitializeCriticalSection(critical_section_);
         // Updates the critical_section_init_phase_ to 2 to signal
         // initialization complete.
-        GTEST_CHECK_(::InterlockedCompareExchange(
-                          &critical_section_init_phase_, 2L, 1L) ==
-                      1L);
+        GTEST_CHECK_(::InterlockedCompareExchange(&critical_section_init_phase_,
+                                                  2L, 1L) == 1L);
         break;
       case 1:
         // Somebody else is already initializing the mutex; spin until they
         // are done.
-        while (::InterlockedCompareExchange(&critical_section_init_phase_,
-                                            2L,
+        while (::InterlockedCompareExchange(&critical_section_init_phase_, 2L,
                                             2L) != 2L) {
           // Possibly yields the rest of the thread's time slice to other
           // threads.
@@ -486,9 +443,7 @@ class ThreadWithParamSupport : public ThreadWithParamBase {
  private:
   struct ThreadMainParam {
     ThreadMainParam(Runnable* runnable, Notification* thread_can_start)
-        : runnable_(runnable),
-          thread_can_start_(thread_can_start) {
-    }
+        : runnable_(runnable), thread_can_start_(thread_can_start) {}
     std::unique_ptr<Runnable> runnable_;
     // Does not own.
     Notification* thread_can_start_;
@@ -506,20 +461,18 @@ class ThreadWithParamSupport : public ThreadWithParamBase {
   // Prohibit instantiation.
   ThreadWithParamSupport();
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParamSupport);
+  ThreadWithParamSupport(const ThreadWithParamSupport&) = delete;
+  ThreadWithParamSupport& operator=(const ThreadWithParamSupport&) = delete;
 };
 
 }  // namespace
 
-ThreadWithParamBase::ThreadWithParamBase(Runnable *runnable,
+ThreadWithParamBase::ThreadWithParamBase(Runnable* runnable,
                                          Notification* thread_can_start)
-      : thread_(ThreadWithParamSupport::CreateThread(runnable,
-                                                     thread_can_start)) {
-}
+    : thread_(
+          ThreadWithParamSupport::CreateThread(runnable, thread_can_start)) {}
 
-ThreadWithParamBase::~ThreadWithParamBase() {
-  Join();
-}
+ThreadWithParamBase::~ThreadWithParamBase() { Join(); }
 
 void ThreadWithParamBase::Join() {
   GTEST_CHECK_(::WaitForSingleObject(thread_.Get(), INFINITE) == WAIT_OBJECT_0)
@@ -536,6 +489,9 @@ class ThreadLocalRegistryImpl {
   // Returns a value that can be used to identify the thread from other threads.
   static ThreadLocalValueHolderBase* GetValueOnCurrentThread(
       const ThreadLocalBase* thread_local_instance) {
+#ifdef _MSC_VER
+    MemoryIsNotDeallocated memory_is_not_deallocated;
+#endif  // _MSC_VER
     DWORD current_thread = ::GetCurrentThreadId();
     MutexLock lock(&mutex_);
     ThreadIdToThreadLocals* const thread_to_thread_locals =
@@ -543,8 +499,10 @@ class ThreadLocalRegistryImpl {
     ThreadIdToThreadLocals::iterator thread_local_pos =
         thread_to_thread_locals->find(current_thread);
     if (thread_local_pos == thread_to_thread_locals->end()) {
-      thread_local_pos = thread_to_thread_locals->insert(
-          std::make_pair(current_thread, ThreadLocalValues())).first;
+      thread_local_pos =
+          thread_to_thread_locals
+              ->insert(std::make_pair(current_thread, ThreadLocalValues()))
+              .first;
       StartWatcherThreadFor(current_thread);
     }
     ThreadLocalValues& thread_local_values = thread_local_pos->second;
@@ -572,9 +530,8 @@ class ThreadLocalRegistryImpl {
       ThreadIdToThreadLocals* const thread_to_thread_locals =
           GetThreadLocalsMapLocked();
       for (ThreadIdToThreadLocals::iterator it =
-          thread_to_thread_locals->begin();
-          it != thread_to_thread_locals->end();
-          ++it) {
+               thread_to_thread_locals->begin();
+           it != thread_to_thread_locals->end(); ++it) {
         ThreadLocalValues& thread_local_values = it->second;
         ThreadLocalValues::iterator value_pos =
             thread_local_values.find(thread_local_instance);
@@ -604,9 +561,8 @@ class ThreadLocalRegistryImpl {
       if (thread_local_pos != thread_to_thread_locals->end()) {
         ThreadLocalValues& thread_local_values = thread_local_pos->second;
         for (ThreadLocalValues::iterator value_pos =
-            thread_local_values.begin();
-            value_pos != thread_local_values.end();
-            ++value_pos) {
+                 thread_local_values.begin();
+             value_pos != thread_local_values.end(); ++value_pos) {
           value_holders.push_back(value_pos->second);
         }
         thread_to_thread_locals->erase(thread_local_pos);
@@ -632,9 +588,8 @@ class ThreadLocalRegistryImpl {
   static void StartWatcherThreadFor(DWORD thread_id) {
     // The returned handle will be kept in thread_map and closed by
     // watcher_thread in WatcherThreadFunc.
-    HANDLE thread = ::OpenThread(SYNCHRONIZE | THREAD_QUERY_INFORMATION,
-                                 FALSE,
-                                 thread_id);
+    HANDLE thread =
+        ::OpenThread(SYNCHRONIZE | THREAD_QUERY_INFORMATION, FALSE, thread_id);
     GTEST_CHECK_(thread != nullptr);
     // We need to pass a valid thread ID pointer into CreateThread for it
     // to work correctly under Win98.
@@ -645,7 +600,8 @@ class ThreadLocalRegistryImpl {
         &ThreadLocalRegistryImpl::WatcherThreadFunc,
         reinterpret_cast<LPVOID>(new ThreadIdAndHandle(thread_id, thread)),
         CREATE_SUSPENDED, &watcher_thread_id);
-    GTEST_CHECK_(watcher_thread != nullptr);
+    GTEST_CHECK_(watcher_thread != nullptr)
+        << "CreateThread failed with error " << ::GetLastError() << ".";
     // Give the watcher thread the same priority as ours to avoid being
     // blocked by it.
     ::SetThreadPriority(watcher_thread,
@@ -659,8 +615,7 @@ class ThreadLocalRegistryImpl {
   static DWORD WINAPI WatcherThreadFunc(LPVOID param) {
     const ThreadIdAndHandle* tah =
         reinterpret_cast<const ThreadIdAndHandle*>(param);
-    GTEST_CHECK_(
-        ::WaitForSingleObject(tah->second, INFINITE) == WAIT_OBJECT_0);
+    GTEST_CHECK_(::WaitForSingleObject(tah->second, INFINITE) == WAIT_OBJECT_0);
     OnThreadExit(tah->first);
     ::CloseHandle(tah->second);
     delete tah;
@@ -683,17 +638,18 @@ class ThreadLocalRegistryImpl {
   static Mutex thread_map_mutex_;
 };
 
-Mutex ThreadLocalRegistryImpl::mutex_(Mutex::kStaticMutex);
-Mutex ThreadLocalRegistryImpl::thread_map_mutex_(Mutex::kStaticMutex);
+Mutex ThreadLocalRegistryImpl::mutex_(Mutex::kStaticMutex);  // NOLINT
+Mutex ThreadLocalRegistryImpl::thread_map_mutex_(
+    Mutex::kStaticMutex);  // NOLINT
 
 ThreadLocalValueHolderBase* ThreadLocalRegistry::GetValueOnCurrentThread(
-      const ThreadLocalBase* thread_local_instance) {
+    const ThreadLocalBase* thread_local_instance) {
   return ThreadLocalRegistryImpl::GetValueOnCurrentThread(
       thread_local_instance);
 }
 
 void ThreadLocalRegistry::OnThreadLocalDestroyed(
-      const ThreadLocalBase* thread_local_instance) {
+    const ThreadLocalBase* thread_local_instance) {
   ThreadLocalRegistryImpl::OnThreadLocalDestroyed(thread_local_instance);
 }
 
@@ -781,7 +737,7 @@ bool IsRepeat(char ch) { return IsInSet(ch, "?*+"); }
 bool IsAsciiWhiteSpace(char ch) { return IsInSet(ch, " \f\n\r\t\v"); }
 bool IsAsciiWordChar(char ch) {
   return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') ||
-      ('0' <= ch && ch <= '9') || ch == '_';
+         ('0' <= ch && ch <= '9') || ch == '_';
 }
 
 // Returns true if and only if "\\c" is a supported escape sequence.
@@ -794,17 +750,28 @@ bool IsValidEscape(char c) {
 bool AtomMatchesChar(bool escaped, char pattern_char, char ch) {
   if (escaped) {  // "\\p" where p is pattern_char.
     switch (pattern_char) {
-      case 'd': return IsAsciiDigit(ch);
-      case 'D': return !IsAsciiDigit(ch);
-      case 'f': return ch == '\f';
-      case 'n': return ch == '\n';
-      case 'r': return ch == '\r';
-      case 's': return IsAsciiWhiteSpace(ch);
-      case 'S': return !IsAsciiWhiteSpace(ch);
-      case 't': return ch == '\t';
-      case 'v': return ch == '\v';
-      case 'w': return IsAsciiWordChar(ch);
-      case 'W': return !IsAsciiWordChar(ch);
+      case 'd':
+        return IsAsciiDigit(ch);
+      case 'D':
+        return !IsAsciiDigit(ch);
+      case 'f':
+        return ch == '\f';
+      case 'n':
+        return ch == '\n';
+      case 'r':
+        return ch == '\r';
+      case 's':
+        return IsAsciiWhiteSpace(ch);
+      case 'S':
+        return !IsAsciiWhiteSpace(ch);
+      case 't':
+        return ch == '\t';
+      case 'v':
+        return ch == '\v';
+      case 'w':
+        return IsAsciiWordChar(ch);
+      case 'W':
+        return !IsAsciiWordChar(ch);
     }
     return IsAsciiPunct(pattern_char) && pattern_char == ch;
   }
@@ -815,7 +782,8 @@ bool AtomMatchesChar(bool escaped, char pattern_char, char ch) {
 // Helper function used by ValidateRegex() to format error messages.
 static std::string FormatRegexSyntaxError(const char* regex, int index) {
   return (Message() << "Syntax error at index " << index
-          << " in simple regular expression \"" << regex << "\": ").GetString();
+                    << " in simple regular expression \"" << regex << "\": ")
+      .GetString();
 }
 
 // Generates non-fatal failures and returns false if regex is invalid;
@@ -857,12 +825,12 @@ bool ValidateRegex(const char* regex) {
                       << "'$' can only appear at the end.";
         is_valid = false;
       } else if (IsInSet(ch, "()[]{}|")) {
-        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
-                      << "'" << ch << "' is unsupported.";
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i) << "'" << ch
+                      << "' is unsupported.";
         is_valid = false;
       } else if (IsRepeat(ch) && !prev_repeatable) {
-        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
-                      << "'" << ch << "' can only follow a repeatable token.";
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i) << "'" << ch
+                      << "' can only follow a repeatable token.";
         is_valid = false;
       }
 
@@ -880,12 +848,10 @@ bool ValidateRegex(const char* regex) {
 // characters to be indexable by size_t, in which case the test will
 // probably time out anyway.  We are fine with this limitation as
 // std::string has it too.
-bool MatchRepetitionAndRegexAtHead(
-    bool escaped, char c, char repeat, const char* regex,
-    const char* str) {
+bool MatchRepetitionAndRegexAtHead(bool escaped, char c, char repeat,
+                                   const char* regex, const char* str) {
   const size_t min_count = (repeat == '+') ? 1 : 0;
-  const size_t max_count = (repeat == '?') ? 1 :
-      static_cast<size_t>(-1) - 1;
+  const size_t max_count = (repeat == '?') ? 1 : static_cast<size_t>(-1) - 1;
   // We cannot call numeric_limits::max() as it conflicts with the
   // max() macro on Windows.
 
@@ -898,8 +864,7 @@ bool MatchRepetitionAndRegexAtHead(
       // greedy match.
       return true;
     }
-    if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i]))
-      return false;
+    if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i])) return false;
   }
   return false;
 }
@@ -913,25 +878,23 @@ bool MatchRegexAtHead(const char* regex, const char* str) {
 
   // "$" only matches the end of a string.  Note that regex being
   // valid guarantees that there's nothing after "$" in it.
-  if (*regex == '$')
-    return *str == '\0';
+  if (*regex == '$') return *str == '\0';
 
   // Is the first thing in regex an escape sequence?
   const bool escaped = *regex == '\\';
-  if (escaped)
-    ++regex;
+  if (escaped) ++regex;
   if (IsRepeat(regex[1])) {
     // MatchRepetitionAndRegexAtHead() calls MatchRegexAtHead(), so
     // here's an indirect recursion.  It terminates as the regex gets
     // shorter in each recursion.
-    return MatchRepetitionAndRegexAtHead(
-        escaped, regex[0], regex[1], regex + 2, str);
+    return MatchRepetitionAndRegexAtHead(escaped, regex[0], regex[1], regex + 2,
+                                         str);
   } else {
     // regex isn't empty, isn't "$", and doesn't start with a
     // repetition.  We match the first atom of regex with the first
     // character of str and recurse.
     return (*str != '\0') && AtomMatchesChar(escaped, *regex, *str) &&
-        MatchRegexAtHead(regex + 1, str + 1);
+           MatchRegexAtHead(regex + 1, str + 1);
   }
 }
 
@@ -946,13 +909,11 @@ bool MatchRegexAtHead(const char* regex, const char* str) {
 bool MatchRegexAnywhere(const char* regex, const char* str) {
   if (regex == nullptr || str == nullptr) return false;
 
-  if (*regex == '^')
-    return MatchRegexAtHead(regex + 1, str);
+  if (*regex == '^') return MatchRegexAtHead(regex + 1, str);
 
   // A successful match can be anywhere in str.
   do {
-    if (MatchRegexAtHead(regex, str))
-      return true;
+    if (MatchRegexAtHead(regex, str)) return true;
   } while (*str++ != '\0');
   return false;
 }
@@ -1033,8 +994,8 @@ GTEST_API_ ::std::string FormatFileLocation(const char* file, int line) {
 // FormatFileLocation in order to contrast the two functions.
 // Note that FormatCompilerIndependentFileLocation() does NOT append colon
 // to the file location it produces, unlike FormatFileLocation().
-GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(
-    const char* file, int line) {
+GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file,
+                                                               int line) {
   const std::string file_name(file == nullptr ? kUnknownFile : file);
 
   if (line < 0)
@@ -1045,12 +1006,13 @@ GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(
 
 GTestLog::GTestLog(GTestLogSeverity severity, const char* file, int line)
     : severity_(severity) {
-  const char* const marker =
-      severity == GTEST_INFO ?    "[  INFO ]" :
-      severity == GTEST_WARNING ? "[WARNING]" :
-      severity == GTEST_ERROR ?   "[ ERROR ]" : "[ FATAL ]";
-  GetStream() << ::std::endl << marker << " "
-              << FormatFileLocation(file, line).c_str() << ": ";
+  const char* const marker = severity == GTEST_INFO      ? "[  INFO ]"
+                             : severity == GTEST_WARNING ? "[WARNING]"
+                             : severity == GTEST_ERROR   ? "[ ERROR ]"
+                                                         : "[ FATAL ]";
+  GetStream() << ::std::endl
+              << marker << " " << FormatFileLocation(file, line).c_str()
+              << ": ";
 }
 
 // Flushes the buffers and, if severity is GTEST_FATAL, aborts the program.
@@ -1073,27 +1035,26 @@ class CapturedStream {
  public:
   // The ctor redirects the stream to a temporary file.
   explicit CapturedStream(int fd) : fd_(fd), uncaptured_fd_(dup(fd)) {
-# if GTEST_OS_WINDOWS
-    char temp_dir_path[MAX_PATH + 1] = { '\0' };  // NOLINT
-    char temp_file_path[MAX_PATH + 1] = { '\0' };  // NOLINT
+#if GTEST_OS_WINDOWS
+    char temp_dir_path[MAX_PATH + 1] = {'\0'};   // NOLINT
+    char temp_file_path[MAX_PATH + 1] = {'\0'};  // NOLINT
 
     ::GetTempPathA(sizeof(temp_dir_path), temp_dir_path);
-    const UINT success = ::GetTempFileNameA(temp_dir_path,
-                                            "gtest_redir",
+    const UINT success = ::GetTempFileNameA(temp_dir_path, "gtest_redir",
                                             0,  // Generate unique file name.
                                             temp_file_path);
     GTEST_CHECK_(success != 0)
         << "Unable to create a temporary file in " << temp_dir_path;
     const int captured_fd = creat(temp_file_path, _S_IREAD | _S_IWRITE);
-    GTEST_CHECK_(captured_fd != -1) << "Unable to open temporary file "
-                                    << temp_file_path;
+    GTEST_CHECK_(captured_fd != -1)
+        << "Unable to open temporary file " << temp_file_path;
     filename_ = temp_file_path;
-# else
+#else
     // There's no guarantee that a test has write access to the current
-    // directory, so we create the temporary file in the /tmp directory
-    // instead. We use /tmp on most systems, and /sdcard on Android.
-    // That's because Android doesn't have /tmp.
-#  if GTEST_OS_LINUX_ANDROID
+    // directory, so we create the temporary file in a temporary directory.
+    std::string name_template;
+
+#if GTEST_OS_LINUX_ANDROID
     // Note: Android applications are expected to call the framework's
     // Context.getExternalStorageDirectory() method through JNI to get
     // the location of the world-writable SD Card directory. However,
@@ -1105,26 +1066,53 @@ class CapturedStream {
     // The location /data/local/tmp is directly accessible from native code.
     // '/sdcard' and other variants cannot be relied on, as they are not
     // guaranteed to be mounted, or may have a delay in mounting.
-    char name_template[] = "/data/local/tmp/gtest_captured_stream.XXXXXX";
-#  else
-    char name_template[] = "/tmp/captured_stream.XXXXXX";
-#  endif  // GTEST_OS_LINUX_ANDROID
-    const int captured_fd = mkstemp(name_template);
+    name_template = "/data/local/tmp/";
+#elif GTEST_OS_IOS
+    char user_temp_dir[PATH_MAX + 1];
+
+    // Documented alternative to NSTemporaryDirectory() (for obtaining creating
+    // a temporary directory) at
+    // https://developer.apple.com/library/archive/documentation/Security/Conceptual/SecureCodingGuide/Articles/RaceConditions.html#//apple_ref/doc/uid/TP40002585-SW10
+    //
+    // _CS_DARWIN_USER_TEMP_DIR (as well as _CS_DARWIN_USER_CACHE_DIR) is not
+    // documented in the confstr() man page at
+    // https://developer.apple.com/library/archive/documentation/System/Conceptual/ManPages_iPhoneOS/man3/confstr.3.html#//apple_ref/doc/man/3/confstr
+    // but are still available, according to the WebKit patches at
+    // https://trac.webkit.org/changeset/262004/webkit
+    // https://trac.webkit.org/changeset/263705/webkit
+    //
+    // The confstr() implementation falls back to getenv("TMPDIR"). See
+    // https://opensource.apple.com/source/Libc/Libc-1439.100.3/gen/confstr.c.auto.html
+    ::confstr(_CS_DARWIN_USER_TEMP_DIR, user_temp_dir, sizeof(user_temp_dir));
+
+    name_template = user_temp_dir;
+    if (name_template.back() != GTEST_PATH_SEP_[0])
+      name_template.push_back(GTEST_PATH_SEP_[0]);
+#else
+    name_template = "/tmp/";
+#endif
+    name_template.append("gtest_captured_stream.XXXXXX");
+
+    // mkstemp() modifies the string bytes in place, and does not go beyond the
+    // string's length. This results in well-defined behavior in C++17.
+    //
+    // The const_cast is needed below C++17. The constraints on std::string
+    // implementations in C++11 and above make assumption behind the const_cast
+    // fairly safe.
+    const int captured_fd = ::mkstemp(const_cast<char*>(name_template.data()));
     if (captured_fd == -1) {
       GTEST_LOG_(WARNING)
           << "Failed to create tmp file " << name_template
           << " for test; does the test have access to the /tmp directory?";
     }
-    filename_ = name_template;
-# endif  // GTEST_OS_WINDOWS
+    filename_ = std::move(name_template);
+#endif  // GTEST_OS_WINDOWS
     fflush(nullptr);
     dup2(captured_fd, fd_);
     close(captured_fd);
   }
 
-  ~CapturedStream() {
-    remove(filename_.c_str());
-  }
+  ~CapturedStream() { remove(filename_.c_str()); }
 
   std::string GetCapturedString() {
     if (uncaptured_fd_ != -1) {
@@ -1151,7 +1139,8 @@ class CapturedStream {
   // Name of the temporary file holding the stderr output.
   ::std::string filename_;
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(CapturedStream);
+  CapturedStream(const CapturedStream&) = delete;
+  CapturedStream& operator=(const CapturedStream&) = delete;
 };
 
 GTEST_DISABLE_MSC_DEPRECATED_POP_()
@@ -1179,6 +1168,15 @@ static std::string GetCapturedStream(CapturedStream** captured_stream) {
   return content;
 }
 
+#if defined(_MSC_VER) || defined(__BORLANDC__)
+// MSVC and C++Builder do not provide a definition of STDERR_FILENO.
+const int kStdOutFileno = 1;
+const int kStdErrFileno = 2;
+#else
+const int kStdOutFileno = STDOUT_FILENO;
+const int kStdErrFileno = STDERR_FILENO;
+#endif  // defined(_MSC_VER) || defined(__BORLANDC__)
+
 // Starts capturing stdout.
 void CaptureStdout() {
   CaptureStream(kStdOutFileno, "stdout", &g_captured_stdout);
@@ -1201,10 +1199,6 @@ std::string GetCapturedStderr() {
 
 #endif  // GTEST_HAS_STREAM_REDIRECTION
 
-
-
-
-
 size_t GetFileSize(FILE* file) {
   fseek(file, 0, SEEK_END);
   return static_cast<size_t>(ftell(file));
@@ -1222,7 +1216,8 @@ std::string ReadEntireFile(FILE* file) {
   // Keeps reading the file until we cannot read further or the
   // pre-determined file size is reached.
   do {
-    bytes_last_read = fread(buffer+bytes_read, 1, file_size-bytes_read, file);
+    bytes_last_read =
+        fread(buffer + bytes_read, 1, file_size - bytes_read, file);
     bytes_read += bytes_last_read;
   } while (bytes_last_read > 0 && bytes_read < file_size);
 
@@ -1286,7 +1281,7 @@ static std::string FlagToEnvVar(const char* flag) {
 // Parses 'str' for a 32-bit signed integer.  If successful, writes
 // the result to *value and returns true; otherwise leaves *value
 // unchanged and returns false.
-bool ParseInt32(const Message& src_text, const char* str, Int32* value) {
+bool ParseInt32(const Message& src_text, const char* str, int32_t* value) {
   // Parses the environment variable as a decimal integer.
   char* end = nullptr;
   const long long_value = strtol(str, &end, 10);  // NOLINT
@@ -1303,14 +1298,14 @@ bool ParseInt32(const Message& src_text, const char* str, Int32* value) {
     return false;
   }
 
-  // Is the parsed value in the range of an Int32?
-  const Int32 result = static_cast<Int32>(long_value);
+  // Is the parsed value in the range of an int32_t?
+  const auto result = static_cast<int32_t>(long_value);
   if (long_value == LONG_MAX || long_value == LONG_MIN ||
       // The parsed value overflows as a long.  (strtol() returns
       // LONG_MAX or LONG_MIN when the input overflows.)
       result != long_value
-      // The parsed value overflows as an Int32.
-      ) {
+      // The parsed value overflows as an int32_t.
+  ) {
     Message msg;
     msg << "WARNING: " << src_text
         << " is expected to be a 32-bit integer, but actually"
@@ -1342,7 +1337,7 @@ bool BoolFromGTestEnv(const char* flag, bool default_value) {
 // Reads and returns a 32-bit integer stored in the environment
 // variable corresponding to the given flag; if it isn't set or
 // doesn't represent a valid 32-bit integer, returns default_value.
-Int32 Int32FromGTestEnv(const char* flag, Int32 default_value) {
+int32_t Int32FromGTestEnv(const char* flag, int32_t default_value) {
 #if defined(GTEST_GET_INT32_FROM_ENV_)
   return GTEST_GET_INT32_FROM_ENV_(flag, default_value);
 #else
@@ -1353,9 +1348,9 @@ Int32 Int32FromGTestEnv(const char* flag, Int32 default_value) {
     return default_value;
   }
 
-  Int32 result = default_value;
-  if (!ParseInt32(Message() << "Environment variable " << env_var,
-                  string_value, &result)) {
+  int32_t result = default_value;
+  if (!ParseInt32(Message() << "Environment variable " << env_var, string_value,
+                  &result)) {
     printf("The default value %s is used.\n",
            (Message() << default_value).GetString().c_str());
     fflush(stdout);
@@ -1374,7 +1369,7 @@ Int32 Int32FromGTestEnv(const char* flag, Int32 default_value) {
 // not check that the flag is 'output'
 // In essence this checks an env variable called XML_OUTPUT_FILE
 // and if it is set we prepend "xml:" to its value, if it not set we return ""
-std::string OutputFlagAlsoCheckEnvVar(){
+std::string OutputFlagAlsoCheckEnvVar() {
   std::string default_value_for_output_flag = "";
   const char* xml_output_file_env = posix::GetEnv("XML_OUTPUT_FILE");
   if (nullptr != xml_output_file_env) {
diff --git a/third_party/googletest/src/src/gtest-printers.cc b/third_party/googletest/src/src/gtest-printers.cc
index 3337be312e..f3976d230d 100644
--- a/third_party/googletest/src/src/gtest-printers.cc
+++ b/third_party/googletest/src/src/gtest-printers.cc
@@ -27,7 +27,6 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-
 // Google Test - The Google C++ Testing and Mocking Framework
 //
 // This file implements a universal value printer that can print a
@@ -42,11 +41,16 @@
 // defines Foo.
 
 #include "gtest/gtest-printers.h"
+
 #include <stdio.h>
+
 #include <cctype>
+#include <cstdint>
 #include <cwchar>
 #include <ostream>  // NOLINT
 #include <string>
+#include <type_traits>
+
 #include "gtest/internal/gtest-port.h"
 #include "src/gtest-internal-inl.h"
 
@@ -96,15 +100,25 @@ void PrintBytesInObjectToImpl(const unsigned char* obj_bytes, size_t count,
     PrintByteSegmentInObjectTo(obj_bytes, 0, kChunkSize, os);
     *os << " ... ";
     // Rounds up to 2-byte boundary.
-    const size_t resume_pos = (count - kChunkSize + 1)/2*2;
+    const size_t resume_pos = (count - kChunkSize + 1) / 2 * 2;
     PrintByteSegmentInObjectTo(obj_bytes, resume_pos, count - resume_pos, os);
   }
   *os << ">";
 }
 
+// Helpers for widening a character to char32_t. Since the standard does not
+// specify if char / wchar_t is signed or unsigned, it is important to first
+// convert it to the unsigned type of the same width before widening it to
+// char32_t.
+template <typename CharType>
+char32_t ToChar32(CharType in) {
+  return static_cast<char32_t>(
+      static_cast<typename std::make_unsigned<CharType>::type>(in));
+}
+
 }  // namespace
 
-namespace internal2 {
+namespace internal {
 
 // Delegates to PrintBytesInObjectToImpl() to print the bytes in the
 // given object.  The delegation simplifies the implementation, which
@@ -116,36 +130,25 @@ void PrintBytesInObjectTo(const unsigned char* obj_bytes, size_t count,
   PrintBytesInObjectToImpl(obj_bytes, count, os);
 }
 
-}  // namespace internal2
-
-namespace internal {
-
 // Depending on the value of a char (or wchar_t), we print it in one
 // of three formats:
 //   - as is if it's a printable ASCII (e.g. 'a', '2', ' '),
 //   - as a hexadecimal escape sequence (e.g. '\x7F'), or
 //   - as a special escape sequence (e.g. '\r', '\n').
-enum CharFormat {
-  kAsIs,
-  kHexEscape,
-  kSpecialEscape
-};
+enum CharFormat { kAsIs, kHexEscape, kSpecialEscape };
 
 // Returns true if c is a printable ASCII character.  We test the
 // value of c directly instead of calling isprint(), which is buggy on
 // Windows Mobile.
-inline bool IsPrintableAscii(wchar_t c) {
-  return 0x20 <= c && c <= 0x7E;
-}
+inline bool IsPrintableAscii(char32_t c) { return 0x20 <= c && c <= 0x7E; }
 
-// Prints a wide or narrow char c as a character literal without the
-// quotes, escaping it when necessary; returns how c was formatted.
-// The template argument UnsignedChar is the unsigned version of Char,
-// which is the type of c.
-template <typename UnsignedChar, typename Char>
+// Prints c (of type char, char8_t, char16_t, char32_t, or wchar_t) as a
+// character literal without the quotes, escaping it when necessary; returns how
+// c was formatted.
+template <typename Char>
 static CharFormat PrintAsCharLiteralTo(Char c, ostream* os) {
-  wchar_t w_c = static_cast<wchar_t>(c);
-  switch (w_c) {
+  const char32_t u_c = ToChar32(c);
+  switch (u_c) {
     case L'\0':
       *os << "\\0";
       break;
@@ -177,13 +180,12 @@ static CharFormat PrintAsCharLiteralTo(Char c, ostream* os) {
       *os << "\\v";
       break;
     default:
-      if (IsPrintableAscii(w_c)) {
+      if (IsPrintableAscii(u_c)) {
         *os << static_cast<char>(c);
         return kAsIs;
       } else {
         ostream::fmtflags flags = os->flags();
-        *os << "\\x" << std::hex << std::uppercase
-            << static_cast<int>(static_cast<UnsignedChar>(c));
+        *os << "\\x" << std::hex << std::uppercase << static_cast<int>(u_c);
         os->flags(flags);
         return kHexEscape;
       }
@@ -191,9 +193,9 @@ static CharFormat PrintAsCharLiteralTo(Char c, ostream* os) {
   return kSpecialEscape;
 }
 
-// Prints a wchar_t c as if it's part of a string literal, escaping it when
+// Prints a char32_t c as if it's part of a string literal, escaping it when
 // necessary; returns how c was formatted.
-static CharFormat PrintAsStringLiteralTo(wchar_t c, ostream* os) {
+static CharFormat PrintAsStringLiteralTo(char32_t c, ostream* os) {
   switch (c) {
     case L'\'':
       *os << "'";
@@ -202,33 +204,60 @@ static CharFormat PrintAsStringLiteralTo(wchar_t c, ostream* os) {
       *os << "\\\"";
       return kSpecialEscape;
     default:
-      return PrintAsCharLiteralTo<wchar_t>(c, os);
+      return PrintAsCharLiteralTo(c, os);
   }
 }
 
+static const char* GetCharWidthPrefix(char) { return ""; }
+
+static const char* GetCharWidthPrefix(signed char) { return ""; }
+
+static const char* GetCharWidthPrefix(unsigned char) { return ""; }
+
+#ifdef __cpp_char8_t
+static const char* GetCharWidthPrefix(char8_t) { return "u8"; }
+#endif
+
+static const char* GetCharWidthPrefix(char16_t) { return "u"; }
+
+static const char* GetCharWidthPrefix(char32_t) { return "U"; }
+
+static const char* GetCharWidthPrefix(wchar_t) { return "L"; }
+
 // Prints a char c as if it's part of a string literal, escaping it when
 // necessary; returns how c was formatted.
 static CharFormat PrintAsStringLiteralTo(char c, ostream* os) {
-  return PrintAsStringLiteralTo(
-      static_cast<wchar_t>(static_cast<unsigned char>(c)), os);
+  return PrintAsStringLiteralTo(ToChar32(c), os);
+}
+
+#ifdef __cpp_char8_t
+static CharFormat PrintAsStringLiteralTo(char8_t c, ostream* os) {
+  return PrintAsStringLiteralTo(ToChar32(c), os);
+}
+#endif
+
+static CharFormat PrintAsStringLiteralTo(char16_t c, ostream* os) {
+  return PrintAsStringLiteralTo(ToChar32(c), os);
+}
+
+static CharFormat PrintAsStringLiteralTo(wchar_t c, ostream* os) {
+  return PrintAsStringLiteralTo(ToChar32(c), os);
 }
 
-// Prints a wide or narrow character c and its code.  '\0' is printed
-// as "'\\0'", other unprintable characters are also properly escaped
-// using the standard C++ escape sequence.  The template argument
-// UnsignedChar is the unsigned version of Char, which is the type of c.
-template <typename UnsignedChar, typename Char>
+// Prints a character c (of type char, char8_t, char16_t, char32_t, or wchar_t)
+// and its code. '\0' is printed as "'\\0'", other unprintable characters are
+// also properly escaped using the standard C++ escape sequence.
+template <typename Char>
 void PrintCharAndCodeTo(Char c, ostream* os) {
   // First, print c as a literal in the most readable form we can find.
-  *os << ((sizeof(c) > 1) ? "L'" : "'");
-  const CharFormat format = PrintAsCharLiteralTo<UnsignedChar>(c, os);
+  *os << GetCharWidthPrefix(c) << "'";
+  const CharFormat format = PrintAsCharLiteralTo(c, os);
   *os << "'";
 
   // To aid user debugging, we also print c's code in decimal, unless
   // it's 0 (in which case c was printed as '\\0', making the code
   // obvious).
-  if (c == 0)
-    return;
+  if (c == 0) return;
   *os << " (" << static_cast<int>(c);
 
   // For more convenience, we print c's code again in hexadecimal,
@@ -242,32 +271,75 @@ void PrintCharAndCodeTo(Char c, ostream* os) {
   *os << ")";
 }
 
-void PrintTo(unsigned char c, ::std::ostream* os) {
-  PrintCharAndCodeTo<unsigned char>(c, os);
-}
-void PrintTo(signed char c, ::std::ostream* os) {
-  PrintCharAndCodeTo<unsigned char>(c, os);
-}
+void PrintTo(unsigned char c, ::std::ostream* os) { PrintCharAndCodeTo(c, os); }
+void PrintTo(signed char c, ::std::ostream* os) { PrintCharAndCodeTo(c, os); }
 
 // Prints a wchar_t as a symbol if it is printable or as its internal
 // code otherwise and also as its code.  L'\0' is printed as "L'\\0'".
-void PrintTo(wchar_t wc, ostream* os) {
-  PrintCharAndCodeTo<wchar_t>(wc, os);
+void PrintTo(wchar_t wc, ostream* os) { PrintCharAndCodeTo(wc, os); }
+
+// TODO(dcheng): Consider making this delegate to PrintCharAndCodeTo() as well.
+void PrintTo(char32_t c, ::std::ostream* os) {
+  *os << std::hex << "U+" << std::uppercase << std::setfill('0') << std::setw(4)
+      << static_cast<uint32_t>(c);
 }
 
+// gcc/clang __{u,}int128_t
+#if defined(__SIZEOF_INT128__)
+void PrintTo(__uint128_t v, ::std::ostream* os) {
+  if (v == 0) {
+    *os << "0";
+    return;
+  }
+
+  // Buffer large enough for ceil(log10(2^128))==39 and the null terminator
+  char buf[40];
+  char* p = buf + sizeof(buf);
+
+  // Some configurations have a __uint128_t, but no support for built in
+  // division. Do manual long division instead.
+
+  uint64_t high = static_cast<uint64_t>(v >> 64);
+  uint64_t low = static_cast<uint64_t>(v);
+
+  *--p = 0;
+  while (high != 0 || low != 0) {
+    uint64_t high_mod = high % 10;
+    high = high / 10;
+    // This is the long division algorithm specialized for a divisor of 10 and
+    // only two elements.
+    // Notable values:
+    //   2^64 / 10 == 1844674407370955161
+    //   2^64 % 10 == 6
+    const uint64_t carry = 6 * high_mod + low % 10;
+    low = low / 10 + high_mod * 1844674407370955161 + carry / 10;
+
+    char digit = static_cast<char>(carry % 10);
+    *--p = '0' + digit;
+  }
+  *os << p;
+}
+void PrintTo(__int128_t v, ::std::ostream* os) {
+  __uint128_t uv = static_cast<__uint128_t>(v);
+  if (v < 0) {
+    *os << "-";
+    uv = -uv;
+  }
+  PrintTo(uv, os);
+}
+#endif  // __SIZEOF_INT128__
+
 // Prints the given array of characters to the ostream.  CharType must be either
-// char or wchar_t.
+// char, char8_t, char16_t, char32_t, or wchar_t.
 // The array starts at begin, the length is len, it may include '\0' characters
 // and may not be NUL-terminated.
 template <typename CharType>
-GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
-GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
-GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
-GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
-static CharFormat PrintCharsAsStringTo(
-    const CharType* begin, size_t len, ostream* os) {
-  const char* const kQuoteBegin = sizeof(CharType) == 1 ? "\"" : "L\"";
-  *os << kQuoteBegin;
+GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+    GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+        GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ static CharFormat
+        PrintCharsAsStringTo(const CharType* begin, size_t len, ostream* os) {
+  const char* const quote_prefix = GetCharWidthPrefix(*begin);
+  *os << quote_prefix << "\"";
   bool is_previous_hex = false;
   CharFormat print_format = kAsIs;
   for (size_t index = 0; index < len; ++index) {
@@ -276,7 +348,7 @@ static CharFormat PrintCharsAsStringTo(
       // Previous character is of '\x..' form and this character can be
       // interpreted as another hexadecimal digit in its number. Break string to
       // disambiguate.
-      *os << "\" " << kQuoteBegin;
+      *os << "\" " << quote_prefix << "\"";
     }
     is_previous_hex = PrintAsStringLiteralTo(cur, os) == kHexEscape;
     // Remember if any characters required hex escaping.
@@ -291,12 +363,11 @@ static CharFormat PrintCharsAsStringTo(
 // Prints a (const) char/wchar_t array of 'len' elements, starting at address
 // 'begin'.  CharType must be either char or wchar_t.
 template <typename CharType>
-GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
-GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
-GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
-GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
-static void UniversalPrintCharArray(
-    const CharType* begin, size_t len, ostream* os) {
+GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+    GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+        GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ static void
+        UniversalPrintCharArray(const CharType* begin, size_t len,
+                                ostream* os) {
   // The code
   //   const char kFoo[] = "foo";
   // generates an array of 4, not 3, elements, with the last one being '\0'.
@@ -322,22 +393,57 @@ void UniversalPrintArray(const char* begin, size_t len, ostream* os) {
   UniversalPrintCharArray(begin, len, os);
 }
 
+#ifdef __cpp_char8_t
+// Prints a (const) char8_t array of 'len' elements, starting at address
+// 'begin'.
+void UniversalPrintArray(const char8_t* begin, size_t len, ostream* os) {
+  UniversalPrintCharArray(begin, len, os);
+}
+#endif
+
+// Prints a (const) char16_t array of 'len' elements, starting at address
+// 'begin'.
+void UniversalPrintArray(const char16_t* begin, size_t len, ostream* os) {
+  UniversalPrintCharArray(begin, len, os);
+}
+
+// Prints a (const) char32_t array of 'len' elements, starting at address
+// 'begin'.
+void UniversalPrintArray(const char32_t* begin, size_t len, ostream* os) {
+  UniversalPrintCharArray(begin, len, os);
+}
+
 // Prints a (const) wchar_t array of 'len' elements, starting at address
 // 'begin'.
 void UniversalPrintArray(const wchar_t* begin, size_t len, ostream* os) {
   UniversalPrintCharArray(begin, len, os);
 }
 
-// Prints the given C string to the ostream.
-void PrintTo(const char* s, ostream* os) {
+namespace {
+
+// Prints a null-terminated C-style string to the ostream.
+template <typename Char>
+void PrintCStringTo(const Char* s, ostream* os) {
   if (s == nullptr) {
     *os << "NULL";
   } else {
     *os << ImplicitCast_<const void*>(s) << " pointing to ";
-    PrintCharsAsStringTo(s, strlen(s), os);
+    PrintCharsAsStringTo(s, std::char_traits<Char>::length(s), os);
   }
 }
 
+}  // anonymous namespace
+
+void PrintTo(const char* s, ostream* os) { PrintCStringTo(s, os); }
+
+#ifdef __cpp_char8_t
+void PrintTo(const char8_t* s, ostream* os) { PrintCStringTo(s, os); }
+#endif
+
+void PrintTo(const char16_t* s, ostream* os) { PrintCStringTo(s, os); }
+
+void PrintTo(const char32_t* s, ostream* os) { PrintCStringTo(s, os); }
+
 // MSVC compiler can be configured to define whar_t as a typedef
 // of unsigned short. Defining an overload for const wchar_t* in that case
 // would cause pointers to unsigned shorts be printed as wide strings,
@@ -346,41 +452,34 @@ void PrintTo(const char* s, ostream* os) {
 // wchar_t is implemented as a native type.
 #if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED)
 // Prints the given wide C string to the ostream.
-void PrintTo(const wchar_t* s, ostream* os) {
-  if (s == nullptr) {
-    *os << "NULL";
-  } else {
-    *os << ImplicitCast_<const void*>(s) << " pointing to ";
-    PrintCharsAsStringTo(s, wcslen(s), os);
-  }
-}
+void PrintTo(const wchar_t* s, ostream* os) { PrintCStringTo(s, os); }
 #endif  // wchar_t is native
 
 namespace {
 
 bool ContainsUnprintableControlCodes(const char* str, size_t length) {
-  const unsigned char *s = reinterpret_cast<const unsigned char *>(str);
+  const unsigned char* s = reinterpret_cast<const unsigned char*>(str);
 
   for (size_t i = 0; i < length; i++) {
     unsigned char ch = *s++;
     if (std::iscntrl(ch)) {
-        switch (ch) {
+      switch (ch) {
         case '\t':
         case '\n':
         case '\r':
           break;
         default:
           return true;
-        }
       }
+    }
   }
   return false;
 }
 
-bool IsUTF8TrailByte(unsigned char t) { return 0x80 <= t && t<= 0xbf; }
+bool IsUTF8TrailByte(unsigned char t) { return 0x80 <= t && t <= 0xbf; }
 
 bool IsValidUTF8(const char* str, size_t length) {
-  const unsigned char *s = reinterpret_cast<const unsigned char *>(str);
+  const unsigned char* s = reinterpret_cast<const unsigned char*>(str);
 
   for (size_t i = 0; i < length;) {
     unsigned char lead = s[i++];
@@ -393,15 +492,13 @@ bool IsValidUTF8(const char* str, size_t length) {
     } else if (lead <= 0xdf && (i + 1) <= length && IsUTF8TrailByte(s[i])) {
       ++i;  // 2-byte character
     } else if (0xe0 <= lead && lead <= 0xef && (i + 2) <= length &&
-               IsUTF8TrailByte(s[i]) &&
-               IsUTF8TrailByte(s[i + 1]) &&
+               IsUTF8TrailByte(s[i]) && IsUTF8TrailByte(s[i + 1]) &&
                // check for non-shortest form and surrogate
                (lead != 0xe0 || s[i] >= 0xa0) &&
                (lead != 0xed || s[i] < 0xa0)) {
       i += 2;  // 3-byte character
     } else if (0xf0 <= lead && lead <= 0xf4 && (i + 3) <= length &&
-               IsUTF8TrailByte(s[i]) &&
-               IsUTF8TrailByte(s[i + 1]) &&
+               IsUTF8TrailByte(s[i]) && IsUTF8TrailByte(s[i + 1]) &&
                IsUTF8TrailByte(s[i + 2]) &&
                // check for non-shortest form
                (lead != 0xf0 || s[i] >= 0x90) &&
@@ -425,12 +522,26 @@ void ConditionalPrintAsText(const char* str, size_t length, ostream* os) {
 
 void PrintStringTo(const ::std::string& s, ostream* os) {
   if (PrintCharsAsStringTo(s.data(), s.size(), os) == kHexEscape) {
-    if (GTEST_FLAG(print_utf8)) {
+    if (GTEST_FLAG_GET(print_utf8)) {
       ConditionalPrintAsText(s.data(), s.size(), os);
     }
   }
 }
 
+#ifdef __cpp_char8_t
+void PrintU8StringTo(const ::std::u8string& s, ostream* os) {
+  PrintCharsAsStringTo(s.data(), s.size(), os);
+}
+#endif
+
+void PrintU16StringTo(const ::std::u16string& s, ostream* os) {
+  PrintCharsAsStringTo(s.data(), s.size(), os);
+}
+
+void PrintU32StringTo(const ::std::u32string& s, ostream* os) {
+  PrintCharsAsStringTo(s.data(), s.size(), os);
+}
+
 #if GTEST_HAS_STD_WSTRING
 void PrintWideStringTo(const ::std::wstring& s, ostream* os) {
   PrintCharsAsStringTo(s.data(), s.size(), os);
diff --git a/third_party/googletest/src/src/gtest-test-part.cc b/third_party/googletest/src/src/gtest-test-part.cc
index 178317a6bc..eb7c8d1cf9 100644
--- a/third_party/googletest/src/src/gtest-test-part.cc
+++ b/third_party/googletest/src/src/gtest-test-part.cc
@@ -31,6 +31,8 @@
 // The Google C++ Testing and Mocking Framework (Google Test)
 
 #include "gtest/gtest-test-part.h"
+
+#include "gtest/internal/gtest-port.h"
 #include "src/gtest-internal-inl.h"
 
 namespace testing {
@@ -46,14 +48,14 @@ std::string TestPartResult::ExtractSummary(const char* message) {
 
 // Prints a TestPartResult object.
 std::ostream& operator<<(std::ostream& os, const TestPartResult& result) {
-  return os << result.file_name() << ":" << result.line_number() << ": "
-            << (result.type() == TestPartResult::kSuccess
-                    ? "Success"
-                    : result.type() == TestPartResult::kSkip
-                          ? "Skipped"
-                          : result.type() == TestPartResult::kFatalFailure
-                                ? "Fatal failure"
-                                : "Non-fatal failure")
+  return os << internal::FormatFileLocation(result.file_name(),
+                                            result.line_number())
+            << " "
+            << (result.type() == TestPartResult::kSuccess ? "Success"
+                : result.type() == TestPartResult::kSkip  ? "Skipped"
+                : result.type() == TestPartResult::kFatalFailure
+                    ? "Fatal failure"
+                    : "Non-fatal failure")
             << ":\n"
             << result.message() << std::endl;
 }
@@ -82,8 +84,8 @@ namespace internal {
 
 HasNewFatalFailureHelper::HasNewFatalFailureHelper()
     : has_new_fatal_failure_(false),
-      original_reporter_(GetUnitTestImpl()->
-                         GetTestPartResultReporterForCurrentThread()) {
+      original_reporter_(
+          GetUnitTestImpl()->GetTestPartResultReporterForCurrentThread()) {
   GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(this);
 }
 
@@ -94,8 +96,7 @@ HasNewFatalFailureHelper::~HasNewFatalFailureHelper() {
 
 void HasNewFatalFailureHelper::ReportTestPartResult(
     const TestPartResult& result) {
-  if (result.fatally_failed())
-    has_new_fatal_failure_ = true;
+  if (result.fatally_failed()) has_new_fatal_failure_ = true;
   original_reporter_->ReportTestPartResult(result);
 }
 
diff --git a/third_party/googletest/src/src/gtest-typed-test.cc b/third_party/googletest/src/src/gtest-typed-test.cc
index 8677caf732..a2828b83c6 100644
--- a/third_party/googletest/src/src/gtest-typed-test.cc
+++ b/third_party/googletest/src/src/gtest-typed-test.cc
@@ -27,7 +27,6 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-
 #include "gtest/gtest-typed-test.h"
 
 #include "gtest/gtest.h"
@@ -35,13 +34,10 @@
 namespace testing {
 namespace internal {
 
-#if GTEST_HAS_TYPED_TEST_P
-
 // Skips to the first non-space char in str. Returns an empty string if str
 // contains only whitespace characters.
 static const char* SkipSpaces(const char* str) {
-  while (IsSpace(*str))
-    str++;
+  while (IsSpace(*str)) str++;
   return str;
 }
 
@@ -58,7 +54,10 @@ static std::vector<std::string> SplitIntoTestNames(const char* src) {
 // registered_tests_; returns registered_tests if successful, or
 // aborts the program otherwise.
 const char* TypedTestSuitePState::VerifyRegisteredTestNames(
-    const char* file, int line, const char* registered_tests) {
+    const char* test_suite_name, const char* file, int line,
+    const char* registered_tests) {
+  RegisterTypeParameterizedTestSuite(test_suite_name, CodeLocation(file, line));
+
   typedef RegisteredTestsMap::const_iterator RegisteredTestIter;
   registered_ = true;
 
@@ -75,17 +74,7 @@ const char* TypedTestSuitePState::VerifyRegisteredTestNames(
       continue;
     }
 
-    bool found = false;
-    for (RegisteredTestIter it = registered_tests_.begin();
-         it != registered_tests_.end();
-         ++it) {
-      if (name == it->first) {
-        found = true;
-        break;
-      }
-    }
-
-    if (found) {
+    if (registered_tests_.count(name) != 0) {
       tests.insert(name);
     } else {
       errors << "No test named " << name
@@ -94,8 +83,7 @@ const char* TypedTestSuitePState::VerifyRegisteredTestNames(
   }
 
   for (RegisteredTestIter it = registered_tests_.begin();
-       it != registered_tests_.end();
-       ++it) {
+       it != registered_tests_.end(); ++it) {
     if (tests.count(it->first) == 0) {
       errors << "You forgot to list test " << it->first << ".\n";
     }
@@ -112,7 +100,5 @@ const char* TypedTestSuitePState::VerifyRegisteredTestNames(
   return registered_tests;
 }
 
-#endif  // GTEST_HAS_TYPED_TEST_P
-
 }  // namespace internal
 }  // namespace testing
diff --git a/third_party/googletest/src/src/gtest.cc b/third_party/googletest/src/src/gtest.cc
index a5b4e5ac78..6f31dd2260 100644
--- a/third_party/googletest/src/src/gtest.cc
+++ b/third_party/googletest/src/src/gtest.cc
@@ -31,11 +31,8 @@
 // The Google C++ Testing and Mocking Framework (Google Test)
 
 #include "gtest/gtest.h"
-#include "gtest/internal/custom/gtest.h"
-#include "gtest/gtest-spi.h"
 
 #include <ctype.h>
-#include <math.h>
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -44,85 +41,90 @@
 #include <wctype.h>
 
 #include <algorithm>
+#include <chrono>  // NOLINT
+#include <cmath>
+#include <cstdint>
+#include <initializer_list>
 #include <iomanip>
+#include <iterator>
 #include <limits>
 #include <list>
 #include <map>
 #include <ostream>  // NOLINT
 #include <sstream>
+#include <unordered_set>
 #include <vector>
 
-#if GTEST_OS_LINUX
+#include "gtest/gtest-assertion-result.h"
+#include "gtest/gtest-spi.h"
+#include "gtest/internal/custom/gtest.h"
 
-# define GTEST_HAS_GETTIMEOFDAY_ 1
+#if GTEST_OS_LINUX
 
-# include <fcntl.h>  // NOLINT
-# include <limits.h>  // NOLINT
-# include <sched.h>  // NOLINT
+#include <fcntl.h>   // NOLINT
+#include <limits.h>  // NOLINT
+#include <sched.h>   // NOLINT
 // Declares vsnprintf().  This header is not available on Windows.
-# include <strings.h>  // NOLINT
-# include <sys/mman.h>  // NOLINT
-# include <sys/time.h>  // NOLINT
-# include <unistd.h>  // NOLINT
-# include <string>
+#include <strings.h>   // NOLINT
+#include <sys/mman.h>  // NOLINT
+#include <sys/time.h>  // NOLINT
+#include <unistd.h>    // NOLINT
+
+#include <string>
 
 #elif GTEST_OS_ZOS
-# define GTEST_HAS_GETTIMEOFDAY_ 1
-# include <sys/time.h>  // NOLINT
+#include <sys/time.h>  // NOLINT
 
 // On z/OS we additionally need strings.h for strcasecmp.
-# include <strings.h>  // NOLINT
+#include <strings.h>   // NOLINT
 
 #elif GTEST_OS_WINDOWS_MOBILE  // We are on Windows CE.
 
-# include <windows.h>  // NOLINT
-# undef min
+#include <windows.h>  // NOLINT
+#undef min
 
 #elif GTEST_OS_WINDOWS  // We are on Windows proper.
 
-# include <windows.h>  // NOLINT
-# undef min
+#include <windows.h>  // NOLINT
+#undef min
 
-# include <crtdbg.h>  // NOLINT
-# include <debugapi.h>  // NOLINT
-# include <io.h>  // NOLINT
-# include <sys/timeb.h>  // NOLINT
-# include <sys/types.h>  // NOLINT
-# include <sys/stat.h>  // NOLINT
+#ifdef _MSC_VER
+#include <crtdbg.h>  // NOLINT
+#endif
 
-# if GTEST_OS_WINDOWS_MINGW
-// MinGW has gettimeofday() but not _ftime64().
-#  define GTEST_HAS_GETTIMEOFDAY_ 1
-#  include <sys/time.h>  // NOLINT
-# endif  // GTEST_OS_WINDOWS_MINGW
+#include <io.h>         // NOLINT
+#include <sys/stat.h>   // NOLINT
+#include <sys/timeb.h>  // NOLINT
+#include <sys/types.h>  // NOLINT
 
-#else
+#if GTEST_OS_WINDOWS_MINGW
+#include <sys/time.h>  // NOLINT
+#endif                 // GTEST_OS_WINDOWS_MINGW
 
-// Assume other platforms have gettimeofday().
-# define GTEST_HAS_GETTIMEOFDAY_ 1
+#else
 
 // cpplint thinks that the header is already included, so we want to
 // silence it.
-# include <sys/time.h>  // NOLINT
-# include <unistd.h>  // NOLINT
+#include <sys/time.h>  // NOLINT
+#include <unistd.h>    // NOLINT
 
 #endif  // GTEST_OS_LINUX
 
 #if GTEST_HAS_EXCEPTIONS
-# include <stdexcept>
+#include <stdexcept>
 #endif
 
 #if GTEST_CAN_STREAM_RESULTS_
-# include <arpa/inet.h>  // NOLINT
-# include <netdb.h>  // NOLINT
-# include <sys/socket.h>  // NOLINT
-# include <sys/types.h>  // NOLINT
+#include <arpa/inet.h>   // NOLINT
+#include <netdb.h>       // NOLINT
+#include <sys/socket.h>  // NOLINT
+#include <sys/types.h>   // NOLINT
 #endif
 
 #include "src/gtest-internal-inl.h"
 
 #if GTEST_OS_WINDOWS
-# define vsnprintf _vsnprintf
+#define vsnprintf _vsnprintf
 #endif  // GTEST_OS_WINDOWS
 
 #if GTEST_OS_MAC
@@ -135,7 +137,10 @@
 #include "absl/debugging/failure_signal_handler.h"
 #include "absl/debugging/stacktrace.h"
 #include "absl/debugging/symbolize.h"
+#include "absl/flags/parse.h"
+#include "absl/flags/usage.h"
 #include "absl/strings/str_cat.h"
+#include "absl/strings/str_replace.h"
 #endif  // GTEST_HAS_ABSL
 
 namespace testing {
@@ -181,7 +186,7 @@ const char kStackTraceMarker[] = "\nStack trace:\n";
 // is specified on the command line.
 bool g_help_flag = false;
 
-// Utilty function to Open File for Writing
+// Utility function to Open File for Writing
 static FILE* OpenFileForWriting(const std::string& output_file) {
   FILE* fileout = nullptr;
   FilePath output_file_path(output_file);
@@ -209,24 +214,44 @@ static const char* GetDefaultFilter() {
   return kUniversalFilter;
 }
 
+// Bazel passes in the argument to '--test_runner_fail_fast' via the
+// TESTBRIDGE_TEST_RUNNER_FAIL_FAST environment variable.
+static bool GetDefaultFailFast() {
+  const char* const testbridge_test_runner_fail_fast =
+      internal::posix::GetEnv("TESTBRIDGE_TEST_RUNNER_FAIL_FAST");
+  if (testbridge_test_runner_fail_fast != nullptr) {
+    return strcmp(testbridge_test_runner_fail_fast, "1") == 0;
+  }
+  return false;
+}
+
+}  // namespace testing
+
+GTEST_DEFINE_bool_(
+    fail_fast,
+    testing::internal::BoolFromGTestEnv("fail_fast",
+                                        testing::GetDefaultFailFast()),
+    "True if and only if a test failure should stop further test execution.");
+
 GTEST_DEFINE_bool_(
     also_run_disabled_tests,
-    internal::BoolFromGTestEnv("also_run_disabled_tests", false),
+    testing::internal::BoolFromGTestEnv("also_run_disabled_tests", false),
     "Run disabled tests too, in addition to the tests normally being run.");
 
 GTEST_DEFINE_bool_(
-    break_on_failure, internal::BoolFromGTestEnv("break_on_failure", false),
+    break_on_failure,
+    testing::internal::BoolFromGTestEnv("break_on_failure", false),
     "True if and only if a failed assertion should be a debugger "
     "break-point.");
 
 GTEST_DEFINE_bool_(catch_exceptions,
-                   internal::BoolFromGTestEnv("catch_exceptions", true),
+                   testing::internal::BoolFromGTestEnv("catch_exceptions",
+                                                       true),
                    "True if and only if " GTEST_NAME_
                    " should catch exceptions and treat them as test failures.");
 
 GTEST_DEFINE_string_(
-    color,
-    internal::StringFromGTestEnv("color", "auto"),
+    color, testing::internal::StringFromGTestEnv("color", "auto"),
     "Whether to use colors in the output.  Valid values: yes, no, "
     "and auto.  'auto' means to use colors if the output is "
     "being sent to a terminal and the TERM environment variable "
@@ -234,7 +259,8 @@ GTEST_DEFINE_string_(
 
 GTEST_DEFINE_string_(
     filter,
-    internal::StringFromGTestEnv("filter", GetDefaultFilter()),
+    testing::internal::StringFromGTestEnv("filter",
+                                          testing::GetDefaultFilter()),
     "A colon-separated list of glob (not regex) patterns "
     "for filtering the tests to run, optionally followed by a "
     "'-' and a : separated list of negative patterns (tests to "
@@ -243,13 +269,14 @@ GTEST_DEFINE_string_(
 
 GTEST_DEFINE_bool_(
     install_failure_signal_handler,
-    internal::BoolFromGTestEnv("install_failure_signal_handler", false),
-    "If true and supported on the current platform, " GTEST_NAME_ " should "
+    testing::internal::BoolFromGTestEnv("install_failure_signal_handler",
+                                        false),
+    "If true and supported on the current platform, " GTEST_NAME_
+    " should "
     "install a signal handler that dumps debugging information when fatal "
     "signals are raised.");
 
-GTEST_DEFINE_bool_(list_tests, false,
-                   "List all tests without running them.");
+GTEST_DEFINE_bool_(list_tests, false, "List all tests without running them.");
 
 // The net priority order after flag processing is thus:
 //   --gtest_output command line flag
@@ -258,8 +285,8 @@ GTEST_DEFINE_bool_(list_tests, false,
 //   ''
 GTEST_DEFINE_string_(
     output,
-    internal::StringFromGTestEnv("output",
-      internal::OutputFlagAlsoCheckEnvVar().c_str()),
+    testing::internal::StringFromGTestEnv(
+        "output", testing::internal::OutputFlagAlsoCheckEnvVar().c_str()),
     "A format (defaults to \"xml\" but can be specified to be \"json\"), "
     "optionally followed by a colon and an output file name or directory. "
     "A directory is indicated by a trailing pathname separator. "
@@ -269,74 +296,91 @@ GTEST_DEFINE_string_(
     "executable's name and, if necessary, made unique by adding "
     "digits.");
 
-GTEST_DEFINE_bool_(print_time, internal::BoolFromGTestEnv("print_time", true),
+GTEST_DEFINE_bool_(
+    brief, testing::internal::BoolFromGTestEnv("brief", false),
+    "True if only test failures should be displayed in text output.");
+
+GTEST_DEFINE_bool_(print_time,
+                   testing::internal::BoolFromGTestEnv("print_time", true),
                    "True if and only if " GTEST_NAME_
                    " should display elapsed time in text output.");
 
-GTEST_DEFINE_bool_(print_utf8, internal::BoolFromGTestEnv("print_utf8", true),
+GTEST_DEFINE_bool_(print_utf8,
+                   testing::internal::BoolFromGTestEnv("print_utf8", true),
                    "True if and only if " GTEST_NAME_
                    " prints UTF8 characters as text.");
 
 GTEST_DEFINE_int32_(
-    random_seed,
-    internal::Int32FromGTestEnv("random_seed", 0),
+    random_seed, testing::internal::Int32FromGTestEnv("random_seed", 0),
     "Random number seed to use when shuffling test orders.  Must be in range "
     "[1, 99999], or 0 to use a seed based on the current time.");
 
 GTEST_DEFINE_int32_(
-    repeat,
-    internal::Int32FromGTestEnv("repeat", 1),
+    repeat, testing::internal::Int32FromGTestEnv("repeat", 1),
     "How many times to repeat each test.  Specify a negative number "
     "for repeating forever.  Useful for shaking out flaky tests.");
 
+GTEST_DEFINE_bool_(
+    recreate_environments_when_repeating,
+    testing::internal::BoolFromGTestEnv("recreate_environments_when_repeating",
+                                        false),
+    "Controls whether global test environments are recreated for each repeat "
+    "of the tests. If set to false the global test environments are only set "
+    "up once, for the first iteration, and only torn down once, for the last. "
+    "Useful for shaking out flaky tests with stable, expensive test "
+    "environments. If --gtest_repeat is set to a negative number, meaning "
+    "there is no last run, the environments will always be recreated to avoid "
+    "leaks.");
+
 GTEST_DEFINE_bool_(show_internal_stack_frames, false,
                    "True if and only if " GTEST_NAME_
                    " should include internal stack frames when "
                    "printing test failure stack traces.");
 
-GTEST_DEFINE_bool_(shuffle, internal::BoolFromGTestEnv("shuffle", false),
+GTEST_DEFINE_bool_(shuffle,
+                   testing::internal::BoolFromGTestEnv("shuffle", false),
                    "True if and only if " GTEST_NAME_
                    " should randomize tests' order on every run.");
 
 GTEST_DEFINE_int32_(
     stack_trace_depth,
-    internal::Int32FromGTestEnv("stack_trace_depth", kMaxStackTraceDepth),
+    testing::internal::Int32FromGTestEnv("stack_trace_depth",
+                                         testing::kMaxStackTraceDepth),
     "The maximum number of stack frames to print when an "
     "assertion fails.  The valid range is 0 through 100, inclusive.");
 
 GTEST_DEFINE_string_(
     stream_result_to,
-    internal::StringFromGTestEnv("stream_result_to", ""),
+    testing::internal::StringFromGTestEnv("stream_result_to", ""),
     "This flag specifies the host name and the port number on which to stream "
     "test results. Example: \"localhost:555\". The flag is effective only on "
     "Linux.");
 
 GTEST_DEFINE_bool_(
     throw_on_failure,
-    internal::BoolFromGTestEnv("throw_on_failure", false),
+    testing::internal::BoolFromGTestEnv("throw_on_failure", false),
     "When this flag is specified, a failed assertion will throw an exception "
     "if exceptions are enabled or exit the program with a non-zero code "
     "otherwise. For use with an external test framework.");
 
 #if GTEST_USE_OWN_FLAGFILE_FLAG_
 GTEST_DEFINE_string_(
-    flagfile,
-    internal::StringFromGTestEnv("flagfile", ""),
+    flagfile, testing::internal::StringFromGTestEnv("flagfile", ""),
     "This flag specifies the flagfile to read command-line flags from.");
 #endif  // GTEST_USE_OWN_FLAGFILE_FLAG_
 
+namespace testing {
 namespace internal {
 
 // Generates a random number from [0, range), using a Linear
 // Congruential Generator (LCG).  Crashes if 'range' is 0 or greater
 // than kMaxRange.
-UInt32 Random::Generate(UInt32 range) {
+uint32_t Random::Generate(uint32_t range) {
   // These constants are the same as are used in glibc's rand(3).
   // Use wider types than necessary to prevent unsigned overflow diagnostics.
-  state_ = static_cast<UInt32>(1103515245ULL*state_ + 12345U) % kMaxRange;
+  state_ = static_cast<uint32_t>(1103515245ULL * state_ + 12345U) % kMaxRange;
 
-  GTEST_CHECK_(range > 0)
-      << "Cannot generate a number in the range [0, 0).";
+  GTEST_CHECK_(range > 0) << "Cannot generate a number in the range [0, 0).";
   GTEST_CHECK_(range <= kMaxRange)
       << "Generation of a number in [0, " << range << ") was requested, "
       << "but this can only generate numbers in [0, " << kMaxRange << ").";
@@ -381,26 +425,175 @@ static bool ShouldRunTestSuite(const TestSuite* test_suite) {
 }
 
 // AssertHelper constructor.
-AssertHelper::AssertHelper(TestPartResult::Type type,
-                           const char* file,
-                           int line,
-                           const char* message)
-    : data_(new AssertHelperData(type, file, line, message)) {
-}
+AssertHelper::AssertHelper(TestPartResult::Type type, const char* file,
+                           int line, const char* message)
+    : data_(new AssertHelperData(type, file, line, message)) {}
 
-AssertHelper::~AssertHelper() {
-  delete data_;
-}
+AssertHelper::~AssertHelper() { delete data_; }
 
 // Message assignment, for assertion streaming support.
 void AssertHelper::operator=(const Message& message) const {
-  UnitTest::GetInstance()->
-    AddTestPartResult(data_->type, data_->file, data_->line,
-                      AppendUserMessage(data_->message, message),
-                      UnitTest::GetInstance()->impl()
-                      ->CurrentOsStackTraceExceptTop(1)
-                      // Skips the stack frame for this function itself.
-                      );  // NOLINT
+  UnitTest::GetInstance()->AddTestPartResult(
+      data_->type, data_->file, data_->line,
+      AppendUserMessage(data_->message, message),
+      UnitTest::GetInstance()->impl()->CurrentOsStackTraceExceptTop(1)
+      // Skips the stack frame for this function itself.
+  );  // NOLINT
+}
+
+namespace {
+
+// When TEST_P is found without a matching INSTANTIATE_TEST_SUITE_P
+// to creates test cases for it, a synthetic test case is
+// inserted to report ether an error or a log message.
+//
+// This configuration bit will likely be removed at some point.
+constexpr bool kErrorOnUninstantiatedParameterizedTest = true;
+constexpr bool kErrorOnUninstantiatedTypeParameterizedTest = true;
+
+// A test that fails at a given file/line location with a given message.
+class FailureTest : public Test {
+ public:
+  explicit FailureTest(const CodeLocation& loc, std::string error_message,
+                       bool as_error)
+      : loc_(loc),
+        error_message_(std::move(error_message)),
+        as_error_(as_error) {}
+
+  void TestBody() override {
+    if (as_error_) {
+      AssertHelper(TestPartResult::kNonFatalFailure, loc_.file.c_str(),
+                   loc_.line, "") = Message() << error_message_;
+    } else {
+      std::cout << error_message_ << std::endl;
+    }
+  }
+
+ private:
+  const CodeLocation loc_;
+  const std::string error_message_;
+  const bool as_error_;
+};
+
+}  // namespace
+
+std::set<std::string>* GetIgnoredParameterizedTestSuites() {
+  return UnitTest::GetInstance()->impl()->ignored_parameterized_test_suites();
+}
+
+// Add a given test_suit to the list of them allow to go un-instantiated.
+MarkAsIgnored::MarkAsIgnored(const char* test_suite) {
+  GetIgnoredParameterizedTestSuites()->insert(test_suite);
+}
+
+// If this parameterized test suite has no instantiations (and that
+// has not been marked as okay), emit a test case reporting that.
+void InsertSyntheticTestCase(const std::string& name, CodeLocation location,
+                             bool has_test_p) {
+  const auto& ignored = *GetIgnoredParameterizedTestSuites();
+  if (ignored.find(name) != ignored.end()) return;
+
+  const char kMissingInstantiation[] =  //
+      " is defined via TEST_P, but never instantiated. None of the test cases "
+      "will run. Either no INSTANTIATE_TEST_SUITE_P is provided or the only "
+      "ones provided expand to nothing."
+      "\n\n"
+      "Ideally, TEST_P definitions should only ever be included as part of "
+      "binaries that intend to use them. (As opposed to, for example, being "
+      "placed in a library that may be linked in to get other utilities.)";
+
+  const char kMissingTestCase[] =  //
+      " is instantiated via INSTANTIATE_TEST_SUITE_P, but no tests are "
+      "defined via TEST_P . No test cases will run."
+      "\n\n"
+      "Ideally, INSTANTIATE_TEST_SUITE_P should only ever be invoked from "
+      "code that always depend on code that provides TEST_P. Failing to do "
+      "so is often an indication of dead code, e.g. the last TEST_P was "
+      "removed but the rest got left behind.";
+
+  std::string message =
+      "Parameterized test suite " + name +
+      (has_test_p ? kMissingInstantiation : kMissingTestCase) +
+      "\n\n"
+      "To suppress this error for this test suite, insert the following line "
+      "(in a non-header) in the namespace it is defined in:"
+      "\n\n"
+      "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" +
+      name + ");";
+
+  std::string full_name = "UninstantiatedParameterizedTestSuite<" + name + ">";
+  RegisterTest(  //
+      "GoogleTestVerification", full_name.c_str(),
+      nullptr,  // No type parameter.
+      nullptr,  // No value parameter.
+      location.file.c_str(), location.line, [message, location] {
+        return new FailureTest(location, message,
+                               kErrorOnUninstantiatedParameterizedTest);
+      });
+}
+
+void RegisterTypeParameterizedTestSuite(const char* test_suite_name,
+                                        CodeLocation code_location) {
+  GetUnitTestImpl()->type_parameterized_test_registry().RegisterTestSuite(
+      test_suite_name, code_location);
+}
+
+void RegisterTypeParameterizedTestSuiteInstantiation(const char* case_name) {
+  GetUnitTestImpl()->type_parameterized_test_registry().RegisterInstantiation(
+      case_name);
+}
+
+void TypeParameterizedTestSuiteRegistry::RegisterTestSuite(
+    const char* test_suite_name, CodeLocation code_location) {
+  suites_.emplace(std::string(test_suite_name),
+                  TypeParameterizedTestSuiteInfo(code_location));
+}
+
+void TypeParameterizedTestSuiteRegistry::RegisterInstantiation(
+    const char* test_suite_name) {
+  auto it = suites_.find(std::string(test_suite_name));
+  if (it != suites_.end()) {
+    it->second.instantiated = true;
+  } else {
+    GTEST_LOG_(ERROR) << "Unknown type parameterized test suit '"
+                      << test_suite_name << "'";
+  }
+}
+
+void TypeParameterizedTestSuiteRegistry::CheckForInstantiations() {
+  const auto& ignored = *GetIgnoredParameterizedTestSuites();
+  for (const auto& testcase : suites_) {
+    if (testcase.second.instantiated) continue;
+    if (ignored.find(testcase.first) != ignored.end()) continue;
+
+    std::string message =
+        "Type parameterized test suite " + testcase.first +
+        " is defined via REGISTER_TYPED_TEST_SUITE_P, but never instantiated "
+        "via INSTANTIATE_TYPED_TEST_SUITE_P. None of the test cases will run."
+        "\n\n"
+        "Ideally, TYPED_TEST_P definitions should only ever be included as "
+        "part of binaries that intend to use them. (As opposed to, for "
+        "example, being placed in a library that may be linked in to get other "
+        "utilities.)"
+        "\n\n"
+        "To suppress this error for this test suite, insert the following line "
+        "(in a non-header) in the namespace it is defined in:"
+        "\n\n"
+        "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" +
+        testcase.first + ");";
+
+    std::string full_name =
+        "UninstantiatedTypeParameterizedTestSuite<" + testcase.first + ">";
+    RegisterTest(  //
+        "GoogleTestVerification", full_name.c_str(),
+        nullptr,  // No type parameter.
+        nullptr,  // No value parameter.
+        testcase.second.code_location.file.c_str(),
+        testcase.second.code_location.line, [message, testcase] {
+          return new FailureTest(testcase.second.code_location, message,
+                                 kErrorOnUninstantiatedTypeParameterizedTest);
+        });
+  }
 }
 
 // A copy of all command line arguments.  Set by InitGoogleTest().
@@ -435,7 +628,8 @@ FilePath GetCurrentExecutableName() {
 
 // Returns the output format, or "" for normal printed output.
 std::string UnitTestOptions::GetOutputFormat() {
-  const char* const gtest_output_flag = GTEST_FLAG(output).c_str();
+  std::string s = GTEST_FLAG_GET(output);
+  const char* const gtest_output_flag = s.c_str();
   const char* const colon = strchr(gtest_output_flag, ':');
   return (colon == nullptr)
              ? std::string(gtest_output_flag)
@@ -446,19 +640,19 @@ std::string UnitTestOptions::GetOutputFormat() {
 // Returns the name of the requested output file, or the default if none
 // was explicitly specified.
 std::string UnitTestOptions::GetAbsolutePathToOutputFile() {
-  const char* const gtest_output_flag = GTEST_FLAG(output).c_str();
+  std::string s = GTEST_FLAG_GET(output);
+  const char* const gtest_output_flag = s.c_str();
 
   std::string format = GetOutputFormat();
-  if (format.empty())
-    format = std::string(kDefaultOutputFormat);
+  if (format.empty()) format = std::string(kDefaultOutputFormat);
 
   const char* const colon = strchr(gtest_output_flag, ':');
   if (colon == nullptr)
     return internal::FilePath::MakeFileName(
-        internal::FilePath(
-            UnitTest::GetInstance()->original_working_dir()),
-        internal::FilePath(kDefaultOutputFile), 0,
-        format.c_str()).string();
+               internal::FilePath(
+                   UnitTest::GetInstance()->original_working_dir()),
+               internal::FilePath(kDefaultOutputFile), 0, format.c_str())
+        .string();
 
   internal::FilePath output_name(colon + 1);
   if (!output_name.IsAbsolutePath())
@@ -466,8 +660,7 @@ std::string UnitTestOptions::GetAbsolutePathToOutputFile() {
         internal::FilePath(UnitTest::GetInstance()->original_working_dir()),
         internal::FilePath(colon + 1));
 
-  if (!output_name.IsDirectory())
-    return output_name.string();
+  if (!output_name.IsDirectory()) return output_name.string();
 
   internal::FilePath result(internal::FilePath::GenerateUniqueFileName(
       output_name, internal::GetCurrentExecutableName(),
@@ -475,77 +668,172 @@ std::string UnitTestOptions::GetAbsolutePathToOutputFile() {
   return result.string();
 }
 
-// Returns true if and only if the wildcard pattern matches the string.
-// The first ':' or '\0' character in pattern marks the end of it.
+// Returns true if and only if the wildcard pattern matches the string. Each
+// pattern consists of regular characters, single-character wildcards (?), and
+// multi-character wildcards (*).
 //
-// This recursive algorithm isn't very efficient, but is clear and
-// works well enough for matching test names, which are short.
-bool UnitTestOptions::PatternMatchesString(const char *pattern,
-                                           const char *str) {
-  switch (*pattern) {
-    case '\0':
-    case ':':  // Either ':' or '\0' marks the end of the pattern.
-      return *str == '\0';
-    case '?':  // Matches any single character.
-      return *str != '\0' && PatternMatchesString(pattern + 1, str + 1);
-    case '*':  // Matches any string (possibly empty) of characters.
-      return (*str != '\0' && PatternMatchesString(pattern, str + 1)) ||
-          PatternMatchesString(pattern + 1, str);
-    default:  // Non-special character.  Matches itself.
-      return *pattern == *str &&
-          PatternMatchesString(pattern + 1, str + 1);
-  }
-}
-
-bool UnitTestOptions::MatchesFilter(
-    const std::string& name, const char* filter) {
-  const char *cur_pattern = filter;
-  for (;;) {
-    if (PatternMatchesString(cur_pattern, name.c_str())) {
-      return true;
+// This function implements a linear-time string globbing algorithm based on
+// https://research.swtch.com/glob.
+static bool PatternMatchesString(const std::string& name_str,
+                                 const char* pattern, const char* pattern_end) {
+  const char* name = name_str.c_str();
+  const char* const name_begin = name;
+  const char* const name_end = name + name_str.size();
+
+  const char* pattern_next = pattern;
+  const char* name_next = name;
+
+  while (pattern < pattern_end || name < name_end) {
+    if (pattern < pattern_end) {
+      switch (*pattern) {
+        default:  // Match an ordinary character.
+          if (name < name_end && *name == *pattern) {
+            ++pattern;
+            ++name;
+            continue;
+          }
+          break;
+        case '?':  // Match any single character.
+          if (name < name_end) {
+            ++pattern;
+            ++name;
+            continue;
+          }
+          break;
+        case '*':
+          // Match zero or more characters. Start by skipping over the wildcard
+          // and matching zero characters from name. If that fails, restart and
+          // match one more character than the last attempt.
+          pattern_next = pattern;
+          name_next = name + 1;
+          ++pattern;
+          continue;
+      }
+    }
+    // Failed to match a character. Restart if possible.
+    if (name_begin < name_next && name_next <= name_end) {
+      pattern = pattern_next;
+      name = name_next;
+      continue;
     }
+    return false;
+  }
+  return true;
+}
 
-    // Finds the next pattern in the filter.
-    cur_pattern = strchr(cur_pattern, ':');
+namespace {
+
+bool IsGlobPattern(const std::string& pattern) {
+  return std::any_of(pattern.begin(), pattern.end(),
+                     [](const char c) { return c == '?' || c == '*'; });
+}
+
+class UnitTestFilter {
+ public:
+  UnitTestFilter() = default;
+
+  // Constructs a filter from a string of patterns separated by `:`.
+  explicit UnitTestFilter(const std::string& filter) {
+    // By design "" filter matches "" string.
+    std::vector<std::string> all_patterns;
+    SplitString(filter, ':', &all_patterns);
+    const auto exact_match_patterns_begin = std::partition(
+        all_patterns.begin(), all_patterns.end(), &IsGlobPattern);
+
+    glob_patterns_.reserve(static_cast<size_t>(
+        std::distance(all_patterns.begin(), exact_match_patterns_begin)));
+    std::move(all_patterns.begin(), exact_match_patterns_begin,
+              std::inserter(glob_patterns_, glob_patterns_.begin()));
+    std::move(
+        exact_match_patterns_begin, all_patterns.end(),
+        std::inserter(exact_match_patterns_, exact_match_patterns_.begin()));
+  }
+
+  // Returns true if and only if name matches at least one of the patterns in
+  // the filter.
+  bool MatchesName(const std::string& name) const {
+    return exact_match_patterns_.count(name) > 0 ||
+           std::any_of(glob_patterns_.begin(), glob_patterns_.end(),
+                       [&name](const std::string& pattern) {
+                         return PatternMatchesString(
+                             name, pattern.c_str(),
+                             pattern.c_str() + pattern.size());
+                       });
+  }
 
-    // Returns if no more pattern can be found.
-    if (cur_pattern == nullptr) {
-      return false;
+ private:
+  std::vector<std::string> glob_patterns_;
+  std::unordered_set<std::string> exact_match_patterns_;
+};
+
+class PositiveAndNegativeUnitTestFilter {
+ public:
+  // Constructs a positive and a negative filter from a string. The string
+  // contains a positive filter optionally followed by a '-' character and a
+  // negative filter. In case only a negative filter is provided the positive
+  // filter will be assumed "*".
+  // A filter is a list of patterns separated by ':'.
+  explicit PositiveAndNegativeUnitTestFilter(const std::string& filter) {
+    std::vector<std::string> positive_and_negative_filters;
+
+    // NOTE: `SplitString` always returns a non-empty container.
+    SplitString(filter, '-', &positive_and_negative_filters);
+    const auto& positive_filter = positive_and_negative_filters.front();
+
+    if (positive_and_negative_filters.size() > 1) {
+      positive_filter_ = UnitTestFilter(
+          positive_filter.empty() ? kUniversalFilter : positive_filter);
+
+      // TODO(b/214626361): Fail on multiple '-' characters
+      // For the moment to preserve old behavior we concatenate the rest of the
+      // string parts with `-` as separator to generate the negative filter.
+      auto negative_filter_string = positive_and_negative_filters[1];
+      for (std::size_t i = 2; i < positive_and_negative_filters.size(); i++)
+        negative_filter_string =
+            negative_filter_string + '-' + positive_and_negative_filters[i];
+      negative_filter_ = UnitTestFilter(negative_filter_string);
+    } else {
+      // In case we don't have a negative filter and positive filter is ""
+      // we do not use kUniversalFilter by design as opposed to when we have a
+      // negative filter.
+      positive_filter_ = UnitTestFilter(positive_filter);
     }
+  }
 
-    // Skips the pattern separater (the ':' character).
-    cur_pattern++;
+  // Returns true if and only if test name (this is generated by appending test
+  // suit name and test name via a '.' character) matches the positive filter
+  // and does not match the negative filter.
+  bool MatchesTest(const std::string& test_suite_name,
+                   const std::string& test_name) const {
+    return MatchesName(test_suite_name + "." + test_name);
   }
+
+  // Returns true if and only if name matches the positive filter and does not
+  // match the negative filter.
+  bool MatchesName(const std::string& name) const {
+    return positive_filter_.MatchesName(name) &&
+           !negative_filter_.MatchesName(name);
+  }
+
+ private:
+  UnitTestFilter positive_filter_;
+  UnitTestFilter negative_filter_;
+};
+}  // namespace
+
+bool UnitTestOptions::MatchesFilter(const std::string& name_str,
+                                    const char* filter) {
+  return UnitTestFilter(filter).MatchesName(name_str);
 }
 
 // Returns true if and only if the user-specified filter matches the test
 // suite name and the test name.
 bool UnitTestOptions::FilterMatchesTest(const std::string& test_suite_name,
                                         const std::string& test_name) {
-  const std::string& full_name = test_suite_name + "." + test_name.c_str();
-
   // Split --gtest_filter at '-', if there is one, to separate into
   // positive filter and negative filter portions
-  const char* const p = GTEST_FLAG(filter).c_str();
-  const char* const dash = strchr(p, '-');
-  std::string positive;
-  std::string negative;
-  if (dash == nullptr) {
-    positive = GTEST_FLAG(filter).c_str();  // Whole string is a positive filter
-    negative = "";
-  } else {
-    positive = std::string(p, dash);   // Everything up to the dash
-    negative = std::string(dash + 1);  // Everything after the dash
-    if (positive.empty()) {
-      // Treat '-test1' as the same as '*-test1'
-      positive = kUniversalFilter;
-    }
-  }
-
-  // A filter is a colon-separated list of patterns.  It matches a
-  // test if any pattern in it matches the test.
-  return (MatchesFilter(full_name, positive.c_str()) &&
-          !MatchesFilter(full_name, negative.c_str()));
+  return PositiveAndNegativeUnitTestFilter(GTEST_FLAG_GET(filter))
+      .MatchesTest(test_suite_name, test_name);
 }
 
 #if GTEST_HAS_SEH
@@ -565,7 +853,7 @@ int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) {
 
   bool should_handle = true;
 
-  if (!GTEST_FLAG(catch_exceptions))
+  if (!GTEST_FLAG_GET(catch_exceptions))
     should_handle = false;
   else if (exception_code == EXCEPTION_BREAKPOINT)
     should_handle = false;
@@ -583,8 +871,7 @@ int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) {
 // results. Intercepts only failures from the current thread.
 ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
     TestPartResultArray* result)
-    : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD),
-      result_(result) {
+    : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD), result_(result) {
   Init();
 }
 
@@ -593,8 +880,7 @@ ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
 // results.
 ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
     InterceptMode intercept_mode, TestPartResultArray* result)
-    : intercept_mode_(intercept_mode),
-      result_(result) {
+    : intercept_mode_(intercept_mode), result_(result) {
   Init();
 }
 
@@ -638,9 +924,7 @@ namespace internal {
 // from user test code.  GetTestTypeId() is guaranteed to always
 // return the same value, as it always calls GetTypeId<>() from the
 // gtest.cc, which is within the Google Test framework.
-TypeId GetTestTypeId() {
-  return GetTypeId<Test>();
-}
+TypeId GetTestTypeId() { return GetTypeId<Test>(); }
 
 // The value of GetTestTypeId() as seen from within the Google Test
 // library.  This is solely for testing GetTestTypeId().
@@ -655,9 +939,9 @@ static AssertionResult HasOneFailure(const char* /* results_expr */,
                                      const TestPartResultArray& results,
                                      TestPartResult::Type type,
                                      const std::string& substr) {
-  const std::string expected(type == TestPartResult::kFatalFailure ?
-                        "1 fatal failure" :
-                        "1 non-fatal failure");
+  const std::string expected(type == TestPartResult::kFatalFailure
+                                 ? "1 fatal failure"
+                                 : "1 non-fatal failure");
   Message msg;
   if (results.size() != 1) {
     msg << "Expected: " << expected << "\n"
@@ -676,10 +960,10 @@ static AssertionResult HasOneFailure(const char* /* results_expr */,
   }
 
   if (strstr(r.message(), substr.c_str()) == nullptr) {
-    return AssertionFailure() << "Expected: " << expected << " containing \""
-                              << substr << "\"\n"
-                              << "  Actual:\n"
-                              << r;
+    return AssertionFailure()
+           << "Expected: " << expected << " containing \"" << substr << "\"\n"
+           << "  Actual:\n"
+           << r;
   }
 
   return AssertionSuccess();
@@ -702,7 +986,8 @@ SingleFailureChecker::~SingleFailureChecker() {
 }
 
 DefaultGlobalTestPartResultReporter::DefaultGlobalTestPartResultReporter(
-    UnitTestImpl* unit_test) : unit_test_(unit_test) {}
+    UnitTestImpl* unit_test)
+    : unit_test_(unit_test) {}
 
 void DefaultGlobalTestPartResultReporter::ReportTestPartResult(
     const TestPartResult& result) {
@@ -711,7 +996,8 @@ void DefaultGlobalTestPartResultReporter::ReportTestPartResult(
 }
 
 DefaultPerThreadTestPartResultReporter::DefaultPerThreadTestPartResultReporter(
-    UnitTestImpl* unit_test) : unit_test_(unit_test) {}
+    UnitTestImpl* unit_test)
+    : unit_test_(unit_test) {}
 
 void DefaultPerThreadTestPartResultReporter::ReportTestPartResult(
     const TestPartResult& result) {
@@ -818,51 +1104,36 @@ int UnitTestImpl::test_to_run_count() const {
 // trace but Bar() and CurrentOsStackTraceExceptTop() won't.
 std::string UnitTestImpl::CurrentOsStackTraceExceptTop(int skip_count) {
   return os_stack_trace_getter()->CurrentStackTrace(
-      static_cast<int>(GTEST_FLAG(stack_trace_depth)),
-      skip_count + 1
+      static_cast<int>(GTEST_FLAG_GET(stack_trace_depth)), skip_count + 1
       // Skips the user-specified number of frames plus this function
       // itself.
-      );  // NOLINT
+  );  // NOLINT
 }
 
-// Returns the current time in milliseconds.
+// A helper class for measuring elapsed times.
+class Timer {
+ public:
+  Timer() : start_(std::chrono::steady_clock::now()) {}
+
+  // Return time elapsed in milliseconds since the timer was created.
+  TimeInMillis Elapsed() {
+    return std::chrono::duration_cast<std::chrono::milliseconds>(
+               std::chrono::steady_clock::now() - start_)
+        .count();
+  }
+
+ private:
+  std::chrono::steady_clock::time_point start_;
+};
+
+// Returns a timestamp as milliseconds since the epoch. Note this time may jump
+// around subject to adjustments by the system, to measure elapsed time use
+// Timer instead.
 TimeInMillis GetTimeInMillis() {
-#if GTEST_OS_WINDOWS_MOBILE || defined(__BORLANDC__)
-  // Difference between 1970-01-01 and 1601-01-01 in milliseconds.
-  // http://analogous.blogspot.com/2005/04/epoch.html
-  const TimeInMillis kJavaEpochToWinFileTimeDelta =
-    static_cast<TimeInMillis>(116444736UL) * 100000UL;
-  const DWORD kTenthMicrosInMilliSecond = 10000;
-
-  SYSTEMTIME now_systime;
-  FILETIME now_filetime;
-  ULARGE_INTEGER now_int64;
-  GetSystemTime(&now_systime);
-  if (SystemTimeToFileTime(&now_systime, &now_filetime)) {
-    now_int64.LowPart = now_filetime.dwLowDateTime;
-    now_int64.HighPart = now_filetime.dwHighDateTime;
-    now_int64.QuadPart = (now_int64.QuadPart / kTenthMicrosInMilliSecond) -
-      kJavaEpochToWinFileTimeDelta;
-    return now_int64.QuadPart;
-  }
-  return 0;
-#elif GTEST_OS_WINDOWS && !GTEST_HAS_GETTIMEOFDAY_
-  __timeb64 now;
-
-  // MSVC 8 deprecates _ftime64(), so we want to suppress warning 4996
-  // (deprecated function) there.
-  GTEST_DISABLE_MSC_DEPRECATED_PUSH_()
-  _ftime64(&now);
-  GTEST_DISABLE_MSC_DEPRECATED_POP_()
-
-  return static_cast<TimeInMillis>(now.time) * 1000 + now.millitm;
-#elif GTEST_HAS_GETTIMEOFDAY_
-  struct timeval now;
-  gettimeofday(&now, nullptr);
-  return static_cast<TimeInMillis>(now.tv_sec) * 1000 + now.tv_usec / 1000;
-#else
-# error "Don't know how to get the current time on your system."
-#endif
+  return std::chrono::duration_cast<std::chrono::milliseconds>(
+             std::chrono::system_clock::now() -
+             std::chrono::system_clock::from_time_t(0))
+      .count();
 }
 
 // Utilities
@@ -880,8 +1151,7 @@ LPCWSTR String::AnsiToUtf16(const char* ansi) {
   const int unicode_length =
       MultiByteToWideChar(CP_ACP, 0, ansi, length, nullptr, 0);
   WCHAR* unicode = new WCHAR[unicode_length + 1];
-  MultiByteToWideChar(CP_ACP, 0, ansi, length,
-                      unicode, unicode_length);
+  MultiByteToWideChar(CP_ACP, 0, ansi, length, unicode, unicode_length);
   unicode[unicode_length] = 0;
   return unicode;
 }
@@ -890,7 +1160,7 @@ LPCWSTR String::AnsiToUtf16(const char* ansi) {
 // memory using new. The caller is responsible for deleting the return
 // value using delete[]. Returns the ANSI string, or NULL if the
 // input is NULL.
-const char* String::Utf16ToAnsi(LPCWSTR utf16_str)  {
+const char* String::Utf16ToAnsi(LPCWSTR utf16_str) {
   if (!utf16_str) return nullptr;
   const int ansi_length = WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, nullptr,
                                               0, nullptr, nullptr);
@@ -909,7 +1179,7 @@ const char* String::Utf16ToAnsi(LPCWSTR utf16_str)  {
 // Unlike strcmp(), this function can handle NULL argument(s).  A NULL
 // C string is considered different to any non-NULL C string,
 // including the empty string.
-bool String::CStringEquals(const char * lhs, const char * rhs) {
+bool String::CStringEquals(const char* lhs, const char* rhs) {
   if (lhs == nullptr) return rhs == nullptr;
 
   if (rhs == nullptr) return false;
@@ -923,11 +1193,10 @@ bool String::CStringEquals(const char * lhs, const char * rhs) {
 // encoding, and streams the result to the given Message object.
 static void StreamWideCharsToMessage(const wchar_t* wstr, size_t length,
                                      Message* msg) {
-  for (size_t i = 0; i != length; ) {  // NOLINT
+  for (size_t i = 0; i != length;) {  // NOLINT
     if (wstr[i] != L'\0') {
       *msg << WideStringToUtf8(wstr + i, static_cast<int>(length - i));
-      while (i != length && wstr[i] != L'\0')
-        i++;
+      while (i != length && wstr[i] != L'\0') i++;
     } else {
       *msg << '\0';
       i++;
@@ -969,17 +1238,17 @@ Message::Message() : ss_(new ::std::stringstream) {
 
 // These two overloads allow streaming a wide C string to a Message
 // using the UTF-8 encoding.
-Message& Message::operator <<(const wchar_t* wide_c_str) {
+Message& Message::operator<<(const wchar_t* wide_c_str) {
   return *this << internal::String::ShowWideCString(wide_c_str);
 }
-Message& Message::operator <<(wchar_t* wide_c_str) {
+Message& Message::operator<<(wchar_t* wide_c_str) {
   return *this << internal::String::ShowWideCString(wide_c_str);
 }
 
 #if GTEST_HAS_STD_WSTRING
 // Converts the given wide string to a narrow string using the UTF-8
 // encoding, and streams the result to this Message object.
-Message& Message::operator <<(const ::std::wstring& wstr) {
+Message& Message::operator<<(const ::std::wstring& wstr) {
   internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this);
   return *this;
 }
@@ -991,44 +1260,6 @@ std::string Message::GetString() const {
   return internal::StringStreamToString(ss_.get());
 }
 
-// AssertionResult constructors.
-// Used in EXPECT_TRUE/FALSE(assertion_result).
-AssertionResult::AssertionResult(const AssertionResult& other)
-    : success_(other.success_),
-      message_(other.message_.get() != nullptr
-                   ? new ::std::string(*other.message_)
-                   : static_cast< ::std::string*>(nullptr)) {}
-
-// Swaps two AssertionResults.
-void AssertionResult::swap(AssertionResult& other) {
-  using std::swap;
-  swap(success_, other.success_);
-  swap(message_, other.message_);
-}
-
-// Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
-AssertionResult AssertionResult::operator!() const {
-  AssertionResult negation(!success_);
-  if (message_.get() != nullptr) negation << *message_;
-  return negation;
-}
-
-// Makes a successful assertion result.
-AssertionResult AssertionSuccess() {
-  return AssertionResult(true);
-}
-
-// Makes a failed assertion result.
-AssertionResult AssertionFailure() {
-  return AssertionResult(false);
-}
-
-// Makes a failed assertion result with the given failure message.
-// Deprecated; use AssertionFailure() << message.
-AssertionResult AssertionFailure(const Message& message) {
-  return AssertionFailure() << message;
-}
-
 namespace internal {
 
 namespace edit_distance {
@@ -1320,8 +1551,7 @@ std::vector<std::string> SplitEscapedString(const std::string& str) {
 AssertionResult EqFailure(const char* lhs_expression,
                           const char* rhs_expression,
                           const std::string& lhs_value,
-                          const std::string& rhs_value,
-                          bool ignoring_case) {
+                          const std::string& rhs_value, bool ignoring_case) {
   Message msg;
   msg << "Expected equality of these values:";
   msg << "\n  " << lhs_expression;
@@ -1338,10 +1568,8 @@ AssertionResult EqFailure(const char* lhs_expression,
   }
 
   if (!lhs_value.empty() && !rhs_value.empty()) {
-    const std::vector<std::string> lhs_lines =
-        SplitEscapedString(lhs_value);
-    const std::vector<std::string> rhs_lines =
-        SplitEscapedString(rhs_value);
+    const std::vector<std::string> lhs_lines = SplitEscapedString(lhs_value);
+    const std::vector<std::string> rhs_lines = SplitEscapedString(rhs_value);
     if (lhs_lines.size() > 1 || rhs_lines.size() > 1) {
       msg << "\nWith diff:\n"
           << edit_distance::CreateUnifiedDiff(lhs_lines, rhs_lines);
@@ -1353,45 +1581,61 @@ AssertionResult EqFailure(const char* lhs_expression,
 
 // Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
 std::string GetBoolAssertionFailureMessage(
-    const AssertionResult& assertion_result,
-    const char* expression_text,
-    const char* actual_predicate_value,
-    const char* expected_predicate_value) {
+    const AssertionResult& assertion_result, const char* expression_text,
+    const char* actual_predicate_value, const char* expected_predicate_value) {
   const char* actual_message = assertion_result.message();
   Message msg;
   msg << "Value of: " << expression_text
       << "\n  Actual: " << actual_predicate_value;
-  if (actual_message[0] != '\0')
-    msg << " (" << actual_message << ")";
+  if (actual_message[0] != '\0') msg << " (" << actual_message << ")";
   msg << "\nExpected: " << expected_predicate_value;
   return msg.GetString();
 }
 
 // Helper function for implementing ASSERT_NEAR.
-AssertionResult DoubleNearPredFormat(const char* expr1,
-                                     const char* expr2,
-                                     const char* abs_error_expr,
-                                     double val1,
-                                     double val2,
-                                     double abs_error) {
+AssertionResult DoubleNearPredFormat(const char* expr1, const char* expr2,
+                                     const char* abs_error_expr, double val1,
+                                     double val2, double abs_error) {
   const double diff = fabs(val1 - val2);
   if (diff <= abs_error) return AssertionSuccess();
 
+  // Find the value which is closest to zero.
+  const double min_abs = std::min(fabs(val1), fabs(val2));
+  // Find the distance to the next double from that value.
+  const double epsilon =
+      nextafter(min_abs, std::numeric_limits<double>::infinity()) - min_abs;
+  // Detect the case where abs_error is so small that EXPECT_NEAR is
+  // effectively the same as EXPECT_EQUAL, and give an informative error
+  // message so that the situation can be more easily understood without
+  // requiring exotic floating-point knowledge.
+  // Don't do an epsilon check if abs_error is zero because that implies
+  // that an equality check was actually intended.
+  if (!(std::isnan)(val1) && !(std::isnan)(val2) && abs_error > 0 &&
+      abs_error < epsilon) {
+    return AssertionFailure()
+           << "The difference between " << expr1 << " and " << expr2 << " is "
+           << diff << ", where\n"
+           << expr1 << " evaluates to " << val1 << ",\n"
+           << expr2 << " evaluates to " << val2 << ".\nThe abs_error parameter "
+           << abs_error_expr << " evaluates to " << abs_error
+           << " which is smaller than the minimum distance between doubles for "
+              "numbers of this magnitude which is "
+           << epsilon
+           << ", thus making this EXPECT_NEAR check equivalent to "
+              "EXPECT_EQUAL. Consider using EXPECT_DOUBLE_EQ instead.";
+  }
   return AssertionFailure()
-      << "The difference between " << expr1 << " and " << expr2
-      << " is " << diff << ", which exceeds " << abs_error_expr << ", where\n"
-      << expr1 << " evaluates to " << val1 << ",\n"
-      << expr2 << " evaluates to " << val2 << ", and\n"
-      << abs_error_expr << " evaluates to " << abs_error << ".";
+         << "The difference between " << expr1 << " and " << expr2 << " is "
+         << diff << ", which exceeds " << abs_error_expr << ", where\n"
+         << expr1 << " evaluates to " << val1 << ",\n"
+         << expr2 << " evaluates to " << val2 << ", and\n"
+         << abs_error_expr << " evaluates to " << abs_error << ".";
 }
 
-
 // Helper template for implementing FloatLE() and DoubleLE().
 template <typename RawType>
-AssertionResult FloatingPointLE(const char* expr1,
-                                const char* expr2,
-                                RawType val1,
-                                RawType val2) {
+AssertionResult FloatingPointLE(const char* expr1, const char* expr2,
+                                RawType val1, RawType val2) {
   // Returns success if val1 is less than val2,
   if (val1 < val2) {
     return AssertionSuccess();
@@ -1416,138 +1660,76 @@ AssertionResult FloatingPointLE(const char* expr1,
           << val2;
 
   return AssertionFailure()
-      << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n"
-      << "  Actual: " << StringStreamToString(&val1_ss) << " vs "
-      << StringStreamToString(&val2_ss);
+         << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n"
+         << "  Actual: " << StringStreamToString(&val1_ss) << " vs "
+         << StringStreamToString(&val2_ss);
 }
 
 }  // namespace internal
 
 // Asserts that val1 is less than, or almost equal to, val2.  Fails
 // otherwise.  In particular, it fails if either val1 or val2 is NaN.
-AssertionResult FloatLE(const char* expr1, const char* expr2,
-                        float val1, float val2) {
+AssertionResult FloatLE(const char* expr1, const char* expr2, float val1,
+                        float val2) {
   return internal::FloatingPointLE<float>(expr1, expr2, val1, val2);
 }
 
 // Asserts that val1 is less than, or almost equal to, val2.  Fails
 // otherwise.  In particular, it fails if either val1 or val2 is NaN.
-AssertionResult DoubleLE(const char* expr1, const char* expr2,
-                         double val1, double val2) {
+AssertionResult DoubleLE(const char* expr1, const char* expr2, double val1,
+                         double val2) {
   return internal::FloatingPointLE<double>(expr1, expr2, val1, val2);
 }
 
 namespace internal {
 
-// The helper function for {ASSERT|EXPECT}_EQ with int or enum
-// arguments.
-AssertionResult CmpHelperEQ(const char* lhs_expression,
-                            const char* rhs_expression,
-                            BiggestInt lhs,
-                            BiggestInt rhs) {
-  if (lhs == rhs) {
-    return AssertionSuccess();
-  }
-
-  return EqFailure(lhs_expression,
-                   rhs_expression,
-                   FormatForComparisonFailureMessage(lhs, rhs),
-                   FormatForComparisonFailureMessage(rhs, lhs),
-                   false);
-}
-
-// A macro for implementing the helper functions needed to implement
-// ASSERT_?? and EXPECT_?? with integer or enum arguments.  It is here
-// just to avoid copy-and-paste of similar code.
-#define GTEST_IMPL_CMP_HELPER_(op_name, op)\
-AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \
-                                   BiggestInt val1, BiggestInt val2) {\
-  if (val1 op val2) {\
-    return AssertionSuccess();\
-  } else {\
-    return AssertionFailure() \
-        << "Expected: (" << expr1 << ") " #op " (" << expr2\
-        << "), actual: " << FormatForComparisonFailureMessage(val1, val2)\
-        << " vs " << FormatForComparisonFailureMessage(val2, val1);\
-  }\
-}
-
-// Implements the helper function for {ASSERT|EXPECT}_NE with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(NE, !=)
-// Implements the helper function for {ASSERT|EXPECT}_LE with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(LE, <=)
-// Implements the helper function for {ASSERT|EXPECT}_LT with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(LT, < )
-// Implements the helper function for {ASSERT|EXPECT}_GE with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(GE, >=)
-// Implements the helper function for {ASSERT|EXPECT}_GT with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(GT, > )
-
-#undef GTEST_IMPL_CMP_HELPER_
-
 // The helper function for {ASSERT|EXPECT}_STREQ.
 AssertionResult CmpHelperSTREQ(const char* lhs_expression,
-                               const char* rhs_expression,
-                               const char* lhs,
+                               const char* rhs_expression, const char* lhs,
                                const char* rhs) {
   if (String::CStringEquals(lhs, rhs)) {
     return AssertionSuccess();
   }
 
-  return EqFailure(lhs_expression,
-                   rhs_expression,
-                   PrintToString(lhs),
-                   PrintToString(rhs),
-                   false);
+  return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs),
+                   PrintToString(rhs), false);
 }
 
 // The helper function for {ASSERT|EXPECT}_STRCASEEQ.
 AssertionResult CmpHelperSTRCASEEQ(const char* lhs_expression,
-                                   const char* rhs_expression,
-                                   const char* lhs,
+                                   const char* rhs_expression, const char* lhs,
                                    const char* rhs) {
   if (String::CaseInsensitiveCStringEquals(lhs, rhs)) {
     return AssertionSuccess();
   }
 
-  return EqFailure(lhs_expression,
-                   rhs_expression,
-                   PrintToString(lhs),
-                   PrintToString(rhs),
-                   true);
+  return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs),
+                   PrintToString(rhs), true);
 }
 
 // The helper function for {ASSERT|EXPECT}_STRNE.
 AssertionResult CmpHelperSTRNE(const char* s1_expression,
-                               const char* s2_expression,
-                               const char* s1,
+                               const char* s2_expression, const char* s1,
                                const char* s2) {
   if (!String::CStringEquals(s1, s2)) {
     return AssertionSuccess();
   } else {
-    return AssertionFailure() << "Expected: (" << s1_expression << ") != ("
-                              << s2_expression << "), actual: \""
-                              << s1 << "\" vs \"" << s2 << "\"";
+    return AssertionFailure()
+           << "Expected: (" << s1_expression << ") != (" << s2_expression
+           << "), actual: \"" << s1 << "\" vs \"" << s2 << "\"";
   }
 }
 
 // The helper function for {ASSERT|EXPECT}_STRCASENE.
 AssertionResult CmpHelperSTRCASENE(const char* s1_expression,
-                                   const char* s2_expression,
-                                   const char* s1,
+                                   const char* s2_expression, const char* s1,
                                    const char* s2) {
   if (!String::CaseInsensitiveCStringEquals(s1, s2)) {
     return AssertionSuccess();
   } else {
     return AssertionFailure()
-        << "Expected: (" << s1_expression << ") != ("
-        << s2_expression << ") (ignoring case), actual: \""
-        << s1 << "\" vs \"" << s2 << "\"";
+           << "Expected: (" << s1_expression << ") != (" << s2_expression
+           << ") (ignoring case), actual: \"" << s1 << "\" vs \"" << s2 << "\"";
   }
 }
 
@@ -1575,8 +1757,7 @@ bool IsSubstringPred(const wchar_t* needle, const wchar_t* haystack) {
 
 // StringType here can be either ::std::string or ::std::wstring.
 template <typename StringType>
-bool IsSubstringPred(const StringType& needle,
-                     const StringType& haystack) {
+bool IsSubstringPred(const StringType& needle, const StringType& haystack) {
   return haystack.find(needle) != StringType::npos;
 }
 
@@ -1585,21 +1766,22 @@ bool IsSubstringPred(const StringType& needle,
 // StringType here can be const char*, const wchar_t*, ::std::string,
 // or ::std::wstring.
 template <typename StringType>
-AssertionResult IsSubstringImpl(
-    bool expected_to_be_substring,
-    const char* needle_expr, const char* haystack_expr,
-    const StringType& needle, const StringType& haystack) {
+AssertionResult IsSubstringImpl(bool expected_to_be_substring,
+                                const char* needle_expr,
+                                const char* haystack_expr,
+                                const StringType& needle,
+                                const StringType& haystack) {
   if (IsSubstringPred(needle, haystack) == expected_to_be_substring)
     return AssertionSuccess();
 
   const bool is_wide_string = sizeof(needle[0]) > 1;
   const char* const begin_string_quote = is_wide_string ? "L\"" : "\"";
   return AssertionFailure()
-      << "Value of: " << needle_expr << "\n"
-      << "  Actual: " << begin_string_quote << needle << "\"\n"
-      << "Expected: " << (expected_to_be_substring ? "" : "not ")
-      << "a substring of " << haystack_expr << "\n"
-      << "Which is: " << begin_string_quote << haystack << "\"";
+         << "Value of: " << needle_expr << "\n"
+         << "  Actual: " << begin_string_quote << needle << "\"\n"
+         << "Expected: " << (expected_to_be_substring ? "" : "not ")
+         << "a substring of " << haystack_expr << "\n"
+         << "Which is: " << begin_string_quote << haystack << "\"";
 }
 
 }  // namespace
@@ -1608,52 +1790,52 @@ AssertionResult IsSubstringImpl(
 // substring of haystack (NULL is considered a substring of itself
 // only), and return an appropriate error message when they fail.
 
-AssertionResult IsSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const char* needle, const char* haystack) {
+AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr,
+                            const char* needle, const char* haystack) {
   return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
 }
 
-AssertionResult IsSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const wchar_t* needle, const wchar_t* haystack) {
+AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr,
+                            const wchar_t* needle, const wchar_t* haystack) {
   return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
 }
 
-AssertionResult IsNotSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const char* needle, const char* haystack) {
+AssertionResult IsNotSubstring(const char* needle_expr,
+                               const char* haystack_expr, const char* needle,
+                               const char* haystack) {
   return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
 }
 
-AssertionResult IsNotSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const wchar_t* needle, const wchar_t* haystack) {
+AssertionResult IsNotSubstring(const char* needle_expr,
+                               const char* haystack_expr, const wchar_t* needle,
+                               const wchar_t* haystack) {
   return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
 }
 
-AssertionResult IsSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const ::std::string& needle, const ::std::string& haystack) {
+AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr,
+                            const ::std::string& needle,
+                            const ::std::string& haystack) {
   return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
 }
 
-AssertionResult IsNotSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const ::std::string& needle, const ::std::string& haystack) {
+AssertionResult IsNotSubstring(const char* needle_expr,
+                               const char* haystack_expr,
+                               const ::std::string& needle,
+                               const ::std::string& haystack) {
   return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
 }
 
 #if GTEST_HAS_STD_WSTRING
-AssertionResult IsSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const ::std::wstring& needle, const ::std::wstring& haystack) {
+AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr,
+                            const ::std::wstring& needle,
+                            const ::std::wstring& haystack) {
   return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
 }
 
-AssertionResult IsNotSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const ::std::wstring& needle, const ::std::wstring& haystack) {
+AssertionResult IsNotSubstring(const char* needle_expr,
+                               const char* haystack_expr,
+                               const ::std::wstring& needle,
+                               const ::std::wstring& haystack) {
   return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
 }
 #endif  // GTEST_HAS_STD_WSTRING
@@ -1665,43 +1847,42 @@ namespace internal {
 namespace {
 
 // Helper function for IsHRESULT{SuccessFailure} predicates
-AssertionResult HRESULTFailureHelper(const char* expr,
-                                     const char* expected,
+AssertionResult HRESULTFailureHelper(const char* expr, const char* expected,
                                      long hr) {  // NOLINT
-# if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_TV_TITLE
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_TV_TITLE
 
   // Windows CE doesn't support FormatMessage.
   const char error_text[] = "";
 
-# else
+#else
 
   // Looks up the human-readable system message for the HRESULT code
   // and since we're not passing any params to FormatMessage, we don't
   // want inserts expanded.
-  const DWORD kFlags = FORMAT_MESSAGE_FROM_SYSTEM |
-                       FORMAT_MESSAGE_IGNORE_INSERTS;
+  const DWORD kFlags =
+      FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS;
   const DWORD kBufSize = 4096;
   // Gets the system's human readable message string for this HRESULT.
-  char error_text[kBufSize] = { '\0' };
+  char error_text[kBufSize] = {'\0'};
   DWORD message_length = ::FormatMessageA(kFlags,
-                                          0,   // no source, we're asking system
+                                          0,  // no source, we're asking system
                                           static_cast<DWORD>(hr),  // the error
-                                          0,   // no line width restrictions
+                                          0,  // no line width restrictions
                                           error_text,  // output buffer
                                           kBufSize,    // buf size
                                           nullptr);  // no arguments for inserts
   // Trims tailing white space (FormatMessage leaves a trailing CR-LF)
   for (; message_length && IsSpace(error_text[message_length - 1]);
-          --message_length) {
+       --message_length) {
     error_text[message_length - 1] = '\0';
   }
 
-# endif  // GTEST_OS_WINDOWS_MOBILE
+#endif  // GTEST_OS_WINDOWS_MOBILE
 
   const std::string error_hex("0x" + String::FormatHexInt(hr));
   return ::testing::AssertionFailure()
-      << "Expected: " << expr << " " << expected << ".\n"
-      << "  Actual: " << error_hex << " " << error_text << "\n";
+         << "Expected: " << expr << " " << expected << ".\n"
+         << "  Actual: " << error_hex << " " << error_text << "\n";
 }
 
 }  // namespace
@@ -1735,33 +1916,35 @@ AssertionResult IsHRESULTFailure(const char* expr, long hr) {  // NOLINT
 //  17 - 21 bits       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 
 // The maximum code-point a one-byte UTF-8 sequence can represent.
-const UInt32 kMaxCodePoint1 = (static_cast<UInt32>(1) <<  7) - 1;
+constexpr uint32_t kMaxCodePoint1 = (static_cast<uint32_t>(1) << 7) - 1;
 
 // The maximum code-point a two-byte UTF-8 sequence can represent.
-const UInt32 kMaxCodePoint2 = (static_cast<UInt32>(1) << (5 + 6)) - 1;
+constexpr uint32_t kMaxCodePoint2 = (static_cast<uint32_t>(1) << (5 + 6)) - 1;
 
 // The maximum code-point a three-byte UTF-8 sequence can represent.
-const UInt32 kMaxCodePoint3 = (static_cast<UInt32>(1) << (4 + 2*6)) - 1;
+constexpr uint32_t kMaxCodePoint3 =
+    (static_cast<uint32_t>(1) << (4 + 2 * 6)) - 1;
 
 // The maximum code-point a four-byte UTF-8 sequence can represent.
-const UInt32 kMaxCodePoint4 = (static_cast<UInt32>(1) << (3 + 3*6)) - 1;
+constexpr uint32_t kMaxCodePoint4 =
+    (static_cast<uint32_t>(1) << (3 + 3 * 6)) - 1;
 
 // Chops off the n lowest bits from a bit pattern.  Returns the n
 // lowest bits.  As a side effect, the original bit pattern will be
 // shifted to the right by n bits.
-inline UInt32 ChopLowBits(UInt32* bits, int n) {
-  const UInt32 low_bits = *bits & ((static_cast<UInt32>(1) << n) - 1);
+inline uint32_t ChopLowBits(uint32_t* bits, int n) {
+  const uint32_t low_bits = *bits & ((static_cast<uint32_t>(1) << n) - 1);
   *bits >>= n;
   return low_bits;
 }
 
 // Converts a Unicode code point to a narrow string in UTF-8 encoding.
-// code_point parameter is of type UInt32 because wchar_t may not be
+// code_point parameter is of type uint32_t because wchar_t may not be
 // wide enough to contain a code point.
 // If the code_point is not a valid Unicode code point
 // (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted
 // to "(Invalid Unicode 0xXXXXXXXX)".
-std::string CodePointToUtf8(UInt32 code_point) {
+std::string CodePointToUtf8(uint32_t code_point) {
   if (code_point > kMaxCodePoint4) {
     return "(Invalid Unicode 0x" + String::FormatHexUInt32(code_point) + ")";
   }
@@ -1769,7 +1952,7 @@ std::string CodePointToUtf8(UInt32 code_point) {
   char str[5];  // Big enough for the largest valid code point.
   if (code_point <= kMaxCodePoint1) {
     str[1] = '\0';
-    str[0] = static_cast<char>(code_point);                          // 0xxxxxxx
+    str[0] = static_cast<char>(code_point);  // 0xxxxxxx
   } else if (code_point <= kMaxCodePoint2) {
     str[2] = '\0';
     str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
@@ -1797,16 +1980,16 @@ std::string CodePointToUtf8(UInt32 code_point) {
 // and thus should be combined into a single Unicode code point
 // using CreateCodePointFromUtf16SurrogatePair.
 inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) {
-  return sizeof(wchar_t) == 2 &&
-      (first & 0xFC00) == 0xD800 && (second & 0xFC00) == 0xDC00;
+  return sizeof(wchar_t) == 2 && (first & 0xFC00) == 0xD800 &&
+         (second & 0xFC00) == 0xDC00;
 }
 
 // Creates a Unicode code point from UTF16 surrogate pair.
-inline UInt32 CreateCodePointFromUtf16SurrogatePair(wchar_t first,
-                                                    wchar_t second) {
-  const auto first_u = static_cast<UInt32>(first);
-  const auto second_u = static_cast<UInt32>(second);
-  const UInt32 mask = (1 << 10) - 1;
+inline uint32_t CreateCodePointFromUtf16SurrogatePair(wchar_t first,
+                                                      wchar_t second) {
+  const auto first_u = static_cast<uint32_t>(first);
+  const auto second_u = static_cast<uint32_t>(second);
+  const uint32_t mask = (1 << 10) - 1;
   return (sizeof(wchar_t) == 2)
              ? (((first_u & mask) << 10) | (second_u & mask)) + 0x10000
              :
@@ -1829,21 +2012,20 @@ inline UInt32 CreateCodePointFromUtf16SurrogatePair(wchar_t first,
 // and contains invalid UTF-16 surrogate pairs, values in those pairs
 // will be encoded as individual Unicode characters from Basic Normal Plane.
 std::string WideStringToUtf8(const wchar_t* str, int num_chars) {
-  if (num_chars == -1)
-    num_chars = static_cast<int>(wcslen(str));
+  if (num_chars == -1) num_chars = static_cast<int>(wcslen(str));
 
   ::std::stringstream stream;
   for (int i = 0; i < num_chars; ++i) {
-    UInt32 unicode_code_point;
+    uint32_t unicode_code_point;
 
     if (str[i] == L'\0') {
       break;
     } else if (i + 1 < num_chars && IsUtf16SurrogatePair(str[i], str[i + 1])) {
-      unicode_code_point = CreateCodePointFromUtf16SurrogatePair(str[i],
-                                                                 str[i + 1]);
+      unicode_code_point =
+          CreateCodePointFromUtf16SurrogatePair(str[i], str[i + 1]);
       i++;
     } else {
-      unicode_code_point = static_cast<UInt32>(str[i]);
+      unicode_code_point = static_cast<uint32_t>(str[i]);
     }
 
     stream << CodePointToUtf8(unicode_code_point);
@@ -1853,7 +2035,7 @@ std::string WideStringToUtf8(const wchar_t* str, int num_chars) {
 
 // Converts a wide C string to an std::string using the UTF-8 encoding.
 // NULL will be converted to "(null)".
-std::string String::ShowWideCString(const wchar_t * wide_c_str) {
+std::string String::ShowWideCString(const wchar_t* wide_c_str) {
   if (wide_c_str == nullptr) return "(null)";
 
   return internal::WideStringToUtf8(wide_c_str, -1);
@@ -1865,7 +2047,7 @@ std::string String::ShowWideCString(const wchar_t * wide_c_str) {
 // Unlike wcscmp(), this function can handle NULL argument(s).  A NULL
 // C string is considered different to any non-NULL C string,
 // including the empty string.
-bool String::WideCStringEquals(const wchar_t * lhs, const wchar_t * rhs) {
+bool String::WideCStringEquals(const wchar_t* lhs, const wchar_t* rhs) {
   if (lhs == nullptr) return rhs == nullptr;
 
   if (rhs == nullptr) return false;
@@ -1875,33 +2057,27 @@ bool String::WideCStringEquals(const wchar_t * lhs, const wchar_t * rhs) {
 
 // Helper function for *_STREQ on wide strings.
 AssertionResult CmpHelperSTREQ(const char* lhs_expression,
-                               const char* rhs_expression,
-                               const wchar_t* lhs,
+                               const char* rhs_expression, const wchar_t* lhs,
                                const wchar_t* rhs) {
   if (String::WideCStringEquals(lhs, rhs)) {
     return AssertionSuccess();
   }
 
-  return EqFailure(lhs_expression,
-                   rhs_expression,
-                   PrintToString(lhs),
-                   PrintToString(rhs),
-                   false);
+  return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs),
+                   PrintToString(rhs), false);
 }
 
 // Helper function for *_STRNE on wide strings.
 AssertionResult CmpHelperSTRNE(const char* s1_expression,
-                               const char* s2_expression,
-                               const wchar_t* s1,
+                               const char* s2_expression, const wchar_t* s1,
                                const wchar_t* s2) {
   if (!String::WideCStringEquals(s1, s2)) {
     return AssertionSuccess();
   }
 
-  return AssertionFailure() << "Expected: (" << s1_expression << ") != ("
-                            << s2_expression << "), actual: "
-                            << PrintToString(s1)
-                            << " vs " << PrintToString(s2);
+  return AssertionFailure()
+         << "Expected: (" << s1_expression << ") != (" << s2_expression
+         << "), actual: " << PrintToString(s1) << " vs " << PrintToString(s2);
 }
 
 // Compares two C strings, ignoring case.  Returns true if and only if they have
@@ -1910,7 +2086,7 @@ AssertionResult CmpHelperSTRNE(const char* s1_expression,
 // Unlike strcasecmp(), this function can handle NULL argument(s).  A
 // NULL C string is considered different to any non-NULL C string,
 // including the empty string.
-bool String::CaseInsensitiveCStringEquals(const char * lhs, const char * rhs) {
+bool String::CaseInsensitiveCStringEquals(const char* lhs, const char* rhs) {
   if (lhs == nullptr) return rhs == nullptr;
   if (rhs == nullptr) return false;
   return posix::StrCaseCmp(lhs, rhs) == 0;
@@ -1952,8 +2128,8 @@ bool String::CaseInsensitiveWideCStringEquals(const wchar_t* lhs,
 
 // Returns true if and only if str ends with the given suffix, ignoring case.
 // Any string is considered to end with an empty suffix.
-bool String::EndsWithCaseInsensitive(
-    const std::string& str, const std::string& suffix) {
+bool String::EndsWithCaseInsensitive(const std::string& str,
+                                     const std::string& suffix) {
   const size_t str_len = str.length();
   const size_t suffix_len = suffix.length();
   return (str_len >= suffix_len) &&
@@ -1963,13 +2139,18 @@ bool String::EndsWithCaseInsensitive(
 
 // Formats an int value as "%02d".
 std::string String::FormatIntWidth2(int value) {
+  return FormatIntWidthN(value, 2);
+}
+
+// Formats an int value to given width with leading zeros.
+std::string String::FormatIntWidthN(int value, int width) {
   std::stringstream ss;
-  ss << std::setfill('0') << std::setw(2) << value;
+  ss << std::setfill('0') << std::setw(width) << value;
   return ss.str();
 }
 
 // Formats an int value as "%X".
-std::string String::FormatHexUInt32(UInt32 value) {
+std::string String::FormatHexUInt32(uint32_t value) {
   std::stringstream ss;
   ss << std::hex << std::uppercase << value;
   return ss.str();
@@ -1977,7 +2158,7 @@ std::string String::FormatHexUInt32(UInt32 value) {
 
 // Formats an int value as "%X".
 std::string String::FormatHexInt(int value) {
-  return FormatHexUInt32(static_cast<UInt32>(value));
+  return FormatHexUInt32(static_cast<uint32_t>(value));
 }
 
 // Formats a byte as "%02X".
@@ -2016,7 +2197,9 @@ std::string AppendUserMessage(const std::string& gtest_msg,
   if (user_msg_string.empty()) {
     return gtest_msg;
   }
-
+  if (gtest_msg.empty()) {
+    return user_msg_string;
+  }
   return gtest_msg + "\n" + user_msg_string;
 }
 
@@ -2029,15 +2212,13 @@ TestResult::TestResult()
     : death_test_count_(0), start_timestamp_(0), elapsed_time_(0) {}
 
 // D'tor.
-TestResult::~TestResult() {
-}
+TestResult::~TestResult() {}
 
 // Returns the i-th test part result among all the results. i can
 // range from 0 to total_part_count() - 1. If i is not in that range,
 // aborts the program.
 const TestPartResult& TestResult::GetTestPartResult(int i) const {
-  if (i < 0 || i >= total_part_count())
-    internal::posix::Abort();
+  if (i < 0 || i >= total_part_count()) internal::posix::Abort();
   return test_part_results_.at(static_cast<size_t>(i));
 }
 
@@ -2045,15 +2226,12 @@ const TestPartResult& TestResult::GetTestPartResult(int i) const {
 // test_property_count() - 1. If i is not in that range, aborts the
 // program.
 const TestProperty& TestResult::GetTestProperty(int i) const {
-  if (i < 0 || i >= test_property_count())
-    internal::posix::Abort();
+  if (i < 0 || i >= test_property_count()) internal::posix::Abort();
   return test_properties_.at(static_cast<size_t>(i));
 }
 
 // Clears the test part results.
-void TestResult::ClearTestPartResults() {
-  test_part_results_.clear();
-}
+void TestResult::ClearTestPartResults() { test_part_results_.clear(); }
 
 // Adds a test part result to the list.
 void TestResult::AddTestPartResult(const TestPartResult& test_part_result) {
@@ -2068,7 +2246,7 @@ void TestResult::RecordProperty(const std::string& xml_element,
   if (!ValidateTestProperty(xml_element, test_property)) {
     return;
   }
-  internal::MutexLock lock(&test_properites_mutex_);
+  internal::MutexLock lock(&test_properties_mutex_);
   const std::vector<TestProperty>::iterator property_with_matching_key =
       std::find_if(test_properties_.begin(), test_properties_.end(),
                    internal::TestPropertyKeyIs(test_property.key()));
@@ -2082,25 +2260,19 @@ void TestResult::RecordProperty(const std::string& xml_element,
 // The list of reserved attributes used in the <testsuites> element of XML
 // output.
 static const char* const kReservedTestSuitesAttributes[] = {
-  "disabled",
-  "errors",
-  "failures",
-  "name",
-  "random_seed",
-  "tests",
-  "time",
-  "timestamp"
-};
+    "disabled",    "errors", "failures", "name",
+    "random_seed", "tests",  "time",     "timestamp"};
 
 // The list of reserved attributes used in the <testsuite> element of XML
 // output.
 static const char* const kReservedTestSuiteAttributes[] = {
-    "disabled", "errors", "failures", "name", "tests", "time", "timestamp"};
+    "disabled", "errors", "failures",  "name",
+    "tests",    "time",   "timestamp", "skipped"};
 
 // The list of reserved attributes used in the <testcase> element of XML output.
 static const char* const kReservedTestCaseAttributes[] = {
-    "classname",   "name", "status", "time",  "type_param",
-    "value_param", "file", "line"};
+    "classname",  "name",        "status", "time",
+    "type_param", "value_param", "file",   "line"};
 
 // Use a slightly different set for allowed output to ensure existing tests can
 // still RecordProperty("result") or "RecordProperty(timestamp")
@@ -2108,7 +2280,7 @@ static const char* const kReservedOutputTestCaseAttributes[] = {
     "classname",   "name", "status", "time",   "type_param",
     "value_param", "file", "line",   "result", "timestamp"};
 
-template <int kSize>
+template <size_t kSize>
 std::vector<std::string> ArrayAsVector(const char* const (&array)[kSize]) {
   return std::vector<std::string>(array, array + kSize);
 }
@@ -2162,7 +2334,7 @@ static bool ValidateTestPropertyName(
     const std::string& property_name,
     const std::vector<std::string>& reserved_names) {
   if (std::find(reserved_names.begin(), reserved_names.end(), property_name) !=
-          reserved_names.end()) {
+      reserved_names.end()) {
     ADD_FAILURE() << "Reserved key used in RecordProperty(): " << property_name
                   << " (" << FormatWordList(reserved_names)
                   << " are reserved by " << GTEST_NAME_ << ")";
@@ -2200,8 +2372,7 @@ bool TestResult::Skipped() const {
 // Returns true if and only if the test failed.
 bool TestResult::Failed() const {
   for (int i = 0; i < total_part_count(); ++i) {
-    if (GetTestPartResult(i).failed())
-      return true;
+    if (GetTestPartResult(i).failed()) return true;
   }
   return false;
 }
@@ -2242,27 +2413,22 @@ int TestResult::test_property_count() const {
 // Creates a Test object.
 
 // The c'tor saves the states of all flags.
-Test::Test()
-    : gtest_flag_saver_(new GTEST_FLAG_SAVER_) {
-}
+Test::Test() : gtest_flag_saver_(new GTEST_FLAG_SAVER_) {}
 
 // The d'tor restores the states of all flags.  The actual work is
 // done by the d'tor of the gtest_flag_saver_ field, and thus not
 // visible here.
-Test::~Test() {
-}
+Test::~Test() {}
 
 // Sets up the test fixture.
 //
 // A sub-class may override this.
-void Test::SetUp() {
-}
+void Test::SetUp() {}
 
 // Tears down the test fixture.
 //
 // A sub-class may override this.
-void Test::TearDown() {
-}
+void Test::TearDown() {}
 
 // Allows user supplied key value pairs to be recorded for later output.
 void Test::RecordProperty(const std::string& key, const std::string& value) {
@@ -2367,8 +2533,8 @@ bool Test::HasSameFixtureClass() {
 static std::string* FormatSehExceptionMessage(DWORD exception_code,
                                               const char* location) {
   Message message;
-  message << "SEH exception with code 0x" << std::setbase(16) <<
-    exception_code << std::setbase(10) << " thrown in " << location << ".";
+  message << "SEH exception with code 0x" << std::setbase(16) << exception_code
+          << std::setbase(10) << " thrown in " << location << ".";
 
   return new std::string(message.GetString());
 }
@@ -2411,8 +2577,8 @@ GoogleTestFailureException::GoogleTestFailureException(
 // exceptions in the same function.  Therefore, we provide a separate
 // wrapper function for handling SEH exceptions.)
 template <class T, typename Result>
-Result HandleSehExceptionsInMethodIfSupported(
-    T* object, Result (T::*method)(), const char* location) {
+Result HandleSehExceptionsInMethodIfSupported(T* object, Result (T::*method)(),
+                                              const char* location) {
 #if GTEST_HAS_SEH
   __try {
     return (object->*method)();
@@ -2421,8 +2587,8 @@ Result HandleSehExceptionsInMethodIfSupported(
     // We create the exception message on the heap because VC++ prohibits
     // creation of objects with destructors on stack in functions using __try
     // (see error C2712).
-    std::string* exception_message = FormatSehExceptionMessage(
-        GetExceptionCode(), location);
+    std::string* exception_message =
+        FormatSehExceptionMessage(GetExceptionCode(), location);
     internal::ReportFailureInUnknownLocation(TestPartResult::kFatalFailure,
                                              *exception_message);
     delete exception_message;
@@ -2438,8 +2604,8 @@ Result HandleSehExceptionsInMethodIfSupported(
 // exceptions, if they are supported; returns the 0-value for type
 // Result in case of an SEH exception.
 template <class T, typename Result>
-Result HandleExceptionsInMethodIfSupported(
-    T* object, Result (T::*method)(), const char* location) {
+Result HandleExceptionsInMethodIfSupported(T* object, Result (T::*method)(),
+                                           const char* location) {
   // NOTE: The user code can affect the way in which Google Test handles
   // exceptions by setting GTEST_FLAG(catch_exceptions), but only before
   // RUN_ALL_TESTS() starts. It is technically possible to check the flag
@@ -2449,7 +2615,7 @@ Result HandleExceptionsInMethodIfSupported(
   // try {
   //   // Perform the test method.
   // } catch (...) {
-  //   if (GTEST_FLAG(catch_exceptions))
+  //   if (GTEST_FLAG_GET(catch_exceptions))
   //     // Report the exception as failure.
   //   else
   //     throw;  // Re-throws the original exception.
@@ -2505,16 +2671,16 @@ void Test::Run() {
   // GTEST_SKIP().
   if (!HasFatalFailure() && !IsSkipped()) {
     impl->os_stack_trace_getter()->UponLeavingGTest();
-    internal::HandleExceptionsInMethodIfSupported(
-        this, &Test::TestBody, "the test body");
+    internal::HandleExceptionsInMethodIfSupported(this, &Test::TestBody,
+                                                  "the test body");
   }
 
   // However, we want to clean up as much as possible.  Hence we will
   // always call TearDown(), even if SetUp() or the test body has
   // failed.
   impl->os_stack_trace_getter()->UponLeavingGTest();
-  internal::HandleExceptionsInMethodIfSupported(
-      this, &Test::TearDown, "TearDown()");
+  internal::HandleExceptionsInMethodIfSupported(this, &Test::TearDown,
+                                                "TearDown()");
 }
 
 // Returns true if and only if the current test has a fatal failure.
@@ -2524,8 +2690,9 @@ bool Test::HasFatalFailure() {
 
 // Returns true if and only if the current test has a non-fatal failure.
 bool Test::HasNonfatalFailure() {
-  return internal::GetUnitTestImpl()->current_test_result()->
-      HasNonfatalFailure();
+  return internal::GetUnitTestImpl()
+      ->current_test_result()
+      ->HasNonfatalFailure();
 }
 
 // Returns true if and only if the current test was skipped.
@@ -2552,6 +2719,7 @@ TestInfo::TestInfo(const std::string& a_test_suite_name,
       should_run_(false),
       is_disabled_(false),
       matches_filter_(false),
+      is_in_another_shard_(false),
       factory_(factory),
       result_() {}
 
@@ -2565,7 +2733,7 @@ namespace internal {
 //
 // Arguments:
 //
-//   test_suite_name:   name of the test suite
+//   test_suite_name:  name of the test suite
 //   name:             name of the test
 //   type_param:       the name of the test's type parameter, or NULL if
 //                     this is not a typed or a type-parameterized test.
@@ -2624,11 +2792,10 @@ class TestNameIs {
   // Constructor.
   //
   // TestNameIs has NO default constructor.
-  explicit TestNameIs(const char* name)
-      : name_(name) {}
+  explicit TestNameIs(const char* name) : name_(name) {}
 
   // Returns true if and only if the test name of test_info matches name_.
-  bool operator()(const TestInfo * test_info) const {
+  bool operator()(const TestInfo* test_info) const {
     return test_info && test_info->name() == name_;
   }
 
@@ -2646,6 +2813,7 @@ namespace internal {
 void UnitTestImpl::RegisterParameterizedTests() {
   if (!parameterized_tests_registered_) {
     parameterized_test_registry_.RegisterTests();
+    type_parameterized_test_registry_.CheckForInstantiations();
     parameterized_tests_registered_ = true;
   }
 }
@@ -2655,19 +2823,20 @@ void UnitTestImpl::RegisterParameterizedTests() {
 // Creates the test object, runs it, records its result, and then
 // deletes it.
 void TestInfo::Run() {
-  if (!should_run_) return;
+  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
+  if (!should_run_) {
+    if (is_disabled_ && matches_filter_) repeater->OnTestDisabled(*this);
+    return;
+  }
 
   // Tells UnitTest where to store test result.
   internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
   impl->set_current_test_info(this);
 
-  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
-
   // Notifies the unit test event listeners that a test is about to start.
   repeater->OnTestStart(*this);
-
-  const TimeInMillis start = internal::GetTimeInMillis();
-
+  result_.set_start_timestamp(internal::GetTimeInMillis());
+  internal::Timer timer;
   impl->os_stack_trace_getter()->UponLeavingGTest();
 
   // Creates the test object.
@@ -2691,8 +2860,7 @@ void TestInfo::Run() {
         test, &Test::DeleteSelf_, "the test fixture's destructor");
   }
 
-  result_.set_start_timestamp(start);
-  result_.set_elapsed_time(internal::GetTimeInMillis() - start);
+  result_.set_elapsed_time(timer.Elapsed());
 
   // Notifies the unit test event listener that a test has just finished.
   repeater->OnTestEnd(*this);
@@ -2702,6 +2870,28 @@ void TestInfo::Run() {
   impl->set_current_test_info(nullptr);
 }
 
+// Skip and records a skipped test result for this object.
+void TestInfo::Skip() {
+  if (!should_run_) return;
+
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  impl->set_current_test_info(this);
+
+  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
+
+  // Notifies the unit test event listeners that a test is about to start.
+  repeater->OnTestStart(*this);
+
+  const TestPartResult test_part_result =
+      TestPartResult(TestPartResult::kSkip, this->file(), this->line(), "");
+  impl->GetTestPartResultReporterForCurrentThread()->ReportTestPartResult(
+      test_part_result);
+
+  // Notifies the unit test event listener that a test has just finished.
+  repeater->OnTestEnd(*this);
+  impl->set_current_test_info(nullptr);
+}
+
 // class TestSuite
 
 // Gets the number of successful tests in this test suite.
@@ -2748,7 +2938,7 @@ int TestSuite::total_test_count() const {
 //
 // Arguments:
 //
-//   name:         name of the test suite
+//   a_name:       name of the test suite
 //   a_type_param: the name of the test suite's type parameter, or NULL if
 //                 this is not a typed or a type-parameterized test suite.
 //   set_up_tc:    pointer to the function that sets up the test suite
@@ -2803,19 +2993,33 @@ void TestSuite::Run() {
   // Call both legacy and the new API
   repeater->OnTestSuiteStart(*this);
 //  Legacy API is deprecated but still available
-#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
   repeater->OnTestCaseStart(*this);
-#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
   impl->os_stack_trace_getter()->UponLeavingGTest();
   internal::HandleExceptionsInMethodIfSupported(
       this, &TestSuite::RunSetUpTestSuite, "SetUpTestSuite()");
 
+  const bool skip_all = ad_hoc_test_result().Failed();
+
   start_timestamp_ = internal::GetTimeInMillis();
+  internal::Timer timer;
   for (int i = 0; i < total_test_count(); i++) {
-    GetMutableTestInfo(i)->Run();
+    if (skip_all) {
+      GetMutableTestInfo(i)->Skip();
+    } else {
+      GetMutableTestInfo(i)->Run();
+    }
+    if (GTEST_FLAG_GET(fail_fast) &&
+        GetMutableTestInfo(i)->result()->Failed()) {
+      for (int j = i + 1; j < total_test_count(); j++) {
+        GetMutableTestInfo(j)->Skip();
+      }
+      break;
+    }
   }
-  elapsed_time_ = internal::GetTimeInMillis() - start_timestamp_;
+  elapsed_time_ = timer.Elapsed();
 
   impl->os_stack_trace_getter()->UponLeavingGTest();
   internal::HandleExceptionsInMethodIfSupported(
@@ -2824,9 +3028,39 @@ void TestSuite::Run() {
   // Call both legacy and the new API
   repeater->OnTestSuiteEnd(*this);
 //  Legacy API is deprecated but still available
-#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
   repeater->OnTestCaseEnd(*this);
-#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  impl->set_current_test_suite(nullptr);
+}
+
+// Skips all tests under this TestSuite.
+void TestSuite::Skip() {
+  if (!should_run_) return;
+
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  impl->set_current_test_suite(this);
+
+  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
+
+  // Call both legacy and the new API
+  repeater->OnTestSuiteStart(*this);
+//  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  repeater->OnTestCaseStart(*this);
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  for (int i = 0; i < total_test_count(); i++) {
+    GetMutableTestInfo(i)->Skip();
+  }
+
+  // Call both legacy and the new API
+  repeater->OnTestSuiteEnd(*this);
+  // Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  repeater->OnTestCaseEnd(*this);
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
   impl->set_current_test_suite(nullptr);
 }
@@ -2854,11 +3088,10 @@ void TestSuite::UnshuffleTests() {
 //
 // FormatCountableNoun(1, "formula", "formuli") returns "1 formula".
 // FormatCountableNoun(5, "book", "books") returns "5 books".
-static std::string FormatCountableNoun(int count,
-                                       const char * singular_form,
-                                       const char * plural_form) {
+static std::string FormatCountableNoun(int count, const char* singular_form,
+                                       const char* plural_form) {
   return internal::StreamableToString(count) + " " +
-      (count == 1 ? singular_form : plural_form);
+         (count == 1 ? singular_form : plural_form);
 }
 
 // Formats the count of tests.
@@ -2875,10 +3108,10 @@ static std::string FormatTestSuiteCount(int test_suite_count) {
 // representation.  Both kNonFatalFailure and kFatalFailure are translated
 // to "Failure", as the user usually doesn't care about the difference
 // between the two when viewing the test result.
-static const char * TestPartResultTypeToString(TestPartResult::Type type) {
+static const char* TestPartResultTypeToString(TestPartResult::Type type) {
   switch (type) {
     case TestPartResult::kSkip:
-      return "Skipped";
+      return "Skipped\n";
     case TestPartResult::kSuccess:
       return "Success";
 
@@ -2895,21 +3128,25 @@ static const char * TestPartResultTypeToString(TestPartResult::Type type) {
 }
 
 namespace internal {
+namespace {
+enum class GTestColor { kDefault, kRed, kGreen, kYellow };
+}  // namespace
 
 // Prints a TestPartResult to an std::string.
 static std::string PrintTestPartResultToString(
     const TestPartResult& test_part_result) {
-  return (Message()
-          << internal::FormatFileLocation(test_part_result.file_name(),
-                                          test_part_result.line_number())
-          << " " << TestPartResultTypeToString(test_part_result.type())
-          << test_part_result.message()).GetString();
+  return (Message() << internal::FormatFileLocation(
+                           test_part_result.file_name(),
+                           test_part_result.line_number())
+                    << " "
+                    << TestPartResultTypeToString(test_part_result.type())
+                    << test_part_result.message())
+      .GetString();
 }
 
 // Prints a TestPartResult.
 static void PrintTestPartResult(const TestPartResult& test_part_result) {
-  const std::string& result =
-      PrintTestPartResultToString(test_part_result);
+  const std::string& result = PrintTestPartResultToString(test_part_result);
   printf("%s\n", result.c_str());
   fflush(stdout);
   // If the test program runs in Visual Studio or a debugger, the
@@ -2926,16 +3163,20 @@ static void PrintTestPartResult(const TestPartResult& test_part_result) {
 }
 
 // class PrettyUnitTestResultPrinter
-#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && \
-    !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \
+    !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW
 
 // Returns the character attribute for the given color.
 static WORD GetColorAttribute(GTestColor color) {
   switch (color) {
-    case COLOR_RED:    return FOREGROUND_RED;
-    case COLOR_GREEN:  return FOREGROUND_GREEN;
-    case COLOR_YELLOW: return FOREGROUND_RED | FOREGROUND_GREEN;
-    default:           return 0;
+    case GTestColor::kRed:
+      return FOREGROUND_RED;
+    case GTestColor::kGreen:
+      return FOREGROUND_GREEN;
+    case GTestColor::kYellow:
+      return FOREGROUND_RED | FOREGROUND_GREEN;
+    default:
+      return 0;
   }
 }
 
@@ -2972,13 +3213,16 @@ static WORD GetNewColor(GTestColor color, WORD old_color_attrs) {
 
 #else
 
-// Returns the ANSI color code for the given color.  COLOR_DEFAULT is
+// Returns the ANSI color code for the given color. GTestColor::kDefault is
 // an invalid input.
 static const char* GetAnsiColorCode(GTestColor color) {
   switch (color) {
-    case COLOR_RED:     return "1";
-    case COLOR_GREEN:   return "2";
-    case COLOR_YELLOW:  return "3";
+    case GTestColor::kRed:
+      return "1";
+    case GTestColor::kGreen:
+      return "2";
+    case GTestColor::kYellow:
+      return "3";
     default:
       return nullptr;
   }
@@ -2988,7 +3232,8 @@ static const char* GetAnsiColorCode(GTestColor color) {
 
 // Returns true if and only if Google Test should use colors in the output.
 bool ShouldUseColor(bool stdout_is_tty) {
-  const char* const gtest_color = GTEST_FLAG(color).c_str();
+  std::string c = GTEST_FLAG_GET(color);
+  const char* const gtest_color = c.c_str();
 
   if (String::CaseInsensitiveCStringEquals(gtest_color, "auto")) {
 #if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW
@@ -3015,9 +3260,9 @@ bool ShouldUseColor(bool stdout_is_tty) {
   }
 
   return String::CaseInsensitiveCStringEquals(gtest_color, "yes") ||
-      String::CaseInsensitiveCStringEquals(gtest_color, "true") ||
-      String::CaseInsensitiveCStringEquals(gtest_color, "t") ||
-      String::CStringEquals(gtest_color, "1");
+         String::CaseInsensitiveCStringEquals(gtest_color, "true") ||
+         String::CaseInsensitiveCStringEquals(gtest_color, "t") ||
+         String::CStringEquals(gtest_color, "1");
   // We take "yes", "true", "t", and "1" as meaning "yes".  If the
   // value is neither one of these nor "auto", we treat it as "no" to
   // be conservative.
@@ -3027,18 +3272,15 @@ bool ShouldUseColor(bool stdout_is_tty) {
 // cannot simply emit special characters and have the terminal change colors.
 // This routine must actually emit the characters rather than return a string
 // that would be colored when printed, as can be done on Linux.
-void ColoredPrintf(GTestColor color, const char* fmt, ...) {
+
+GTEST_ATTRIBUTE_PRINTF_(2, 3)
+static void ColoredPrintf(GTestColor color, const char* fmt, ...) {
   va_list args;
   va_start(args, fmt);
 
-#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS || GTEST_OS_IOS || \
-    GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT || defined(ESP_PLATFORM)
-  const bool use_color = AlwaysFalse();
-#else
   static const bool in_color_mode =
       ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0);
-  const bool use_color = in_color_mode && (color != COLOR_DEFAULT);
-#endif  // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS
+  const bool use_color = in_color_mode && (color != GTestColor::kDefault);
 
   if (!use_color) {
     vprintf(fmt, args);
@@ -3046,8 +3288,8 @@ void ColoredPrintf(GTestColor color, const char* fmt, ...) {
     return;
   }
 
-#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && \
-    !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \
+    !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW
   const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE);
 
   // Gets the current text color.
@@ -3118,6 +3360,7 @@ class PrettyUnitTestResultPrinter : public TestEventListener {
 #endif  // OnTestCaseStart
 
   void OnTestStart(const TestInfo& test_info) override;
+  void OnTestDisabled(const TestInfo& test_info) override;
 
   void OnTestPartResult(const TestPartResult& result) override;
   void OnTestEnd(const TestInfo& test_info) override;
@@ -3134,39 +3377,40 @@ class PrettyUnitTestResultPrinter : public TestEventListener {
 
  private:
   static void PrintFailedTests(const UnitTest& unit_test);
+  static void PrintFailedTestSuites(const UnitTest& unit_test);
   static void PrintSkippedTests(const UnitTest& unit_test);
 };
 
-  // Fired before each iteration of tests starts.
+// Fired before each iteration of tests starts.
 void PrettyUnitTestResultPrinter::OnTestIterationStart(
     const UnitTest& unit_test, int iteration) {
-  if (GTEST_FLAG(repeat) != 1)
+  if (GTEST_FLAG_GET(repeat) != 1)
     printf("\nRepeating all tests (iteration %d) . . .\n\n", iteration + 1);
 
-  const char* const filter = GTEST_FLAG(filter).c_str();
+  std::string f = GTEST_FLAG_GET(filter);
+  const char* const filter = f.c_str();
 
   // Prints the filter if it's not *.  This reminds the user that some
   // tests may be skipped.
   if (!String::CStringEquals(filter, kUniversalFilter)) {
-    ColoredPrintf(COLOR_YELLOW,
-                  "Note: %s filter = %s\n", GTEST_NAME_, filter);
+    ColoredPrintf(GTestColor::kYellow, "Note: %s filter = %s\n", GTEST_NAME_,
+                  filter);
   }
 
   if (internal::ShouldShard(kTestTotalShards, kTestShardIndex, false)) {
-    const Int32 shard_index = Int32FromEnvOrDie(kTestShardIndex, -1);
-    ColoredPrintf(COLOR_YELLOW,
-                  "Note: This is test shard %d of %s.\n",
+    const int32_t shard_index = Int32FromEnvOrDie(kTestShardIndex, -1);
+    ColoredPrintf(GTestColor::kYellow, "Note: This is test shard %d of %s.\n",
                   static_cast<int>(shard_index) + 1,
                   internal::posix::GetEnv(kTestTotalShards));
   }
 
-  if (GTEST_FLAG(shuffle)) {
-    ColoredPrintf(COLOR_YELLOW,
+  if (GTEST_FLAG_GET(shuffle)) {
+    ColoredPrintf(GTestColor::kYellow,
                   "Note: Randomizing tests' orders with a seed of %d .\n",
                   unit_test.random_seed());
   }
 
-  ColoredPrintf(COLOR_GREEN,  "[==========] ");
+  ColoredPrintf(GTestColor::kGreen, "[==========] ");
   printf("Running %s from %s.\n",
          FormatTestCount(unit_test.test_to_run_count()).c_str(),
          FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str());
@@ -3175,7 +3419,7 @@ void PrettyUnitTestResultPrinter::OnTestIterationStart(
 
 void PrettyUnitTestResultPrinter::OnEnvironmentsSetUpStart(
     const UnitTest& /*unit_test*/) {
-  ColoredPrintf(COLOR_GREEN,  "[----------] ");
+  ColoredPrintf(GTestColor::kGreen, "[----------] ");
   printf("Global test environment set-up.\n");
   fflush(stdout);
 }
@@ -3184,7 +3428,7 @@ void PrettyUnitTestResultPrinter::OnEnvironmentsSetUpStart(
 void PrettyUnitTestResultPrinter::OnTestCaseStart(const TestCase& test_case) {
   const std::string counts =
       FormatCountableNoun(test_case.test_to_run_count(), "test", "tests");
-  ColoredPrintf(COLOR_GREEN, "[----------] ");
+  ColoredPrintf(GTestColor::kGreen, "[----------] ");
   printf("%s from %s", counts.c_str(), test_case.name());
   if (test_case.type_param() == nullptr) {
     printf("\n");
@@ -3198,7 +3442,7 @@ void PrettyUnitTestResultPrinter::OnTestSuiteStart(
     const TestSuite& test_suite) {
   const std::string counts =
       FormatCountableNoun(test_suite.test_to_run_count(), "test", "tests");
-  ColoredPrintf(COLOR_GREEN, "[----------] ");
+  ColoredPrintf(GTestColor::kGreen, "[----------] ");
   printf("%s from %s", counts.c_str(), test_suite.name());
   if (test_suite.type_param() == nullptr) {
     printf("\n");
@@ -3210,7 +3454,14 @@ void PrettyUnitTestResultPrinter::OnTestSuiteStart(
 #endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
 void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) {
-  ColoredPrintf(COLOR_GREEN,  "[ RUN      ] ");
+  ColoredPrintf(GTestColor::kGreen, "[ RUN      ] ");
+  PrintTestName(test_info.test_suite_name(), test_info.name());
+  printf("\n");
+  fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnTestDisabled(const TestInfo& test_info) {
+  ColoredPrintf(GTestColor::kYellow, "[ DISABLED ] ");
   PrintTestName(test_info.test_suite_name(), test_info.name());
   printf("\n");
   fflush(stdout);
@@ -3220,9 +3471,7 @@ void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) {
 void PrettyUnitTestResultPrinter::OnTestPartResult(
     const TestPartResult& result) {
   switch (result.type()) {
-    // If the test part succeeded, or was skipped,
-    // we don't need to do anything.
-    case TestPartResult::kSkip:
+    // If the test part succeeded, we don't need to do anything.
     case TestPartResult::kSuccess:
       return;
     default:
@@ -3235,19 +3484,19 @@ void PrettyUnitTestResultPrinter::OnTestPartResult(
 
 void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) {
   if (test_info.result()->Passed()) {
-    ColoredPrintf(COLOR_GREEN, "[       OK ] ");
+    ColoredPrintf(GTestColor::kGreen, "[       OK ] ");
   } else if (test_info.result()->Skipped()) {
-    ColoredPrintf(COLOR_GREEN, "[  SKIPPED ] ");
+    ColoredPrintf(GTestColor::kGreen, "[  SKIPPED ] ");
   } else {
-    ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
+    ColoredPrintf(GTestColor::kRed, "[  FAILED  ] ");
   }
   PrintTestName(test_info.test_suite_name(), test_info.name());
-  if (test_info.result()->Failed())
-    PrintFullTestCommentIfPresent(test_info);
+  if (test_info.result()->Failed()) PrintFullTestCommentIfPresent(test_info);
 
-  if (GTEST_FLAG(print_time)) {
-    printf(" (%s ms)\n", internal::StreamableToString(
-           test_info.result()->elapsed_time()).c_str());
+  if (GTEST_FLAG_GET(print_time)) {
+    printf(" (%s ms)\n",
+           internal::StreamableToString(test_info.result()->elapsed_time())
+               .c_str());
   } else {
     printf("\n");
   }
@@ -3256,22 +3505,22 @@ void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) {
 
 #ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) {
-  if (!GTEST_FLAG(print_time)) return;
+  if (!GTEST_FLAG_GET(print_time)) return;
 
   const std::string counts =
       FormatCountableNoun(test_case.test_to_run_count(), "test", "tests");
-  ColoredPrintf(COLOR_GREEN, "[----------] ");
+  ColoredPrintf(GTestColor::kGreen, "[----------] ");
   printf("%s from %s (%s ms total)\n\n", counts.c_str(), test_case.name(),
          internal::StreamableToString(test_case.elapsed_time()).c_str());
   fflush(stdout);
 }
 #else
 void PrettyUnitTestResultPrinter::OnTestSuiteEnd(const TestSuite& test_suite) {
-  if (!GTEST_FLAG(print_time)) return;
+  if (!GTEST_FLAG_GET(print_time)) return;
 
   const std::string counts =
       FormatCountableNoun(test_suite.test_to_run_count(), "test", "tests");
-  ColoredPrintf(COLOR_GREEN, "[----------] ");
+  ColoredPrintf(GTestColor::kGreen, "[----------] ");
   printf("%s from %s (%s ms total)\n\n", counts.c_str(), test_suite.name(),
          internal::StreamableToString(test_suite.elapsed_time()).c_str());
   fflush(stdout);
@@ -3280,7 +3529,7 @@ void PrettyUnitTestResultPrinter::OnTestSuiteEnd(const TestSuite& test_suite) {
 
 void PrettyUnitTestResultPrinter::OnEnvironmentsTearDownStart(
     const UnitTest& /*unit_test*/) {
-  ColoredPrintf(COLOR_GREEN,  "[----------] ");
+  ColoredPrintf(GTestColor::kGreen, "[----------] ");
   printf("Global test environment tear-down\n");
   fflush(stdout);
 }
@@ -3288,9 +3537,8 @@ void PrettyUnitTestResultPrinter::OnEnvironmentsTearDownStart(
 // Internal helper for printing the list of failed tests.
 void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest& unit_test) {
   const int failed_test_count = unit_test.failed_test_count();
-  if (failed_test_count == 0) {
-    return;
-  }
+  ColoredPrintf(GTestColor::kRed, "[  FAILED  ] ");
+  printf("%s, listed below:\n", FormatTestCount(failed_test_count).c_str());
 
   for (int i = 0; i < unit_test.total_test_suite_count(); ++i) {
     const TestSuite& test_suite = *unit_test.GetTestSuite(i);
@@ -3302,12 +3550,36 @@ void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest& unit_test) {
       if (!test_info.should_run() || !test_info.result()->Failed()) {
         continue;
       }
-      ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
+      ColoredPrintf(GTestColor::kRed, "[  FAILED  ] ");
       printf("%s.%s", test_suite.name(), test_info.name());
       PrintFullTestCommentIfPresent(test_info);
       printf("\n");
     }
   }
+  printf("\n%2d FAILED %s\n", failed_test_count,
+         failed_test_count == 1 ? "TEST" : "TESTS");
+}
+
+// Internal helper for printing the list of test suite failures not covered by
+// PrintFailedTests.
+void PrettyUnitTestResultPrinter::PrintFailedTestSuites(
+    const UnitTest& unit_test) {
+  int suite_failure_count = 0;
+  for (int i = 0; i < unit_test.total_test_suite_count(); ++i) {
+    const TestSuite& test_suite = *unit_test.GetTestSuite(i);
+    if (!test_suite.should_run()) {
+      continue;
+    }
+    if (test_suite.ad_hoc_test_result().Failed()) {
+      ColoredPrintf(GTestColor::kRed, "[  FAILED  ] ");
+      printf("%s: SetUpTestSuite or TearDownTestSuite\n", test_suite.name());
+      ++suite_failure_count;
+    }
+  }
+  if (suite_failure_count > 0) {
+    printf("\n%2d FAILED TEST %s\n", suite_failure_count,
+           suite_failure_count == 1 ? "SUITE" : "SUITES");
+  }
 }
 
 // Internal helper for printing the list of skipped tests.
@@ -3327,7 +3599,7 @@ void PrettyUnitTestResultPrinter::PrintSkippedTests(const UnitTest& unit_test) {
       if (!test_info.should_run() || !test_info.result()->Skipped()) {
         continue;
       }
-      ColoredPrintf(COLOR_GREEN, "[  SKIPPED ] ");
+      ColoredPrintf(GTestColor::kGreen, "[  SKIPPED ] ");
       printf("%s.%s", test_suite.name(), test_info.name());
       printf("\n");
     }
@@ -3336,44 +3608,37 @@ void PrettyUnitTestResultPrinter::PrintSkippedTests(const UnitTest& unit_test) {
 
 void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
                                                      int /*iteration*/) {
-  ColoredPrintf(COLOR_GREEN,  "[==========] ");
+  ColoredPrintf(GTestColor::kGreen, "[==========] ");
   printf("%s from %s ran.",
          FormatTestCount(unit_test.test_to_run_count()).c_str(),
          FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str());
-  if (GTEST_FLAG(print_time)) {
+  if (GTEST_FLAG_GET(print_time)) {
     printf(" (%s ms total)",
            internal::StreamableToString(unit_test.elapsed_time()).c_str());
   }
   printf("\n");
-  ColoredPrintf(COLOR_GREEN,  "[  PASSED  ] ");
+  ColoredPrintf(GTestColor::kGreen, "[  PASSED  ] ");
   printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str());
 
   const int skipped_test_count = unit_test.skipped_test_count();
   if (skipped_test_count > 0) {
-    ColoredPrintf(COLOR_GREEN, "[  SKIPPED ] ");
+    ColoredPrintf(GTestColor::kGreen, "[  SKIPPED ] ");
     printf("%s, listed below:\n", FormatTestCount(skipped_test_count).c_str());
     PrintSkippedTests(unit_test);
   }
 
-  int num_failures = unit_test.failed_test_count();
   if (!unit_test.Passed()) {
-    const int failed_test_count = unit_test.failed_test_count();
-    ColoredPrintf(COLOR_RED,  "[  FAILED  ] ");
-    printf("%s, listed below:\n", FormatTestCount(failed_test_count).c_str());
     PrintFailedTests(unit_test);
-    printf("\n%2d FAILED %s\n", num_failures,
-                        num_failures == 1 ? "TEST" : "TESTS");
+    PrintFailedTestSuites(unit_test);
   }
 
   int num_disabled = unit_test.reportable_disabled_test_count();
-  if (num_disabled && !GTEST_FLAG(also_run_disabled_tests)) {
-    if (!num_failures) {
+  if (num_disabled && !GTEST_FLAG_GET(also_run_disabled_tests)) {
+    if (unit_test.Passed()) {
       printf("\n");  // Add a spacer if no FAILURE banner is displayed.
     }
-    ColoredPrintf(COLOR_YELLOW,
-                  "  YOU HAVE %d DISABLED %s\n\n",
-                  num_disabled,
-                  num_disabled == 1 ? "TEST" : "TESTS");
+    ColoredPrintf(GTestColor::kYellow, "  YOU HAVE %d DISABLED %s\n\n",
+                  num_disabled, num_disabled == 1 ? "TEST" : "TESTS");
   }
   // Ensure that Google Test output is printed before, e.g., heapchecker output.
   fflush(stdout);
@@ -3381,6 +3646,111 @@ void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
 
 // End PrettyUnitTestResultPrinter
 
+// This class implements the TestEventListener interface.
+//
+// Class BriefUnitTestResultPrinter is copyable.
+class BriefUnitTestResultPrinter : public TestEventListener {
+ public:
+  BriefUnitTestResultPrinter() {}
+  static void PrintTestName(const char* test_suite, const char* test) {
+    printf("%s.%s", test_suite, test);
+  }
+
+  // The following methods override what's in the TestEventListener class.
+  void OnTestProgramStart(const UnitTest& /*unit_test*/) override {}
+  void OnTestIterationStart(const UnitTest& /*unit_test*/,
+                            int /*iteration*/) override {}
+  void OnEnvironmentsSetUpStart(const UnitTest& /*unit_test*/) override {}
+  void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) override {}
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestCaseStart(const TestCase& /*test_case*/) override {}
+#else
+  void OnTestSuiteStart(const TestSuite& /*test_suite*/) override {}
+#endif  // OnTestCaseStart
+
+  void OnTestStart(const TestInfo& /*test_info*/) override {}
+  void OnTestDisabled(const TestInfo& /*test_info*/) override {}
+
+  void OnTestPartResult(const TestPartResult& result) override;
+  void OnTestEnd(const TestInfo& test_info) override;
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestCaseEnd(const TestCase& /*test_case*/) override {}
+#else
+  void OnTestSuiteEnd(const TestSuite& /*test_suite*/) override {}
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  void OnEnvironmentsTearDownStart(const UnitTest& /*unit_test*/) override {}
+  void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) override {}
+  void OnTestIterationEnd(const UnitTest& unit_test, int iteration) override;
+  void OnTestProgramEnd(const UnitTest& /*unit_test*/) override {}
+};
+
+// Called after an assertion failure.
+void BriefUnitTestResultPrinter::OnTestPartResult(
+    const TestPartResult& result) {
+  switch (result.type()) {
+    // If the test part succeeded, we don't need to do anything.
+    case TestPartResult::kSuccess:
+      return;
+    default:
+      // Print failure message from the assertion
+      // (e.g. expected this and got that).
+      PrintTestPartResult(result);
+      fflush(stdout);
+  }
+}
+
+void BriefUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) {
+  if (test_info.result()->Failed()) {
+    ColoredPrintf(GTestColor::kRed, "[  FAILED  ] ");
+    PrintTestName(test_info.test_suite_name(), test_info.name());
+    PrintFullTestCommentIfPresent(test_info);
+
+    if (GTEST_FLAG_GET(print_time)) {
+      printf(" (%s ms)\n",
+             internal::StreamableToString(test_info.result()->elapsed_time())
+                 .c_str());
+    } else {
+      printf("\n");
+    }
+    fflush(stdout);
+  }
+}
+
+void BriefUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
+                                                    int /*iteration*/) {
+  ColoredPrintf(GTestColor::kGreen, "[==========] ");
+  printf("%s from %s ran.",
+         FormatTestCount(unit_test.test_to_run_count()).c_str(),
+         FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str());
+  if (GTEST_FLAG_GET(print_time)) {
+    printf(" (%s ms total)",
+           internal::StreamableToString(unit_test.elapsed_time()).c_str());
+  }
+  printf("\n");
+  ColoredPrintf(GTestColor::kGreen, "[  PASSED  ] ");
+  printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str());
+
+  const int skipped_test_count = unit_test.skipped_test_count();
+  if (skipped_test_count > 0) {
+    ColoredPrintf(GTestColor::kGreen, "[  SKIPPED ] ");
+    printf("%s.\n", FormatTestCount(skipped_test_count).c_str());
+  }
+
+  int num_disabled = unit_test.reportable_disabled_test_count();
+  if (num_disabled && !GTEST_FLAG_GET(also_run_disabled_tests)) {
+    if (unit_test.Passed()) {
+      printf("\n");  // Add a spacer if no FAILURE banner is displayed.
+    }
+    ColoredPrintf(GTestColor::kYellow, "  YOU HAVE %d DISABLED %s\n\n",
+                  num_disabled, num_disabled == 1 ? "TEST" : "TESTS");
+  }
+  // Ensure that Google Test output is printed before, e.g., heapchecker output.
+  fflush(stdout);
+}
+
+// End BriefUnitTestResultPrinter
+
 // class TestEventRepeater
 //
 // This class forwards events to other event listeners.
@@ -3388,7 +3758,7 @@ class TestEventRepeater : public TestEventListener {
  public:
   TestEventRepeater() : forwarding_enabled_(true) {}
   ~TestEventRepeater() override;
-  void Append(TestEventListener *listener);
+  void Append(TestEventListener* listener);
   TestEventListener* Release(TestEventListener* listener);
 
   // Controls whether events will be forwarded to listeners_. Set to false
@@ -3406,6 +3776,7 @@ class TestEventRepeater : public TestEventListener {
 #endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
   void OnTestSuiteStart(const TestSuite& parameter) override;
   void OnTestStart(const TestInfo& test_info) override;
+  void OnTestDisabled(const TestInfo& test_info) override;
   void OnTestPartResult(const TestPartResult& result) override;
   void OnTestEnd(const TestInfo& test_info) override;
 //  Legacy API is deprecated but still available
@@ -3425,18 +3796,19 @@ class TestEventRepeater : public TestEventListener {
   // The list of listeners that receive events.
   std::vector<TestEventListener*> listeners_;
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventRepeater);
+  TestEventRepeater(const TestEventRepeater&) = delete;
+  TestEventRepeater& operator=(const TestEventRepeater&) = delete;
 };
 
 TestEventRepeater::~TestEventRepeater() {
   ForEach(listeners_, Delete<TestEventListener>);
 }
 
-void TestEventRepeater::Append(TestEventListener *listener) {
+void TestEventRepeater::Append(TestEventListener* listener) {
   listeners_.push_back(listener);
 }
 
-TestEventListener* TestEventRepeater::Release(TestEventListener *listener) {
+TestEventListener* TestEventRepeater::Release(TestEventListener* listener) {
   for (size_t i = 0; i < listeners_.size(); ++i) {
     if (listeners_[i] == listener) {
       listeners_.erase(listeners_.begin() + static_cast<int>(i));
@@ -3449,14 +3821,14 @@ TestEventListener* TestEventRepeater::Release(TestEventListener *listener) {
 
 // Since most methods are very similar, use macros to reduce boilerplate.
 // This defines a member that forwards the call to all listeners.
-#define GTEST_REPEATER_METHOD_(Name, Type) \
-void TestEventRepeater::Name(const Type& parameter) { \
-  if (forwarding_enabled_) { \
-    for (size_t i = 0; i < listeners_.size(); i++) { \
-      listeners_[i]->Name(parameter); \
-    } \
-  } \
-}
+#define GTEST_REPEATER_METHOD_(Name, Type)              \
+  void TestEventRepeater::Name(const Type& parameter) { \
+    if (forwarding_enabled_) {                          \
+      for (size_t i = 0; i < listeners_.size(); i++) {  \
+        listeners_[i]->Name(parameter);                 \
+      }                                                 \
+    }                                                   \
+  }
 // This defines a member that forwards the call to all listeners in reverse
 // order.
 #define GTEST_REVERSE_REPEATER_METHOD_(Name, Type)      \
@@ -3476,6 +3848,7 @@ GTEST_REPEATER_METHOD_(OnTestCaseStart, TestSuite)
 #endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 GTEST_REPEATER_METHOD_(OnTestSuiteStart, TestSuite)
 GTEST_REPEATER_METHOD_(OnTestStart, TestInfo)
+GTEST_REPEATER_METHOD_(OnTestDisabled, TestInfo)
 GTEST_REPEATER_METHOD_(OnTestPartResult, TestPartResult)
 GTEST_REPEATER_METHOD_(OnEnvironmentsTearDownStart, UnitTest)
 GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsSetUpEnd, UnitTest)
@@ -3526,12 +3899,13 @@ class XmlUnitTestResultPrinter : public EmptyTestEventListener {
  private:
   // Is c a whitespace character that is normalized to a space character
   // when it appears in an XML attribute value?
-  static bool IsNormalizableWhitespace(char c) {
-    return c == 0x9 || c == 0xA || c == 0xD;
+  static bool IsNormalizableWhitespace(unsigned char c) {
+    return c == '\t' || c == '\n' || c == '\r';
   }
 
   // May c appear in a well-formed XML document?
-  static bool IsValidXmlCharacter(char c) {
+  // https://www.w3.org/TR/REC-xml/#charsets
+  static bool IsValidXmlCharacter(unsigned char c) {
     return IsNormalizableWhitespace(c) || c >= 0x20;
   }
 
@@ -3564,6 +3938,16 @@ class XmlUnitTestResultPrinter : public EmptyTestEventListener {
   // Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
   static void OutputXmlCDataSection(::std::ostream* stream, const char* data);
 
+  // Streams a test suite XML stanza containing the given test result.
+  //
+  // Requires: result.Failed()
+  static void OutputXmlTestSuiteForTestResult(::std::ostream* stream,
+                                              const TestResult& result);
+
+  // Streams an XML representation of a TestResult object.
+  static void OutputXmlTestResult(::std::ostream* stream,
+                                  const TestResult& result);
+
   // Streams an XML representation of a TestInfo object.
   static void OutputXmlTestInfo(::std::ostream* stream,
                                 const char* test_suite_name,
@@ -3591,7 +3975,8 @@ class XmlUnitTestResultPrinter : public EmptyTestEventListener {
   // The output file.
   const std::string output_file_;
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(XmlUnitTestResultPrinter);
+  XmlUnitTestResultPrinter(const XmlUnitTestResultPrinter&) = delete;
+  XmlUnitTestResultPrinter& operator=(const XmlUnitTestResultPrinter&) = delete;
 };
 
 // Creates a new XmlUnitTestResultPrinter.
@@ -3631,8 +4016,8 @@ void XmlUnitTestResultPrinter::ListTestsMatchingFilter(
 // module will consist of ordinary English text.
 // If this module is ever modified to produce version 1.1 XML output,
 // most invalid characters can be retained using character references.
-std::string XmlUnitTestResultPrinter::EscapeXml(
-    const std::string& str, bool is_attribute) {
+std::string XmlUnitTestResultPrinter::EscapeXml(const std::string& str,
+                                                bool is_attribute) {
   Message m;
 
   for (size_t i = 0; i < str.size(); ++i) {
@@ -3660,8 +4045,9 @@ std::string XmlUnitTestResultPrinter::EscapeXml(
           m << '"';
         break;
       default:
-        if (IsValidXmlCharacter(ch)) {
-          if (is_attribute && IsNormalizableWhitespace(ch))
+        if (IsValidXmlCharacter(static_cast<unsigned char>(ch))) {
+          if (is_attribute &&
+              IsNormalizableWhitespace(static_cast<unsigned char>(ch)))
             m << "&#x" << String::FormatByte(static_cast<unsigned char>(ch))
               << ";";
           else
@@ -3682,7 +4068,7 @@ std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters(
   std::string output;
   output.reserve(str.size());
   for (std::string::const_iterator it = str.begin(); it != str.end(); ++it)
-    if (IsValidXmlCharacter(*it))
+    if (IsValidXmlCharacter(static_cast<unsigned char>(*it)))
       output.push_back(*it);
 
   return output;
@@ -3690,7 +4076,6 @@ std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters(
 
 // The following routines generate an XML representation of a UnitTest
 // object.
-// GOOGLETEST_CM0009 DO NOT DELETE
 //
 // This is how Google Test concepts map to the DTD:
 //
@@ -3722,6 +4107,10 @@ static bool PortableLocaltime(time_t seconds, struct tm* out) {
   if (tm_ptr == nullptr) return false;
   *out = *tm_ptr;
   return true;
+#elif defined(__STDC_LIB_EXT1__)
+  // Uses localtime_s when available as localtime_r is only available from
+  // C23 standard.
+  return localtime_s(&seconds, out) != nullptr;
 #else
   return localtime_r(&seconds, out) != nullptr;
 #endif
@@ -3733,13 +4122,14 @@ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms) {
   struct tm time_struct;
   if (!PortableLocaltime(static_cast<time_t>(ms / 1000), &time_struct))
     return "";
-  // YYYY-MM-DDThh:mm:ss
+  // YYYY-MM-DDThh:mm:ss.sss
   return StreamableToString(time_struct.tm_year + 1900) + "-" +
-      String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
-      String::FormatIntWidth2(time_struct.tm_mday) + "T" +
-      String::FormatIntWidth2(time_struct.tm_hour) + ":" +
-      String::FormatIntWidth2(time_struct.tm_min) + ":" +
-      String::FormatIntWidth2(time_struct.tm_sec);
+         String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
+         String::FormatIntWidth2(time_struct.tm_mday) + "T" +
+         String::FormatIntWidth2(time_struct.tm_hour) + ":" +
+         String::FormatIntWidth2(time_struct.tm_min) + ":" +
+         String::FormatIntWidth2(time_struct.tm_sec) + "." +
+         String::FormatIntWidthN(static_cast<int>(ms % 1000), 3);
 }
 
 // Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
@@ -3750,8 +4140,8 @@ void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream,
   for (;;) {
     const char* const next_segment = strstr(segment, "]]>");
     if (next_segment != nullptr) {
-      stream->write(
-          segment, static_cast<std::streamsize>(next_segment - segment));
+      stream->write(segment,
+                    static_cast<std::streamsize>(next_segment - segment));
       *stream << "]]>]]&gt;<![CDATA[";
       segment = next_segment + strlen("]]>");
     } else {
@@ -3763,21 +4153,56 @@ void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream,
 }
 
 void XmlUnitTestResultPrinter::OutputXmlAttribute(
-    std::ostream* stream,
-    const std::string& element_name,
-    const std::string& name,
-    const std::string& value) {
+    std::ostream* stream, const std::string& element_name,
+    const std::string& name, const std::string& value) {
   const std::vector<std::string>& allowed_names =
       GetReservedOutputAttributesForElement(element_name);
 
   GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
-                   allowed_names.end())
+               allowed_names.end())
       << "Attribute " << name << " is not allowed for element <" << element_name
       << ">.";
 
   *stream << " " << name << "=\"" << EscapeXmlAttribute(value) << "\"";
 }
 
+// Streams a test suite XML stanza containing the given test result.
+void XmlUnitTestResultPrinter::OutputXmlTestSuiteForTestResult(
+    ::std::ostream* stream, const TestResult& result) {
+  // Output the boilerplate for a minimal test suite with one test.
+  *stream << "  <testsuite";
+  OutputXmlAttribute(stream, "testsuite", "name", "NonTestSuiteFailure");
+  OutputXmlAttribute(stream, "testsuite", "tests", "1");
+  OutputXmlAttribute(stream, "testsuite", "failures", "1");
+  OutputXmlAttribute(stream, "testsuite", "disabled", "0");
+  OutputXmlAttribute(stream, "testsuite", "skipped", "0");
+  OutputXmlAttribute(stream, "testsuite", "errors", "0");
+  OutputXmlAttribute(stream, "testsuite", "time",
+                     FormatTimeInMillisAsSeconds(result.elapsed_time()));
+  OutputXmlAttribute(
+      stream, "testsuite", "timestamp",
+      FormatEpochTimeInMillisAsIso8601(result.start_timestamp()));
+  *stream << ">";
+
+  // Output the boilerplate for a minimal test case with a single test.
+  *stream << "    <testcase";
+  OutputXmlAttribute(stream, "testcase", "name", "");
+  OutputXmlAttribute(stream, "testcase", "status", "run");
+  OutputXmlAttribute(stream, "testcase", "result", "completed");
+  OutputXmlAttribute(stream, "testcase", "classname", "");
+  OutputXmlAttribute(stream, "testcase", "time",
+                     FormatTimeInMillisAsSeconds(result.elapsed_time()));
+  OutputXmlAttribute(
+      stream, "testcase", "timestamp",
+      FormatEpochTimeInMillisAsIso8601(result.start_timestamp()));
+
+  // Output the actual test result.
+  OutputXmlTestResult(stream, result);
+
+  // Complete the test suite.
+  *stream << "  </testsuite>\n";
+}
+
 // Prints an XML representation of a TestInfo object.
 void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream,
                                                  const char* test_suite_name,
@@ -3800,10 +4225,11 @@ void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream,
     OutputXmlAttribute(stream, kTestsuite, "type_param",
                        test_info.type_param());
   }
-  if (GTEST_FLAG(list_tests)) {
-    OutputXmlAttribute(stream, kTestsuite, "file", test_info.file());
-    OutputXmlAttribute(stream, kTestsuite, "line",
-                       StreamableToString(test_info.line()));
+
+  OutputXmlAttribute(stream, kTestsuite, "file", test_info.file());
+  OutputXmlAttribute(stream, kTestsuite, "line",
+                     StreamableToString(test_info.line()));
+  if (GTEST_FLAG_GET(list_tests)) {
     *stream << " />\n";
     return;
   }
@@ -3821,30 +4247,48 @@ void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream,
       FormatEpochTimeInMillisAsIso8601(result.start_timestamp()));
   OutputXmlAttribute(stream, kTestsuite, "classname", test_suite_name);
 
+  OutputXmlTestResult(stream, result);
+}
+
+void XmlUnitTestResultPrinter::OutputXmlTestResult(::std::ostream* stream,
+                                                   const TestResult& result) {
   int failures = 0;
+  int skips = 0;
   for (int i = 0; i < result.total_part_count(); ++i) {
     const TestPartResult& part = result.GetTestPartResult(i);
     if (part.failed()) {
-      if (++failures == 1) {
+      if (++failures == 1 && skips == 0) {
         *stream << ">\n";
       }
       const std::string location =
           internal::FormatCompilerIndependentFileLocation(part.file_name(),
                                                           part.line_number());
       const std::string summary = location + "\n" + part.summary();
-      *stream << "      <failure message=\""
-              << EscapeXmlAttribute(summary.c_str())
+      *stream << "      <failure message=\"" << EscapeXmlAttribute(summary)
               << "\" type=\"\">";
       const std::string detail = location + "\n" + part.message();
       OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str());
       *stream << "</failure>\n";
+    } else if (part.skipped()) {
+      if (++skips == 1 && failures == 0) {
+        *stream << ">\n";
+      }
+      const std::string location =
+          internal::FormatCompilerIndependentFileLocation(part.file_name(),
+                                                          part.line_number());
+      const std::string summary = location + "\n" + part.summary();
+      *stream << "      <skipped message=\""
+              << EscapeXmlAttribute(summary.c_str()) << "\">";
+      const std::string detail = location + "\n" + part.message();
+      OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str());
+      *stream << "</skipped>\n";
     }
   }
 
-  if (failures == 0 && result.test_property_count() == 0) {
+  if (failures == 0 && skips == 0 && result.test_property_count() == 0) {
     *stream << " />\n";
   } else {
-    if (failures == 0) {
+    if (failures == 0 && skips == 0) {
       *stream << ">\n";
     }
     OutputXmlTestProperties(stream, result);
@@ -3860,13 +4304,17 @@ void XmlUnitTestResultPrinter::PrintXmlTestSuite(std::ostream* stream,
   OutputXmlAttribute(stream, kTestsuite, "name", test_suite.name());
   OutputXmlAttribute(stream, kTestsuite, "tests",
                      StreamableToString(test_suite.reportable_test_count()));
-  if (!GTEST_FLAG(list_tests)) {
+  if (!GTEST_FLAG_GET(list_tests)) {
     OutputXmlAttribute(stream, kTestsuite, "failures",
                        StreamableToString(test_suite.failed_test_count()));
     OutputXmlAttribute(
         stream, kTestsuite, "disabled",
         StreamableToString(test_suite.reportable_disabled_test_count()));
+    OutputXmlAttribute(stream, kTestsuite, "skipped",
+                       StreamableToString(test_suite.skipped_test_count()));
+
     OutputXmlAttribute(stream, kTestsuite, "errors", "0");
+
     OutputXmlAttribute(stream, kTestsuite, "time",
                        FormatTimeInMillisAsSeconds(test_suite.elapsed_time()));
     OutputXmlAttribute(
@@ -3904,7 +4352,7 @@ void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream,
       stream, kTestsuites, "timestamp",
       FormatEpochTimeInMillisAsIso8601(unit_test.start_timestamp()));
 
-  if (GTEST_FLAG(shuffle)) {
+  if (GTEST_FLAG_GET(shuffle)) {
     OutputXmlAttribute(stream, kTestsuites, "random_seed",
                        StreamableToString(unit_test.random_seed()));
   }
@@ -3917,6 +4365,13 @@ void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream,
     if (unit_test.GetTestSuite(i)->reportable_test_count() > 0)
       PrintXmlTestSuite(stream, *unit_test.GetTestSuite(i));
   }
+
+  // If there was a test failure outside of one of the test suites (like in a
+  // test environment) include that in the output.
+  if (unit_test.ad_hoc_test_result().Failed()) {
+    OutputXmlTestSuiteForTestResult(stream, unit_test.ad_hoc_test_result());
+  }
+
   *stream << "</" << kTestsuites << ">\n";
 }
 
@@ -3950,7 +4405,7 @@ std::string XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes(
   for (int i = 0; i < result.test_property_count(); ++i) {
     const TestProperty& property = result.GetTestProperty(i);
     attributes << " " << property.key() << "="
-        << "\"" << EscapeXmlAttribute(property.value()) << "\"";
+               << "\"" << EscapeXmlAttribute(property.value()) << "\"";
   }
   return attributes.GetString();
 }
@@ -3964,15 +4419,15 @@ void XmlUnitTestResultPrinter::OutputXmlTestProperties(
     return;
   }
 
-  *stream << "<" << kProperties << ">\n";
+  *stream << "      <" << kProperties << ">\n";
   for (int i = 0; i < result.test_property_count(); ++i) {
     const TestProperty& property = result.GetTestProperty(i);
-    *stream << "<" << kProperty;
+    *stream << "        <" << kProperty;
     *stream << " name=\"" << EscapeXmlAttribute(property.key()) << "\"";
     *stream << " value=\"" << EscapeXmlAttribute(property.value()) << "\"";
     *stream << "/>\n";
   }
-  *stream << "</" << kProperties << ">\n";
+  *stream << "      </" << kProperties << ">\n";
 }
 
 // End XmlUnitTestResultPrinter
@@ -3996,16 +4451,22 @@ class JsonUnitTestResultPrinter : public EmptyTestEventListener {
   //// streams the attribute as JSON.
   static void OutputJsonKey(std::ostream* stream,
                             const std::string& element_name,
-                            const std::string& name,
-                            const std::string& value,
-                            const std::string& indent,
-                            bool comma = true);
+                            const std::string& name, const std::string& value,
+                            const std::string& indent, bool comma = true);
   static void OutputJsonKey(std::ostream* stream,
                             const std::string& element_name,
-                            const std::string& name,
-                            int value,
-                            const std::string& indent,
-                            bool comma = true);
+                            const std::string& name, int value,
+                            const std::string& indent, bool comma = true);
+
+  // Streams a test suite JSON stanza containing the given test result.
+  //
+  // Requires: result.Failed()
+  static void OutputJsonTestSuiteForTestResult(::std::ostream* stream,
+                                               const TestResult& result);
+
+  // Streams a JSON representation of a TestResult object.
+  static void OutputJsonTestResult(::std::ostream* stream,
+                                   const TestResult& result);
 
   // Streams a JSON representation of a TestInfo object.
   static void OutputJsonTestInfo(::std::ostream* stream,
@@ -4028,7 +4489,9 @@ class JsonUnitTestResultPrinter : public EmptyTestEventListener {
   // The output file.
   const std::string output_file_;
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(JsonUnitTestResultPrinter);
+  JsonUnitTestResultPrinter(const JsonUnitTestResultPrinter&) = delete;
+  JsonUnitTestResultPrinter& operator=(const JsonUnitTestResultPrinter&) =
+      delete;
 };
 
 // Creates a new JsonUnitTestResultPrinter.
@@ -4040,7 +4503,7 @@ JsonUnitTestResultPrinter::JsonUnitTestResultPrinter(const char* output_file)
 }
 
 void JsonUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
-                                                  int /*iteration*/) {
+                                                   int /*iteration*/) {
   FILE* jsonout = OpenFileForWriting(output_file_);
   std::stringstream stream;
   PrintJsonUnitTest(&stream, unit_test);
@@ -4106,55 +4569,90 @@ static std::string FormatEpochTimeInMillisAsRFC3339(TimeInMillis ms) {
     return "";
   // YYYY-MM-DDThh:mm:ss
   return StreamableToString(time_struct.tm_year + 1900) + "-" +
-      String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
-      String::FormatIntWidth2(time_struct.tm_mday) + "T" +
-      String::FormatIntWidth2(time_struct.tm_hour) + ":" +
-      String::FormatIntWidth2(time_struct.tm_min) + ":" +
-      String::FormatIntWidth2(time_struct.tm_sec) + "Z";
+         String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
+         String::FormatIntWidth2(time_struct.tm_mday) + "T" +
+         String::FormatIntWidth2(time_struct.tm_hour) + ":" +
+         String::FormatIntWidth2(time_struct.tm_min) + ":" +
+         String::FormatIntWidth2(time_struct.tm_sec) + "Z";
 }
 
 static inline std::string Indent(size_t width) {
   return std::string(width, ' ');
 }
 
-void JsonUnitTestResultPrinter::OutputJsonKey(
-    std::ostream* stream,
-    const std::string& element_name,
-    const std::string& name,
-    const std::string& value,
-    const std::string& indent,
-    bool comma) {
+void JsonUnitTestResultPrinter::OutputJsonKey(std::ostream* stream,
+                                              const std::string& element_name,
+                                              const std::string& name,
+                                              const std::string& value,
+                                              const std::string& indent,
+                                              bool comma) {
   const std::vector<std::string>& allowed_names =
       GetReservedOutputAttributesForElement(element_name);
 
   GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
-                   allowed_names.end())
+               allowed_names.end())
       << "Key \"" << name << "\" is not allowed for value \"" << element_name
       << "\".";
 
   *stream << indent << "\"" << name << "\": \"" << EscapeJson(value) << "\"";
-  if (comma)
-    *stream << ",\n";
+  if (comma) *stream << ",\n";
 }
 
 void JsonUnitTestResultPrinter::OutputJsonKey(
-    std::ostream* stream,
-    const std::string& element_name,
-    const std::string& name,
-    int value,
-    const std::string& indent,
-    bool comma) {
+    std::ostream* stream, const std::string& element_name,
+    const std::string& name, int value, const std::string& indent, bool comma) {
   const std::vector<std::string>& allowed_names =
       GetReservedOutputAttributesForElement(element_name);
 
   GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
-                   allowed_names.end())
+               allowed_names.end())
       << "Key \"" << name << "\" is not allowed for value \"" << element_name
       << "\".";
 
   *stream << indent << "\"" << name << "\": " << StreamableToString(value);
-  if (comma)
-    *stream << ",\n";
+  if (comma) *stream << ",\n";
+}
+
+// Streams a test suite JSON stanza containing the given test result.
+void JsonUnitTestResultPrinter::OutputJsonTestSuiteForTestResult(
+    ::std::ostream* stream, const TestResult& result) {
+  // Output the boilerplate for a new test suite.
+  *stream << Indent(4) << "{\n";
+  OutputJsonKey(stream, "testsuite", "name", "NonTestSuiteFailure", Indent(6));
+  OutputJsonKey(stream, "testsuite", "tests", 1, Indent(6));
+  if (!GTEST_FLAG_GET(list_tests)) {
+    OutputJsonKey(stream, "testsuite", "failures", 1, Indent(6));
+    OutputJsonKey(stream, "testsuite", "disabled", 0, Indent(6));
+    OutputJsonKey(stream, "testsuite", "skipped", 0, Indent(6));
+    OutputJsonKey(stream, "testsuite", "errors", 0, Indent(6));
+    OutputJsonKey(stream, "testsuite", "time",
+                  FormatTimeInMillisAsDuration(result.elapsed_time()),
+                  Indent(6));
+    OutputJsonKey(stream, "testsuite", "timestamp",
+                  FormatEpochTimeInMillisAsRFC3339(result.start_timestamp()),
+                  Indent(6));
+  }
+  *stream << Indent(6) << "\"testsuite\": [\n";
+
+  // Output the boilerplate for a new test case.
+  *stream << Indent(8) << "{\n";
+  OutputJsonKey(stream, "testcase", "name", "", Indent(10));
+  OutputJsonKey(stream, "testcase", "status", "RUN", Indent(10));
+  OutputJsonKey(stream, "testcase", "result", "COMPLETED", Indent(10));
+  OutputJsonKey(stream, "testcase", "timestamp",
+                FormatEpochTimeInMillisAsRFC3339(result.start_timestamp()),
+                Indent(10));
+  OutputJsonKey(stream, "testcase", "time",
+                FormatTimeInMillisAsDuration(result.elapsed_time()),
+                Indent(10));
+  OutputJsonKey(stream, "testcase", "classname", "", Indent(10), false);
+  *stream << TestPropertiesAsJson(result, Indent(10));
+
+  // Output the actual test result.
+  OutputJsonTestResult(stream, result);
+
+  // Finish the test suite.
+  *stream << "\n" << Indent(6) << "]\n" << Indent(4) << "}";
 }
 
 // Prints a JSON representation of a TestInfo object.
@@ -4176,11 +4674,14 @@ void JsonUnitTestResultPrinter::OutputJsonTestInfo(::std::ostream* stream,
     OutputJsonKey(stream, kTestsuite, "type_param", test_info.type_param(),
                   kIndent);
   }
-  if (GTEST_FLAG(list_tests)) {
-    OutputJsonKey(stream, kTestsuite, "file", test_info.file(), kIndent);
-    OutputJsonKey(stream, kTestsuite, "line", test_info.line(), kIndent, false);
+
+  OutputJsonKey(stream, kTestsuite, "file", test_info.file(), kIndent);
+  OutputJsonKey(stream, kTestsuite, "line", test_info.line(), kIndent, false);
+  if (GTEST_FLAG_GET(list_tests)) {
     *stream << "\n" << Indent(8) << "}";
     return;
+  } else {
+    *stream << ",\n";
   }
 
   OutputJsonKey(stream, kTestsuite, "status",
@@ -4199,13 +4700,22 @@ void JsonUnitTestResultPrinter::OutputJsonTestInfo(::std::ostream* stream,
                 false);
   *stream << TestPropertiesAsJson(result, kIndent);
 
+  OutputJsonTestResult(stream, result);
+}
+
+void JsonUnitTestResultPrinter::OutputJsonTestResult(::std::ostream* stream,
+                                                     const TestResult& result) {
+  const std::string kIndent = Indent(10);
+
   int failures = 0;
   for (int i = 0; i < result.total_part_count(); ++i) {
     const TestPartResult& part = result.GetTestPartResult(i);
     if (part.failed()) {
       *stream << ",\n";
       if (++failures == 1) {
-        *stream << kIndent << "\"" << "failures" << "\": [\n";
+        *stream << kIndent << "\""
+                << "failures"
+                << "\": [\n";
       }
       const std::string location =
           internal::FormatCompilerIndependentFileLocation(part.file_name(),
@@ -4218,8 +4728,7 @@ void JsonUnitTestResultPrinter::OutputJsonTestInfo(::std::ostream* stream,
     }
   }
 
-  if (failures > 0)
-    *stream << "\n" << kIndent << "]";
+  if (failures > 0) *stream << "\n" << kIndent << "]";
   *stream << "\n" << Indent(8) << "}";
 }
 
@@ -4233,7 +4742,7 @@ void JsonUnitTestResultPrinter::PrintJsonTestSuite(
   OutputJsonKey(stream, kTestsuite, "name", test_suite.name(), kIndent);
   OutputJsonKey(stream, kTestsuite, "tests", test_suite.reportable_test_count(),
                 kIndent);
-  if (!GTEST_FLAG(list_tests)) {
+  if (!GTEST_FLAG_GET(list_tests)) {
     OutputJsonKey(stream, kTestsuite, "failures",
                   test_suite.failed_test_count(), kIndent);
     OutputJsonKey(stream, kTestsuite, "disabled",
@@ -4280,7 +4789,7 @@ void JsonUnitTestResultPrinter::PrintJsonUnitTest(std::ostream* stream,
   OutputJsonKey(stream, kTestsuites, "disabled",
                 unit_test.reportable_disabled_test_count(), kIndent);
   OutputJsonKey(stream, kTestsuites, "errors", 0, kIndent);
-  if (GTEST_FLAG(shuffle)) {
+  if (GTEST_FLAG_GET(shuffle)) {
     OutputJsonKey(stream, kTestsuites, "random_seed", unit_test.random_seed(),
                   kIndent);
   }
@@ -4309,7 +4818,15 @@ void JsonUnitTestResultPrinter::PrintJsonUnitTest(std::ostream* stream,
     }
   }
 
-  *stream << "\n" << kIndent << "]\n" << "}\n";
+  // If there was a test failure outside of one of the test suites (like in a
+  // test environment) include that in the output.
+  if (unit_test.ad_hoc_test_result().Failed()) {
+    OutputJsonTestSuiteForTestResult(stream, unit_test.ad_hoc_test_result());
+  }
+
+  *stream << "\n"
+          << kIndent << "]\n"
+          << "}\n";
 }
 
 void JsonUnitTestResultPrinter::PrintJsonTestList(
@@ -4344,7 +4861,8 @@ std::string JsonUnitTestResultPrinter::TestPropertiesAsJson(
   Message attributes;
   for (int i = 0; i < result.test_property_count(); ++i) {
     const TestProperty& property = result.GetTestProperty(i);
-    attributes << ",\n" << indent << "\"" << property.key() << "\": "
+    attributes << ",\n"
+               << indent << "\"" << property.key() << "\": "
                << "\"" << EscapeJson(property.value()) << "\"";
   }
   return attributes.GetString();
@@ -4384,14 +4902,14 @@ void StreamingListener::SocketWriter::MakeConnection() {
 
   addrinfo hints;
   memset(&hints, 0, sizeof(hints));
-  hints.ai_family = AF_UNSPEC;    // To allow both IPv4 and IPv6 addresses.
+  hints.ai_family = AF_UNSPEC;  // To allow both IPv4 and IPv6 addresses.
   hints.ai_socktype = SOCK_STREAM;
   addrinfo* servinfo = nullptr;
 
   // Use the getaddrinfo() to get a linked list of IP addresses for
   // the given host name.
-  const int error_num = getaddrinfo(
-      host_name_.c_str(), port_num_.c_str(), &hints, &servinfo);
+  const int error_num =
+      getaddrinfo(host_name_.c_str(), port_num_.c_str(), &hints, &servinfo);
   if (error_num != 0) {
     GTEST_LOG_(WARNING) << "stream_result_to: getaddrinfo() failed: "
                         << gai_strerror(error_num);
@@ -4400,8 +4918,8 @@ void StreamingListener::SocketWriter::MakeConnection() {
   // Loop through all the results and connect to the first we can.
   for (addrinfo* cur_addr = servinfo; sockfd_ == -1 && cur_addr != nullptr;
        cur_addr = cur_addr->ai_next) {
-    sockfd_ = socket(
-        cur_addr->ai_family, cur_addr->ai_socktype, cur_addr->ai_protocol);
+    sockfd_ = socket(cur_addr->ai_family, cur_addr->ai_socktype,
+                     cur_addr->ai_protocol);
     if (sockfd_ != -1) {
       // Connect the client socket to the server socket.
       if (connect(sockfd_, cur_addr->ai_addr, cur_addr->ai_addrlen) == -1) {
@@ -4451,7 +4969,7 @@ std::string OsStackTraceGetter::CurrentStackTrace(int max_depth, int skip_count)
 
   for (int i = 0; i < raw_stack_size; ++i) {
     if (raw_stack[i] == caller_frame &&
-        !GTEST_FLAG(show_internal_stack_frames)) {
+        !GTEST_FLAG_GET(show_internal_stack_frames)) {
       // Add a marker to the trace and stop adding frames.
       absl::StrAppend(&result, kElidedFramesMarker, "\n");
       break;
@@ -4470,7 +4988,7 @@ std::string OsStackTraceGetter::CurrentStackTrace(int max_depth, int skip_count)
 
   return result;
 
-#else  // !GTEST_HAS_ABSL
+#else   // !GTEST_HAS_ABSL
   static_cast<void>(max_depth);
   static_cast<void>(skip_count);
   return "";
@@ -4494,20 +5012,21 @@ void OsStackTraceGetter::UponLeavingGTest() GTEST_LOCK_EXCLUDED_(mutex_) {
 class ScopedPrematureExitFile {
  public:
   explicit ScopedPrematureExitFile(const char* premature_exit_filepath)
-      : premature_exit_filepath_(premature_exit_filepath ?
-                                 premature_exit_filepath : "") {
+      : premature_exit_filepath_(
+            premature_exit_filepath ? premature_exit_filepath : "") {
     // If a path to the premature-exit file is specified...
     if (!premature_exit_filepath_.empty()) {
       // create the file with a single "0" character in it.  I/O
       // errors are ignored as there's nothing better we can do and we
       // don't want to fail the test because of this.
-      FILE* pfile = posix::FOpen(premature_exit_filepath, "w");
+      FILE* pfile = posix::FOpen(premature_exit_filepath_.c_str(), "w");
       fwrite("0", 1, 1, pfile);
       fclose(pfile);
     }
   }
 
   ~ScopedPrematureExitFile() {
+#if !defined GTEST_OS_ESP8266
     if (!premature_exit_filepath_.empty()) {
       int retval = remove(premature_exit_filepath_.c_str());
       if (retval) {
@@ -4516,12 +5035,14 @@ class ScopedPrematureExitFile {
                           << retval;
       }
     }
+#endif
   }
 
  private:
   const std::string premature_exit_filepath_;
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedPrematureExitFile);
+  ScopedPrematureExitFile(const ScopedPrematureExitFile&) = delete;
+  ScopedPrematureExitFile& operator=(const ScopedPrematureExitFile&) = delete;
 };
 
 }  // namespace internal
@@ -4695,7 +5216,7 @@ int UnitTest::test_to_run_count() const { return impl()->test_to_run_count(); }
 // Gets the time of the test program start, in ms from the start of the
 // UNIX epoch.
 internal::TimeInMillis UnitTest::start_timestamp() const {
-    return impl()->start_timestamp();
+  return impl()->start_timestamp();
 }
 
 // Gets the elapsed time, in milliseconds.
@@ -4738,9 +5259,7 @@ TestSuite* UnitTest::GetMutableTestSuite(int i) {
 
 // Returns the list of event listeners that can be used to track events
 // inside Google Test.
-TestEventListeners& UnitTest::listeners() {
-  return *impl()->listeners();
-}
+TestEventListeners& UnitTest::listeners() { return *impl()->listeners(); }
 
 // Registers and returns a global test environment.  When a test
 // program is run, all global test environments will be set-up in the
@@ -4765,12 +5284,11 @@ Environment* UnitTest::AddEnvironment(Environment* env) {
 // assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) eventually call
 // this to report their results.  The user code should use the
 // assertion macros instead of calling this directly.
-void UnitTest::AddTestPartResult(
-    TestPartResult::Type result_type,
-    const char* file_name,
-    int line_number,
-    const std::string& message,
-    const std::string& os_stack_trace) GTEST_LOCK_EXCLUDED_(mutex_) {
+void UnitTest::AddTestPartResult(TestPartResult::Type result_type,
+                                 const char* file_name, int line_number,
+                                 const std::string& message,
+                                 const std::string& os_stack_trace)
+    GTEST_LOCK_EXCLUDED_(mutex_) {
   Message msg;
   msg << message;
 
@@ -4780,8 +5298,9 @@ void UnitTest::AddTestPartResult(
 
     for (size_t i = impl_->gtest_trace_stack().size(); i > 0; --i) {
       const internal::TraceInfo& trace = impl_->gtest_trace_stack()[i - 1];
-      msg << "\n" << internal::FormatFileLocation(trace.file, trace.line)
-          << " " << trace.message;
+      msg << "\n"
+          << internal::FormatFileLocation(trace.file, trace.line) << " "
+          << trace.message;
     }
   }
 
@@ -4791,8 +5310,8 @@ void UnitTest::AddTestPartResult(
 
   const TestPartResult result = TestPartResult(
       result_type, file_name, line_number, msg.GetString().c_str());
-  impl_->GetTestPartResultReporterForCurrentThread()->
-      ReportTestPartResult(result);
+  impl_->GetTestPartResultReporterForCurrentThread()->ReportTestPartResult(
+      result);
 
   if (result_type != TestPartResult::kSuccess &&
       result_type != TestPartResult::kSkip) {
@@ -4801,7 +5320,7 @@ void UnitTest::AddTestPartResult(
     // in the code (perhaps in order to use Google Test assertions
     // with another testing framework) and specify the former on the
     // command line for debugging.
-    if (GTEST_FLAG(break_on_failure)) {
+    if (GTEST_FLAG_GET(break_on_failure)) {
 #if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
       // Using DebugBreak on Windows allows gtest to still break into a debugger
       // when a failure happens and both the --gtest_break_on_failure and
@@ -4818,7 +5337,7 @@ void UnitTest::AddTestPartResult(
       // portability: some debuggers don't correctly trap abort().
       *static_cast<volatile int*>(nullptr) = 1;
 #endif  // GTEST_OS_WINDOWS
-    } else if (GTEST_FLAG(throw_on_failure)) {
+    } else if (GTEST_FLAG_GET(throw_on_failure)) {
 #if GTEST_HAS_EXCEPTIONS
       throw internal::GoogleTestFailureException(result);
 #else
@@ -4847,7 +5366,7 @@ void UnitTest::RecordProperty(const std::string& key,
 // from the main thread.
 int UnitTest::Run() {
   const bool in_death_test_child_process =
-      internal::GTEST_FLAG(internal_run_death_test).length() > 0;
+      GTEST_FLAG_GET(internal_run_death_test).length() > 0;
 
   // Google Test implements this protocol for catching that a test
   // program exits before returning control to Google Test:
@@ -4877,7 +5396,7 @@ int UnitTest::Run() {
 
   // Captures the value of GTEST_FLAG(catch_exceptions).  This value will be
   // used for the duration of the program.
-  impl()->set_catch_exceptions(GTEST_FLAG(catch_exceptions));
+  impl()->set_catch_exceptions(GTEST_FLAG_GET(catch_exceptions));
 
 #if GTEST_OS_WINDOWS
   // Either the user wants Google Test to catch exceptions thrown by the
@@ -4885,30 +5404,29 @@ int UnitTest::Run() {
   // process. In either case the user does not want to see pop-up dialogs
   // about crashes - they are expected.
   if (impl()->catch_exceptions() || in_death_test_child_process) {
-# if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
     // SetErrorMode doesn't exist on CE.
     SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOALIGNMENTFAULTEXCEPT |
                  SEM_NOGPFAULTERRORBOX | SEM_NOOPENFILEERRORBOX);
-# endif  // !GTEST_OS_WINDOWS_MOBILE
+#endif  // !GTEST_OS_WINDOWS_MOBILE
 
-# if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE
+#if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE
     // Death test children can be terminated with _abort().  On Windows,
     // _abort() can show a dialog with a warning message.  This forces the
     // abort message to go to stderr instead.
     _set_error_mode(_OUT_TO_STDERR);
-# endif
+#endif
 
-# if defined(_MSC_VER) && !GTEST_OS_WINDOWS_MOBILE
+#if defined(_MSC_VER) && !GTEST_OS_WINDOWS_MOBILE
     // In the debug version, Visual Studio pops up a separate dialog
     // offering a choice to debug the aborted program. We need to suppress
     // this dialog or it will pop up for every EXPECT/ASSERT_DEATH statement
     // executed. Google Test will notify the user of any unexpected
     // failure via stderr.
-    if (!GTEST_FLAG(break_on_failure))
+    if (!GTEST_FLAG_GET(break_on_failure))
       _set_abort_behavior(
           0x0,                                    // Clear the following flags:
           _WRITE_ABORT_MSG | _CALL_REPORTFAULT);  // pop-up window, core dump.
-# endif
 
     // In debug mode, the Windows CRT can crash with an assertion over invalid
     // input (e.g. passing an invalid file descriptor).  The default handling
@@ -4919,13 +5437,15 @@ int UnitTest::Run() {
                               _CRTDBG_MODE_FILE | _CRTDBG_MODE_DEBUG);
       (void)_CrtSetReportFile(_CRT_ASSERT, _CRTDBG_FILE_STDERR);
     }
+#endif
   }
 #endif  // GTEST_OS_WINDOWS
 
   return internal::HandleExceptionsInMethodIfSupported(
-      impl(),
-      &internal::UnitTestImpl::RunAllTests,
-      "auxiliary test code (environments or event listeners)") ? 0 : 1;
+             impl(), &internal::UnitTestImpl::RunAllTests,
+             "auxiliary test code (environments or event listeners)")
+             ? 0
+             : 1;
 }
 
 // Returns the working directory when the first TEST() or TEST_F() was
@@ -4970,14 +5490,10 @@ UnitTest::parameterized_test_registry() GTEST_LOCK_EXCLUDED_(mutex_) {
 }
 
 // Creates an empty UnitTest.
-UnitTest::UnitTest() {
-  impl_ = new internal::UnitTestImpl(this);
-}
+UnitTest::UnitTest() { impl_ = new internal::UnitTestImpl(this); }
 
 // Destructor of UnitTest.
-UnitTest::~UnitTest() {
-  delete impl_;
-}
+UnitTest::~UnitTest() { delete impl_; }
 
 // Pushes a trace defined by SCOPED_TRACE() on to the per-thread
 // Google Test trace stack.
@@ -4988,8 +5504,7 @@ void UnitTest::PushGTestTrace(const internal::TraceInfo& trace)
 }
 
 // Pops a trace from the per-thread Google Test trace stack.
-void UnitTest::PopGTestTrace()
-    GTEST_LOCK_EXCLUDED_(mutex_) {
+void UnitTest::PopGTestTrace() GTEST_LOCK_EXCLUDED_(mutex_) {
   internal::MutexLock lock(&mutex_);
   impl_->gtest_trace_stack().pop_back();
 }
@@ -5086,12 +5601,12 @@ void UnitTestImpl::ConfigureXmlOutput() {
 // Initializes event listeners for streaming test results in string form.
 // Must not be called before InitGoogleTest.
 void UnitTestImpl::ConfigureStreamingOutput() {
-  const std::string& target = GTEST_FLAG(stream_result_to);
+  const std::string& target = GTEST_FLAG_GET(stream_result_to);
   if (!target.empty()) {
     const size_t pos = target.find(':');
     if (pos != std::string::npos) {
-      listeners()->Append(new StreamingListener(target.substr(0, pos),
-                                                target.substr(pos+1)));
+      listeners()->Append(
+          new StreamingListener(target.substr(0, pos), target.substr(pos + 1)));
     } else {
       GTEST_LOG_(WARNING) << "unrecognized streaming target \"" << target
                           << "\" ignored.";
@@ -5129,13 +5644,17 @@ void UnitTestImpl::PostFlagParsingInit() {
     // to shut down the default XML output before invoking RUN_ALL_TESTS.
     ConfigureXmlOutput();
 
+    if (GTEST_FLAG_GET(brief)) {
+      listeners()->SetDefaultResultPrinter(new BriefUnitTestResultPrinter);
+    }
+
 #if GTEST_CAN_STREAM_RESULTS_
     // Configures listeners for streaming test results to the specified server.
     ConfigureStreamingOutput();
 #endif  // GTEST_CAN_STREAM_RESULTS_
 
 #if GTEST_HAS_ABSL
-    if (GTEST_FLAG(install_failure_signal_handler)) {
+    if (GTEST_FLAG_GET(install_failure_signal_handler)) {
       absl::FailureSignalHandlerOptions options;
       absl::InstallFailureSignalHandler(options);
     }
@@ -5174,10 +5693,10 @@ class TestSuiteNameIs {
 // Arguments:
 //
 //   test_suite_name: name of the test suite
-//   type_param:     the name of the test suite's type parameter, or NULL if
-//                   this is not a typed or a type-parameterized test suite.
-//   set_up_tc:      pointer to the function that sets up the test suite
-//   tear_down_tc:   pointer to the function that tears down the test suite
+//   type_param:      the name of the test suite's type parameter, or NULL if
+//                    this is not a typed or a type-parameterized test suite.
+//   set_up_tc:       pointer to the function that sets up the test suite
+//   tear_down_tc:    pointer to the function that tears down the test suite
 TestSuite* UnitTestImpl::GetTestSuite(
     const char* test_suite_name, const char* type_param,
     internal::SetUpTestSuiteFunc set_up_tc,
@@ -5193,9 +5712,9 @@ TestSuite* UnitTestImpl::GetTestSuite(
   auto* const new_test_suite =
       new TestSuite(test_suite_name, type_param, set_up_tc, tear_down_tc);
 
+  const UnitTestFilter death_test_suite_filter(kDeathTestSuiteFilter);
   // Is this a death test suite?
-  if (internal::UnitTestOptions::MatchesFilter(test_suite_name,
-                                               kDeathTestSuiteFilter)) {
+  if (death_test_suite_filter.MatchesName(test_suite_name)) {
     // Yes.  Inserts the test suite after the last death test suite
     // defined so far.  This only works when the test suites haven't
     // been shuffled.  Otherwise we may end up running a death test
@@ -5232,8 +5751,7 @@ bool UnitTestImpl::RunAllTests() {
   const bool gtest_is_initialized_before_run_all_tests = GTestIsInitialized();
 
   // Do not run any test if the --help flag was specified.
-  if (g_help_flag)
-    return true;
+  if (g_help_flag) return true;
 
   // Repeats the call to the post-flag parsing initialization in case the
   // user didn't call InitGoogleTest.
@@ -5251,11 +5769,11 @@ bool UnitTestImpl::RunAllTests() {
 #if GTEST_HAS_DEATH_TEST
   in_subprocess_for_death_test =
       (internal_run_death_test_flag_.get() != nullptr);
-# if defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
+#if defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
   if (in_subprocess_for_death_test) {
     GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_();
   }
-# endif  // defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
+#endif  // defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
 #endif  // GTEST_HAS_DEATH_TEST
 
   const bool should_shard = ShouldShard(kTestTotalShards, kTestShardIndex,
@@ -5263,19 +5781,18 @@ bool UnitTestImpl::RunAllTests() {
 
   // Compares the full test names with the filter to decide which
   // tests to run.
-  const bool has_tests_to_run = FilterTests(should_shard
-                                              ? HONOR_SHARDING_PROTOCOL
-                                              : IGNORE_SHARDING_PROTOCOL) > 0;
+  const bool has_tests_to_run =
+      FilterTests(should_shard ? HONOR_SHARDING_PROTOCOL
+                               : IGNORE_SHARDING_PROTOCOL) > 0;
 
   // Lists the tests and exits if the --gtest_list_tests flag was specified.
-  if (GTEST_FLAG(list_tests)) {
+  if (GTEST_FLAG_GET(list_tests)) {
     // This must be called *after* FilterTests() has been called.
     ListTestsMatchingFilter();
     return true;
   }
 
-  random_seed_ = GTEST_FLAG(shuffle) ?
-      GetRandomSeedFromFlag(GTEST_FLAG(random_seed)) : 0;
+  random_seed_ = GetRandomSeedFromFlag(GTEST_FLAG_GET(random_seed));
 
   // True if and only if at least one test has failed.
   bool failed = false;
@@ -5287,19 +5804,31 @@ bool UnitTestImpl::RunAllTests() {
 
   // How many times to repeat the tests?  We don't want to repeat them
   // when we are inside the subprocess of a death test.
-  const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG(repeat);
+  const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG_GET(repeat);
+
   // Repeats forever if the repeat count is negative.
   const bool gtest_repeat_forever = repeat < 0;
+
+  // Should test environments be set up and torn down for each repeat, or only
+  // set up on the first and torn down on the last iteration? If there is no
+  // "last" iteration because the tests will repeat forever, always recreate the
+  // environments to avoid leaks in case one of the environments is using
+  // resources that are external to this process. Without this check there would
+  // be no way to clean up those external resources automatically.
+  const bool recreate_environments_when_repeating =
+      GTEST_FLAG_GET(recreate_environments_when_repeating) ||
+      gtest_repeat_forever;
+
   for (int i = 0; gtest_repeat_forever || i != repeat; i++) {
     // We want to preserve failures generated by ad-hoc test
     // assertions executed before RUN_ALL_TESTS().
     ClearNonAdHocTestResult();
 
-    const TimeInMillis start = GetTimeInMillis();
+    Timer timer;
 
     // Shuffles test suites and tests if requested.
-    if (has_tests_to_run && GTEST_FLAG(shuffle)) {
-      random()->Reseed(static_cast<UInt32>(random_seed_));
+    if (has_tests_to_run && GTEST_FLAG_GET(shuffle)) {
+      random()->Reseed(static_cast<uint32_t>(random_seed_));
       // This should be done before calling OnTestIterationStart(),
       // such that a test event listener can see the actual test order
       // in the event.
@@ -5311,10 +5840,13 @@ bool UnitTestImpl::RunAllTests() {
 
     // Runs each test suite if there is at least one test to run.
     if (has_tests_to_run) {
-      // Sets up all environments beforehand.
-      repeater->OnEnvironmentsSetUpStart(*parent_);
-      ForEach(environments_, SetUpEnvironment);
-      repeater->OnEnvironmentsSetUpEnd(*parent_);
+      // Sets up all environments beforehand. If test environments aren't
+      // recreated for each iteration, only do so on the first iteration.
+      if (i == 0 || recreate_environments_when_repeating) {
+        repeater->OnEnvironmentsSetUpStart(*parent_);
+        ForEach(environments_, SetUpEnvironment);
+        repeater->OnEnvironmentsSetUpEnd(*parent_);
+      }
 
       // Runs the tests only if there was no fatal failure or skip triggered
       // during global set-up.
@@ -5336,17 +5868,36 @@ bool UnitTestImpl::RunAllTests() {
         for (int test_index = 0; test_index < total_test_suite_count();
              test_index++) {
           GetMutableSuiteCase(test_index)->Run();
+          if (GTEST_FLAG_GET(fail_fast) &&
+              GetMutableSuiteCase(test_index)->Failed()) {
+            for (int j = test_index + 1; j < total_test_suite_count(); j++) {
+              GetMutableSuiteCase(j)->Skip();
+            }
+            break;
+          }
+        }
+      } else if (Test::HasFatalFailure()) {
+        // If there was a fatal failure during the global setup then we know we
+        // aren't going to run any tests. Explicitly mark all of the tests as
+        // skipped to make this obvious in the output.
+        for (int test_index = 0; test_index < total_test_suite_count();
+             test_index++) {
+          GetMutableSuiteCase(test_index)->Skip();
         }
       }
 
-      // Tears down all environments in reverse order afterwards.
-      repeater->OnEnvironmentsTearDownStart(*parent_);
-      std::for_each(environments_.rbegin(), environments_.rend(),
-                    TearDownEnvironment);
-      repeater->OnEnvironmentsTearDownEnd(*parent_);
+      // Tears down all environments in reverse order afterwards. If test
+      // environments aren't recreated for each iteration, only do so on the
+      // last iteration.
+      if (i == repeat - 1 || recreate_environments_when_repeating) {
+        repeater->OnEnvironmentsTearDownStart(*parent_);
+        std::for_each(environments_.rbegin(), environments_.rend(),
+                      TearDownEnvironment);
+        repeater->OnEnvironmentsTearDownEnd(*parent_);
+      }
     }
 
-    elapsed_time_ = GetTimeInMillis() - start;
+    elapsed_time_ = timer.Elapsed();
 
     // Tells the unit test event listener that the tests have just finished.
     repeater->OnTestIterationEnd(*parent_, i);
@@ -5364,7 +5915,7 @@ bool UnitTestImpl::RunAllTests() {
     // (it's always safe to unshuffle the tests).
     UnshuffleTests();
 
-    if (GTEST_FLAG(shuffle)) {
+    if (GTEST_FLAG_GET(shuffle)) {
       // Picks a new random seed for each iteration.
       random_seed_ = GetNextRandomSeed(random_seed_);
     }
@@ -5374,14 +5925,14 @@ bool UnitTestImpl::RunAllTests() {
 
   if (!gtest_is_initialized_before_run_all_tests) {
     ColoredPrintf(
-        COLOR_RED,
+        GTestColor::kRed,
         "\nIMPORTANT NOTICE - DO NOT IGNORE:\n"
         "This test program did NOT call " GTEST_INIT_GOOGLE_TEST_NAME_
         "() before calling RUN_ALL_TESTS(). This is INVALID. Soon " GTEST_NAME_
         " will start to enforce the valid usage. "
         "Please fix it ASAP, or IT WILL START TO FAIL.\n");  // NOLINT
 #if GTEST_FOR_GOOGLE_
-    ColoredPrintf(COLOR_RED,
+    ColoredPrintf(GTestColor::kRed,
                   "For more details, see http://wiki/Main/ValidGUnitMain.\n");
 #endif  // GTEST_FOR_GOOGLE_
   }
@@ -5398,7 +5949,7 @@ void WriteToShardStatusFileIfNeeded() {
   if (test_shard_file != nullptr) {
     FILE* const file = posix::FOpen(test_shard_file, "w");
     if (file == nullptr) {
-      ColoredPrintf(COLOR_RED,
+      ColoredPrintf(GTestColor::kRed,
                     "Could not write to the test shard status file \"%s\" "
                     "specified by the %s environment variable.\n",
                     test_shard_file, kTestShardStatusFile);
@@ -5415,41 +5966,40 @@ void WriteToShardStatusFileIfNeeded() {
 // an error and exits. If in_subprocess_for_death_test, sharding is
 // disabled because it must only be applied to the original test
 // process. Otherwise, we could filter out death tests we intended to execute.
-bool ShouldShard(const char* total_shards_env,
-                 const char* shard_index_env,
+bool ShouldShard(const char* total_shards_env, const char* shard_index_env,
                  bool in_subprocess_for_death_test) {
   if (in_subprocess_for_death_test) {
     return false;
   }
 
-  const Int32 total_shards = Int32FromEnvOrDie(total_shards_env, -1);
-  const Int32 shard_index = Int32FromEnvOrDie(shard_index_env, -1);
+  const int32_t total_shards = Int32FromEnvOrDie(total_shards_env, -1);
+  const int32_t shard_index = Int32FromEnvOrDie(shard_index_env, -1);
 
   if (total_shards == -1 && shard_index == -1) {
     return false;
   } else if (total_shards == -1 && shard_index != -1) {
-    const Message msg = Message()
-      << "Invalid environment variables: you have "
-      << kTestShardIndex << " = " << shard_index
-      << ", but have left " << kTestTotalShards << " unset.\n";
-    ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str());
+    const Message msg = Message() << "Invalid environment variables: you have "
+                                  << kTestShardIndex << " = " << shard_index
+                                  << ", but have left " << kTestTotalShards
+                                  << " unset.\n";
+    ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str());
     fflush(stdout);
     exit(EXIT_FAILURE);
   } else if (total_shards != -1 && shard_index == -1) {
     const Message msg = Message()
-      << "Invalid environment variables: you have "
-      << kTestTotalShards << " = " << total_shards
-      << ", but have left " << kTestShardIndex << " unset.\n";
-    ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str());
+                        << "Invalid environment variables: you have "
+                        << kTestTotalShards << " = " << total_shards
+                        << ", but have left " << kTestShardIndex << " unset.\n";
+    ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str());
     fflush(stdout);
     exit(EXIT_FAILURE);
   } else if (shard_index < 0 || shard_index >= total_shards) {
-    const Message msg = Message()
-      << "Invalid environment variables: we require 0 <= "
-      << kTestShardIndex << " < " << kTestTotalShards
-      << ", but you have " << kTestShardIndex << "=" << shard_index
-      << ", " << kTestTotalShards << "=" << total_shards << ".\n";
-    ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str());
+    const Message msg =
+        Message() << "Invalid environment variables: we require 0 <= "
+                  << kTestShardIndex << " < " << kTestTotalShards
+                  << ", but you have " << kTestShardIndex << "=" << shard_index
+                  << ", " << kTestTotalShards << "=" << total_shards << ".\n";
+    ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str());
     fflush(stdout);
     exit(EXIT_FAILURE);
   }
@@ -5460,13 +6010,13 @@ bool ShouldShard(const char* total_shards_env,
 // Parses the environment variable var as an Int32. If it is unset,
 // returns default_val. If it is not an Int32, prints an error
 // and aborts.
-Int32 Int32FromEnvOrDie(const char* var, Int32 default_val) {
+int32_t Int32FromEnvOrDie(const char* var, int32_t default_val) {
   const char* str_val = posix::GetEnv(var);
   if (str_val == nullptr) {
     return default_val;
   }
 
-  Int32 result;
+  int32_t result;
   if (!ParseInt32(Message() << "The value of environment variable " << var,
                   str_val, &result)) {
     exit(EXIT_FAILURE);
@@ -5490,11 +6040,16 @@ bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) {
 // https://github.com/google/googletest/blob/master/googletest/docs/advanced.md
 // . Returns the number of tests that should run.
 int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) {
-  const Int32 total_shards = shard_tests == HONOR_SHARDING_PROTOCOL ?
-      Int32FromEnvOrDie(kTestTotalShards, -1) : -1;
-  const Int32 shard_index = shard_tests == HONOR_SHARDING_PROTOCOL ?
-      Int32FromEnvOrDie(kTestShardIndex, -1) : -1;
-
+  const int32_t total_shards = shard_tests == HONOR_SHARDING_PROTOCOL
+                                   ? Int32FromEnvOrDie(kTestTotalShards, -1)
+                                   : -1;
+  const int32_t shard_index = shard_tests == HONOR_SHARDING_PROTOCOL
+                                  ? Int32FromEnvOrDie(kTestShardIndex, -1)
+                                  : -1;
+
+  const PositiveAndNegativeUnitTestFilter gtest_flag_filter(
+      GTEST_FLAG_GET(filter));
+  const UnitTestFilter disable_test_filter(kDisableTestFilter);
   // num_runnable_tests are the number of tests that will
   // run across all shards (i.e., match filter and are not disabled).
   // num_selected_tests are the number of tests to be run on
@@ -5510,18 +6065,17 @@ int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) {
       const std::string test_name(test_info->name());
       // A test is disabled if test suite name or test name matches
       // kDisableTestFilter.
-      const bool is_disabled = internal::UnitTestOptions::MatchesFilter(
-                                   test_suite_name, kDisableTestFilter) ||
-                               internal::UnitTestOptions::MatchesFilter(
-                                   test_name, kDisableTestFilter);
+      const bool is_disabled =
+          disable_test_filter.MatchesName(test_suite_name) ||
+          disable_test_filter.MatchesName(test_name);
       test_info->is_disabled_ = is_disabled;
 
-      const bool matches_filter = internal::UnitTestOptions::FilterMatchesTest(
-          test_suite_name, test_name);
+      const bool matches_filter =
+          gtest_flag_filter.MatchesTest(test_suite_name, test_name);
       test_info->matches_filter_ = matches_filter;
 
       const bool is_runnable =
-          (GTEST_FLAG(also_run_disabled_tests) || !is_disabled) &&
+          (GTEST_FLAG_GET(also_run_disabled_tests) || !is_disabled) &&
           matches_filter;
 
       const bool is_in_another_shard =
@@ -5690,8 +6244,8 @@ void UnitTestImpl::UnshuffleTests() {
 // For example, if Foo() calls Bar(), which in turn calls
 // GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
 // the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
-std::string GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/,
-                                            int skip_count) {
+GTEST_NO_INLINE_ GTEST_NO_TAIL_CALL_ std::string
+GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/, int skip_count) {
   // We pass skip_count + 1 to skip this wrapper function in addition
   // to what the user really wants to skip.
   return GetUnitTestImpl()->CurrentOsStackTraceExceptTop(skip_count + 1);
@@ -5701,7 +6255,7 @@ std::string GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/,
 // suppress unreachable code warnings.
 namespace {
 class ClassUniqueToAlwaysTrue {};
-}
+}  // namespace
 
 bool IsTrue(bool condition) { return condition; }
 
@@ -5709,8 +6263,7 @@ bool AlwaysTrue() {
 #if GTEST_HAS_EXCEPTIONS
   // This condition is always false so AlwaysTrue() never actually throws,
   // but it makes the compiler think that it may throw.
-  if (IsTrue(false))
-    throw ClassUniqueToAlwaysTrue();
+  if (IsTrue(false)) throw ClassUniqueToAlwaysTrue();
 #endif  // GTEST_HAS_EXCEPTIONS
   return true;
 }
@@ -5732,13 +6285,14 @@ bool SkipPrefix(const char* prefix, const char** pstr) {
 // part can be omitted.
 //
 // Returns the value of the flag, or NULL if the parsing failed.
-static const char* ParseFlagValue(const char* str, const char* flag,
+static const char* ParseFlagValue(const char* str, const char* flag_name,
                                   bool def_optional) {
   // str and flag must not be NULL.
-  if (str == nullptr || flag == nullptr) return nullptr;
+  if (str == nullptr || flag_name == nullptr) return nullptr;
 
   // The flag must start with "--" followed by GTEST_FLAG_PREFIX_.
-  const std::string flag_str = std::string("--") + GTEST_FLAG_PREFIX_ + flag;
+  const std::string flag_str =
+      std::string("--") + GTEST_FLAG_PREFIX_ + flag_name;
   const size_t flag_len = flag_str.length();
   if (strncmp(str, flag_str.c_str(), flag_len) != 0) return nullptr;
 
@@ -5769,9 +6323,9 @@ static const char* ParseFlagValue(const char* str, const char* flag,
 //
 // On success, stores the value of the flag in *value, and returns
 // true.  On failure, returns false without changing *value.
-static bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
+static bool ParseFlag(const char* str, const char* flag_name, bool* value) {
   // Gets the value of the flag as a string.
-  const char* const value_str = ParseFlagValue(str, flag, true);
+  const char* const value_str = ParseFlagValue(str, flag_name, true);
 
   // Aborts if the parsing failed.
   if (value_str == nullptr) return false;
@@ -5781,32 +6335,30 @@ static bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
   return true;
 }
 
-// Parses a string for an Int32 flag, in the form of
-// "--flag=value".
+// Parses a string for an int32_t flag, in the form of "--flag=value".
 //
 // On success, stores the value of the flag in *value, and returns
 // true.  On failure, returns false without changing *value.
-bool ParseInt32Flag(const char* str, const char* flag, Int32* value) {
+bool ParseFlag(const char* str, const char* flag_name, int32_t* value) {
   // Gets the value of the flag as a string.
-  const char* const value_str = ParseFlagValue(str, flag, false);
+  const char* const value_str = ParseFlagValue(str, flag_name, false);
 
   // Aborts if the parsing failed.
   if (value_str == nullptr) return false;
 
   // Sets *value to the value of the flag.
-  return ParseInt32(Message() << "The value of flag --" << flag,
-                    value_str, value);
+  return ParseInt32(Message() << "The value of flag --" << flag_name, value_str,
+                    value);
 }
 
-// Parses a string for a string flag, in the form of
-// "--flag=value".
+// Parses a string for a string flag, in the form of "--flag=value".
 //
 // On success, stores the value of the flag in *value, and returns
 // true.  On failure, returns false without changing *value.
 template <typename String>
-static bool ParseStringFlag(const char* str, const char* flag, String* value) {
+static bool ParseFlag(const char* str, const char* flag_name, String* value) {
   // Gets the value of the flag as a string.
-  const char* const value_str = ParseFlagValue(str, flag, false);
+  const char* const value_str = ParseFlagValue(str, flag_name, false);
 
   // Aborts if the parsing failed.
   if (value_str == nullptr) return false;
@@ -5823,8 +6375,7 @@ static bool ParseStringFlag(const char* str, const char* flag, String* value) {
 // GTEST_INTERNAL_PREFIX_ followed by "internal_" are considered Google Test
 // internal flags and do not trigger the help message.
 static bool HasGoogleTestFlagPrefix(const char* str) {
-  return (SkipPrefix("--", &str) ||
-          SkipPrefix("-", &str) ||
+  return (SkipPrefix("--", &str) || SkipPrefix("-", &str) ||
           SkipPrefix("/", &str)) &&
          !SkipPrefix(GTEST_FLAG_PREFIX_ "internal_", &str) &&
          (SkipPrefix(GTEST_FLAG_PREFIX_, &str) ||
@@ -5841,7 +6392,7 @@ static bool HasGoogleTestFlagPrefix(const char* str) {
 //   @D    changes to the default terminal text color.
 //
 static void PrintColorEncoded(const char* str) {
-  GTestColor color = COLOR_DEFAULT;  // The current color.
+  GTestColor color = GTestColor::kDefault;  // The current color.
 
   // Conceptually, we split the string into segments divided by escape
   // sequences.  Then we print one segment at a time.  At the end of
@@ -5861,13 +6412,13 @@ static void PrintColorEncoded(const char* str) {
     if (ch == '@') {
       ColoredPrintf(color, "@");
     } else if (ch == 'D') {
-      color = COLOR_DEFAULT;
+      color = GTestColor::kDefault;
     } else if (ch == 'R') {
-      color = COLOR_RED;
+      color = GTestColor::kRed;
     } else if (ch == 'G') {
-      color = COLOR_GREEN;
+      color = GTestColor::kGreen;
     } else if (ch == 'Y') {
-      color = COLOR_YELLOW;
+      color = GTestColor::kYellow;
     } else {
       --str;
     }
@@ -5875,105 +6426,140 @@ static void PrintColorEncoded(const char* str) {
 }
 
 static const char kColorEncodedHelpMessage[] =
-"This program contains tests written using " GTEST_NAME_ ". You can use the\n"
-"following command line flags to control its behavior:\n"
-"\n"
-"Test Selection:\n"
-"  @G--" GTEST_FLAG_PREFIX_ "list_tests@D\n"
-"      List the names of all tests instead of running them. The name of\n"
-"      TEST(Foo, Bar) is \"Foo.Bar\".\n"
-"  @G--" GTEST_FLAG_PREFIX_ "filter=@YPOSTIVE_PATTERNS"
+    "This program contains tests written using " GTEST_NAME_
+    ". You can use the\n"
+    "following command line flags to control its behavior:\n"
+    "\n"
+    "Test Selection:\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "list_tests@D\n"
+    "      List the names of all tests instead of running them. The name of\n"
+    "      TEST(Foo, Bar) is \"Foo.Bar\".\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "filter=@YPOSITIVE_PATTERNS"
     "[@G-@YNEGATIVE_PATTERNS]@D\n"
-"      Run only the tests whose name matches one of the positive patterns but\n"
-"      none of the negative patterns. '?' matches any single character; '*'\n"
-"      matches any substring; ':' separates two patterns.\n"
-"  @G--" GTEST_FLAG_PREFIX_ "also_run_disabled_tests@D\n"
-"      Run all disabled tests too.\n"
-"\n"
-"Test Execution:\n"
-"  @G--" GTEST_FLAG_PREFIX_ "repeat=@Y[COUNT]@D\n"
-"      Run the tests repeatedly; use a negative count to repeat forever.\n"
-"  @G--" GTEST_FLAG_PREFIX_ "shuffle@D\n"
-"      Randomize tests' orders on every iteration.\n"
-"  @G--" GTEST_FLAG_PREFIX_ "random_seed=@Y[NUMBER]@D\n"
-"      Random number seed to use for shuffling test orders (between 1 and\n"
-"      99999, or 0 to use a seed based on the current time).\n"
-"\n"
-"Test Output:\n"
-"  @G--" GTEST_FLAG_PREFIX_ "color=@Y(@Gyes@Y|@Gno@Y|@Gauto@Y)@D\n"
-"      Enable/disable colored output. The default is @Gauto@D.\n"
-"  -@G-" GTEST_FLAG_PREFIX_ "print_time=0@D\n"
-"      Don't print the elapsed time of each test.\n"
-"  @G--" GTEST_FLAG_PREFIX_ "output=@Y(@Gjson@Y|@Gxml@Y)[@G:@YDIRECTORY_PATH@G"
-    GTEST_PATH_SEP_ "@Y|@G:@YFILE_PATH]@D\n"
-"      Generate a JSON or XML report in the given directory or with the given\n"
-"      file name. @YFILE_PATH@D defaults to @Gtest_detail.xml@D.\n"
-# if GTEST_CAN_STREAM_RESULTS_
-"  @G--" GTEST_FLAG_PREFIX_ "stream_result_to=@YHOST@G:@YPORT@D\n"
-"      Stream test results to the given server.\n"
-# endif  // GTEST_CAN_STREAM_RESULTS_
-"\n"
-"Assertion Behavior:\n"
-# if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
-"  @G--" GTEST_FLAG_PREFIX_ "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n"
-"      Set the default death test style.\n"
-# endif  // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
-"  @G--" GTEST_FLAG_PREFIX_ "break_on_failure@D\n"
-"      Turn assertion failures into debugger break-points.\n"
-"  @G--" GTEST_FLAG_PREFIX_ "throw_on_failure@D\n"
-"      Turn assertion failures into C++ exceptions for use by an external\n"
-"      test framework.\n"
-"  @G--" GTEST_FLAG_PREFIX_ "catch_exceptions=0@D\n"
-"      Do not report exceptions as test failures. Instead, allow them\n"
-"      to crash the program or throw a pop-up (on Windows).\n"
-"\n"
-"Except for @G--" GTEST_FLAG_PREFIX_ "list_tests@D, you can alternatively set "
+    "      Run only the tests whose name matches one of the positive patterns "
+    "but\n"
+    "      none of the negative patterns. '?' matches any single character; "
+    "'*'\n"
+    "      matches any substring; ':' separates two patterns.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "also_run_disabled_tests@D\n"
+    "      Run all disabled tests too.\n"
+    "\n"
+    "Test Execution:\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "repeat=@Y[COUNT]@D\n"
+    "      Run the tests repeatedly; use a negative count to repeat forever.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "shuffle@D\n"
+    "      Randomize tests' orders on every iteration.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "random_seed=@Y[NUMBER]@D\n"
+    "      Random number seed to use for shuffling test orders (between 1 and\n"
+    "      99999, or 0 to use a seed based on the current time).\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "recreate_environments_when_repeating@D\n"
+    "      Sets up and tears down the global test environment on each repeat\n"
+    "      of the test.\n"
+    "\n"
+    "Test Output:\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "color=@Y(@Gyes@Y|@Gno@Y|@Gauto@Y)@D\n"
+    "      Enable/disable colored output. The default is @Gauto@D.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "brief=1@D\n"
+    "      Only print test failures.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "print_time=0@D\n"
+    "      Don't print the elapsed time of each test.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "output=@Y(@Gjson@Y|@Gxml@Y)[@G:@YDIRECTORY_PATH@G" GTEST_PATH_SEP_
+    "@Y|@G:@YFILE_PATH]@D\n"
+    "      Generate a JSON or XML report in the given directory or with the "
+    "given\n"
+    "      file name. @YFILE_PATH@D defaults to @Gtest_detail.xml@D.\n"
+#if GTEST_CAN_STREAM_RESULTS_
+    "  @G--" GTEST_FLAG_PREFIX_
+    "stream_result_to=@YHOST@G:@YPORT@D\n"
+    "      Stream test results to the given server.\n"
+#endif  // GTEST_CAN_STREAM_RESULTS_
+    "\n"
+    "Assertion Behavior:\n"
+#if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+    "  @G--" GTEST_FLAG_PREFIX_
+    "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n"
+    "      Set the default death test style.\n"
+#endif  // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+    "  @G--" GTEST_FLAG_PREFIX_
+    "break_on_failure@D\n"
+    "      Turn assertion failures into debugger break-points.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "throw_on_failure@D\n"
+    "      Turn assertion failures into C++ exceptions for use by an external\n"
+    "      test framework.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "catch_exceptions=0@D\n"
+    "      Do not report exceptions as test failures. Instead, allow them\n"
+    "      to crash the program or throw a pop-up (on Windows).\n"
+    "\n"
+    "Except for @G--" GTEST_FLAG_PREFIX_
+    "list_tests@D, you can alternatively set "
     "the corresponding\n"
-"environment variable of a flag (all letters in upper-case). For example, to\n"
-"disable colored text output, you can either specify @G--" GTEST_FLAG_PREFIX_
+    "environment variable of a flag (all letters in upper-case). For example, "
+    "to\n"
+    "disable colored text output, you can either specify "
+    "@G--" GTEST_FLAG_PREFIX_
     "color=no@D or set\n"
-"the @G" GTEST_FLAG_PREFIX_UPPER_ "COLOR@D environment variable to @Gno@D.\n"
-"\n"
-"For more information, please read the " GTEST_NAME_ " documentation at\n"
-"@G" GTEST_PROJECT_URL_ "@D. If you find a bug in " GTEST_NAME_ "\n"
-"(not one in your own code or tests), please report it to\n"
-"@G<" GTEST_DEV_EMAIL_ ">@D.\n";
+    "the @G" GTEST_FLAG_PREFIX_UPPER_
+    "COLOR@D environment variable to @Gno@D.\n"
+    "\n"
+    "For more information, please read the " GTEST_NAME_
+    " documentation at\n"
+    "@G" GTEST_PROJECT_URL_ "@D. If you find a bug in " GTEST_NAME_
+    "\n"
+    "(not one in your own code or tests), please report it to\n"
+    "@G<" GTEST_DEV_EMAIL_ ">@D.\n";
 
 static bool ParseGoogleTestFlag(const char* const arg) {
-  return ParseBoolFlag(arg, kAlsoRunDisabledTestsFlag,
-                       &GTEST_FLAG(also_run_disabled_tests)) ||
-      ParseBoolFlag(arg, kBreakOnFailureFlag,
-                    &GTEST_FLAG(break_on_failure)) ||
-      ParseBoolFlag(arg, kCatchExceptionsFlag,
-                    &GTEST_FLAG(catch_exceptions)) ||
-      ParseStringFlag(arg, kColorFlag, &GTEST_FLAG(color)) ||
-      ParseStringFlag(arg, kDeathTestStyleFlag,
-                      &GTEST_FLAG(death_test_style)) ||
-      ParseBoolFlag(arg, kDeathTestUseFork,
-                    &GTEST_FLAG(death_test_use_fork)) ||
-      ParseStringFlag(arg, kFilterFlag, &GTEST_FLAG(filter)) ||
-      ParseStringFlag(arg, kInternalRunDeathTestFlag,
-                      &GTEST_FLAG(internal_run_death_test)) ||
-      ParseBoolFlag(arg, kListTestsFlag, &GTEST_FLAG(list_tests)) ||
-      ParseStringFlag(arg, kOutputFlag, &GTEST_FLAG(output)) ||
-      ParseBoolFlag(arg, kPrintTimeFlag, &GTEST_FLAG(print_time)) ||
-      ParseBoolFlag(arg, kPrintUTF8Flag, &GTEST_FLAG(print_utf8)) ||
-      ParseInt32Flag(arg, kRandomSeedFlag, &GTEST_FLAG(random_seed)) ||
-      ParseInt32Flag(arg, kRepeatFlag, &GTEST_FLAG(repeat)) ||
-      ParseBoolFlag(arg, kShuffleFlag, &GTEST_FLAG(shuffle)) ||
-      ParseInt32Flag(arg, kStackTraceDepthFlag,
-                     &GTEST_FLAG(stack_trace_depth)) ||
-      ParseStringFlag(arg, kStreamResultToFlag,
-                      &GTEST_FLAG(stream_result_to)) ||
-      ParseBoolFlag(arg, kThrowOnFailureFlag,
-                    &GTEST_FLAG(throw_on_failure));
+#define GTEST_INTERNAL_PARSE_FLAG(flag_name)  \
+  do {                                        \
+    auto value = GTEST_FLAG_GET(flag_name);   \
+    if (ParseFlag(arg, #flag_name, &value)) { \
+      GTEST_FLAG_SET(flag_name, value);       \
+      return true;                            \
+    }                                         \
+  } while (false)
+
+  GTEST_INTERNAL_PARSE_FLAG(also_run_disabled_tests);
+  GTEST_INTERNAL_PARSE_FLAG(break_on_failure);
+  GTEST_INTERNAL_PARSE_FLAG(catch_exceptions);
+  GTEST_INTERNAL_PARSE_FLAG(color);
+  GTEST_INTERNAL_PARSE_FLAG(death_test_style);
+  GTEST_INTERNAL_PARSE_FLAG(death_test_use_fork);
+  GTEST_INTERNAL_PARSE_FLAG(fail_fast);
+  GTEST_INTERNAL_PARSE_FLAG(filter);
+  GTEST_INTERNAL_PARSE_FLAG(internal_run_death_test);
+  GTEST_INTERNAL_PARSE_FLAG(list_tests);
+  GTEST_INTERNAL_PARSE_FLAG(output);
+  GTEST_INTERNAL_PARSE_FLAG(brief);
+  GTEST_INTERNAL_PARSE_FLAG(print_time);
+  GTEST_INTERNAL_PARSE_FLAG(print_utf8);
+  GTEST_INTERNAL_PARSE_FLAG(random_seed);
+  GTEST_INTERNAL_PARSE_FLAG(repeat);
+  GTEST_INTERNAL_PARSE_FLAG(recreate_environments_when_repeating);
+  GTEST_INTERNAL_PARSE_FLAG(shuffle);
+  GTEST_INTERNAL_PARSE_FLAG(stack_trace_depth);
+  GTEST_INTERNAL_PARSE_FLAG(stream_result_to);
+  GTEST_INTERNAL_PARSE_FLAG(throw_on_failure);
+  return false;
 }
 
 #if GTEST_USE_OWN_FLAGFILE_FLAG_
 static void LoadFlagsFromFile(const std::string& path) {
   FILE* flagfile = posix::FOpen(path.c_str(), "r");
   if (!flagfile) {
-    GTEST_LOG_(FATAL) << "Unable to open file \"" << GTEST_FLAG(flagfile)
+    GTEST_LOG_(FATAL) << "Unable to open file \"" << GTEST_FLAG_GET(flagfile)
                       << "\"";
   }
   std::string contents(ReadEntireFile(flagfile));
@@ -5981,10 +6567,8 @@ static void LoadFlagsFromFile(const std::string& path) {
   std::vector<std::string> lines;
   SplitString(contents, '\n', &lines);
   for (size_t i = 0; i < lines.size(); ++i) {
-    if (lines[i].empty())
-      continue;
-    if (!ParseGoogleTestFlag(lines[i].c_str()))
-      g_help_flag = true;
+    if (lines[i].empty()) continue;
+    if (!ParseGoogleTestFlag(lines[i].c_str())) g_help_flag = true;
   }
 }
 #endif  // GTEST_USE_OWN_FLAGFILE_FLAG_
@@ -5994,25 +6578,23 @@ static void LoadFlagsFromFile(const std::string& path) {
 // instantiated to either char or wchar_t.
 template <typename CharType>
 void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) {
+  std::string flagfile_value;
   for (int i = 1; i < *argc; i++) {
     const std::string arg_string = StreamableToString(argv[i]);
     const char* const arg = arg_string.c_str();
 
-    using internal::ParseBoolFlag;
-    using internal::ParseInt32Flag;
-    using internal::ParseStringFlag;
+    using internal::ParseFlag;
 
     bool remove_flag = false;
     if (ParseGoogleTestFlag(arg)) {
       remove_flag = true;
 #if GTEST_USE_OWN_FLAGFILE_FLAG_
-    } else if (ParseStringFlag(arg, kFlagfileFlag, &GTEST_FLAG(flagfile))) {
-      LoadFlagsFromFile(GTEST_FLAG(flagfile));
+    } else if (ParseFlag(arg, "flagfile", &flagfile_value)) {
+      GTEST_FLAG_SET(flagfile, flagfile_value);
+      LoadFlagsFromFile(flagfile_value);
       remove_flag = true;
 #endif  // GTEST_USE_OWN_FLAGFILE_FLAG_
-    } else if (arg_string == "--help" || arg_string == "-h" ||
-               arg_string == "-?" || arg_string == "/?" ||
-               HasGoogleTestFlagPrefix(arg)) {
+    } else if (arg_string == "--help" || HasGoogleTestFlagPrefix(arg)) {
       // Both help flag and unrecognized Google Test flags (excluding
       // internal ones) trigger help display.
       g_help_flag = true;
@@ -6047,7 +6629,27 @@ void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) {
 // Parses the command line for Google Test flags, without initializing
 // other parts of Google Test.
 void ParseGoogleTestFlagsOnly(int* argc, char** argv) {
+#if GTEST_HAS_ABSL
+  if (*argc > 0) {
+    // absl::ParseCommandLine() requires *argc > 0.
+    auto positional_args = absl::flags_internal::ParseCommandLineImpl(
+        *argc, argv, absl::flags_internal::ArgvListAction::kRemoveParsedArgs,
+        absl::flags_internal::UsageFlagsAction::kHandleUsage,
+        absl::flags_internal::OnUndefinedFlag::kReportUndefined);
+    // Any command-line positional arguments not part of any command-line flag
+    // (or arguments to a flag) are copied back out to argv, with the program
+    // invocation name at position 0, and argc is resized. This includes
+    // positional arguments after the flag-terminating delimiter '--'.
+    // See https://abseil.io/docs/cpp/guides/flags.
+    std::copy(positional_args.begin(), positional_args.end(), argv);
+    if (static_cast<int>(positional_args.size()) < *argc) {
+      argv[positional_args.size()] = nullptr;
+      *argc = static_cast<int>(positional_args.size());
+    }
+  }
+#else
   ParseGoogleTestFlagsOnlyImpl(argc, argv);
+#endif
 
   // Fix the value of *_NSGetArgc() on macOS, but if and only if
   // *_NSGetArgv() == argv
@@ -6082,6 +6684,12 @@ void InitGoogleTestImpl(int* argc, CharType** argv) {
 
 #if GTEST_HAS_ABSL
   absl::InitializeSymbolizer(g_argvs[0].c_str());
+
+  // When using the Abseil Flags library, set the program usage message to the
+  // help message, but remove the color-encoding from the message first.
+  absl::SetProgramUsageMessage(absl::StrReplaceAll(
+      kColorEncodedHelpMessage,
+      {{"@D", ""}, {"@R", ""}, {"@G", ""}, {"@Y", ""}, {"@@", "@"}}));
 #endif  // GTEST_HAS_ABSL
 
   ParseGoogleTestFlagsOnly(argc, argv);
@@ -6102,7 +6710,7 @@ void InitGoogleTestImpl(int* argc, CharType** argv) {
 void InitGoogleTest(int* argc, char** argv) {
 #if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
   GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv);
-#else  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+#else   // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
   internal::InitGoogleTestImpl(argc, argv);
 #endif  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
 }
@@ -6112,7 +6720,7 @@ void InitGoogleTest(int* argc, char** argv) {
 void InitGoogleTest(int* argc, wchar_t** argv) {
 #if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
   GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv);
-#else  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+#else   // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
   internal::InitGoogleTestImpl(argc, argv);
 #endif  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
 }
@@ -6128,31 +6736,42 @@ void InitGoogleTest() {
 
 #if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
   GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(&argc, argv);
-#else  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+#else   // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
   internal::InitGoogleTestImpl(&argc, argv);
 #endif  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
 }
 
+#if !defined(GTEST_CUSTOM_TEMPDIR_FUNCTION_)
+// Return value of first environment variable that is set and contains
+// a non-empty string. If there are none, return the "fallback" string.
+// Since we like the temporary directory to have a directory separator suffix,
+// add it if not provided in the environment variable value.
+static std::string GetTempDirFromEnv(
+    std::initializer_list<const char*> environment_variables,
+    const char* fallback, char separator) {
+  for (const char* variable_name : environment_variables) {
+    const char* value = internal::posix::GetEnv(variable_name);
+    if (value != nullptr && value[0] != '\0') {
+      if (value[strlen(value) - 1] != separator) {
+        return std::string(value).append(1, separator);
+      }
+      return value;
+    }
+  }
+  return fallback;
+}
+#endif
+
 std::string TempDir() {
 #if defined(GTEST_CUSTOM_TEMPDIR_FUNCTION_)
   return GTEST_CUSTOM_TEMPDIR_FUNCTION_();
-#endif
-
-#if GTEST_OS_WINDOWS_MOBILE
-  return "\\temp\\";
-#elif GTEST_OS_WINDOWS
-  const char* temp_dir = internal::posix::GetEnv("TEMP");
-  if (temp_dir == nullptr || temp_dir[0] == '\0')
-    return "\\temp\\";
-  else if (temp_dir[strlen(temp_dir) - 1] == '\\')
-    return temp_dir;
-  else
-    return std::string(temp_dir) + "\\";
+#elif GTEST_OS_WINDOWS || GTEST_OS_WINDOWS_MOBILE
+  return GetTempDirFromEnv({"TEST_TMPDIR", "TEMP"}, "\\temp\\", '\\');
 #elif GTEST_OS_LINUX_ANDROID
-  return "/sdcard/";
+  return GetTempDirFromEnv({"TEST_TMPDIR", "TMPDIR"}, "/data/local/tmp/", '/');
 #else
-  return "/tmp/";
-#endif  // GTEST_OS_WINDOWS_MOBILE
+  return GetTempDirFromEnv({"TEST_TMPDIR", "TMPDIR"}, "/tmp/", '/');
+#endif
 }
 
 // Class ScopedTrace
@@ -6169,8 +6788,7 @@ void ScopedTrace::PushTrace(const char* file, int line, std::string message) {
 }
 
 // Pops the info pushed by the c'tor.
-ScopedTrace::~ScopedTrace()
-    GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) {
+ScopedTrace::~ScopedTrace() GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) {
   UnitTest::GetInstance()->PopGTestTrace();
 }
 
diff --git a/third_party/googletest/src/src/gtest_main.cc b/third_party/googletest/src/src/gtest_main.cc
index f6e1dd96fb..44976375c9 100644
--- a/third_party/googletest/src/src/gtest_main.cc
+++ b/third_party/googletest/src/src/gtest_main.cc
@@ -28,15 +28,21 @@
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <cstdio>
+
 #include "gtest/gtest.h"
 
-#ifdef ARDUINO
-void setup() {
-  testing::InitGoogleTest();
-}
+#if GTEST_OS_ESP8266 || GTEST_OS_ESP32
+#if GTEST_OS_ESP8266
+extern "C" {
+#endif
+void setup() { testing::InitGoogleTest(); }
 
 void loop() { RUN_ALL_TESTS(); }
 
+#if GTEST_OS_ESP8266
+}
+#endif
+
 #else
 
 GTEST_API_ int main(int argc, char **argv) {
diff --git a/third_party/libwebm/AUTHORS.TXT b/third_party/libwebm/AUTHORS.TXT
index 9686ac13eb..59b648ca68 100644
--- a/third_party/libwebm/AUTHORS.TXT
+++ b/third_party/libwebm/AUTHORS.TXT
@@ -2,3 +2,4 @@
 # Name or Organization <email address>
 
 Google Inc.
+Elijah Cirioli <eli.cirioli@gmail.com>
diff --git a/third_party/libwebm/Android.mk b/third_party/libwebm/Android.mk
index b46ba101d4..b02795ccae 100644
--- a/third_party/libwebm/Android.mk
+++ b/third_party/libwebm/Android.mk
@@ -1,9 +1,11 @@
+# Ignore this file during non-NDK builds.
+ifdef NDK_ROOT
 LOCAL_PATH:= $(call my-dir)
 
 include $(CLEAR_VARS)
 LOCAL_MODULE:= libwebm
 LOCAL_CPPFLAGS:=-D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS
-LOCAL_CPPFLAGS+=-D__STDC_LIMIT_MACROS -std=c++11
+LOCAL_CPPFLAGS+=-D__STDC_LIMIT_MACROS -std=gnu++11
 LOCAL_C_INCLUDES:= $(LOCAL_PATH)
 LOCAL_EXPORT_C_INCLUDES:= $(LOCAL_PATH)
 
@@ -14,4 +16,8 @@ LOCAL_SRC_FILES:= common/file_util.cc \
                   mkvmuxer/mkvmuxer.cc \
                   mkvmuxer/mkvmuxerutil.cc \
                   mkvmuxer/mkvwriter.cc
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-BSD
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_NOTICE_FILE := $(LOCAL_PATH)/LICENSE.TXT $(LOCAL_PATH)/PATENTS.TXT
 include $(BUILD_STATIC_LIBRARY)
+endif  # NDK_ROOT
diff --git a/third_party/libwebm/README.libvpx b/third_party/libwebm/README.libvpx
index 1e87afd3d1..a79b982ef4 100644
--- a/third_party/libwebm/README.libvpx
+++ b/third_party/libwebm/README.libvpx
@@ -1,7 +1,7 @@
 URL: https://chromium.googlesource.com/webm/libwebm
-Version: 37d9b860ebbf40cb0f6dcb7a6fef452d798062da
+Version: 1930e3ca23b007f3ff11d98a570077be6201957e
 License: BSD
-License File: LICENSE.txt
+License File: LICENSE.TXT
 
 Description:
 libwebm is used to handle WebM container I/O.
@@ -18,3 +18,4 @@ Only keep:
  - mkvmuxer/
  - mkvparser/
  - PATENTS.TXT
+- use -std=gnu++11 in Android.mk (https://crbug.com/webm/1708)
diff --git a/third_party/libwebm/mkvmuxer/mkvmuxer.cc b/third_party/libwebm/mkvmuxer/mkvmuxer.cc
index 5120312119..faaf0165f4 100644
--- a/third_party/libwebm/mkvmuxer/mkvmuxer.cc
+++ b/third_party/libwebm/mkvmuxer/mkvmuxer.cc
@@ -607,10 +607,10 @@ bool ContentEncoding::Write(IMkvWriter* writer) const {
   return true;
 }
 
-uint64_t ContentEncoding::EncodingSize(uint64_t compresion_size,
+uint64_t ContentEncoding::EncodingSize(uint64_t compression_size,
                                        uint64_t encryption_size) const {
   // TODO(fgalligan): Add support for compression settings.
-  if (compresion_size != 0)
+  if (compression_size != 0)
     return 0;
 
   uint64_t encoding_size = 0;
@@ -774,7 +774,7 @@ bool Track::Write(IMkvWriter* writer) const {
     return false;
 
   // AV1 tracks require a CodecPrivate. See
-  // https://github.com/Matroska-Org/matroska-specification/blob/av1-mappin/codec/av1.md
+  // https://github.com/ietf-wg-cellar/matroska-specification/blob/HEAD/codec/av1.md
   // TODO(tomfinegan): Update the above link to the AV1 Matroska mappings to
   // point to a stable version once it is finalized, or our own WebM mappings
   // page on webmproject.org should we decide to release them.
@@ -3084,6 +3084,7 @@ Segment::Segment()
       accurate_cluster_duration_(false),
       fixed_size_cluster_timecode_(false),
       estimate_file_duration_(false),
+      ebml_header_size_(0),
       payload_pos_(0),
       size_position_(0),
       doc_type_version_(kDefaultDocTypeVersion),
@@ -4105,12 +4106,16 @@ int Segment::WriteFramesAll() {
     // places where |doc_type_version_| needs to be updated.
     if (frame->discard_padding() != 0)
       doc_type_version_ = 4;
-    if (!cluster->AddFrame(frame))
-      return -1;
+    if (!cluster->AddFrame(frame)) {
+      delete frame;
+      continue;
+    }
 
     if (new_cuepoint_ && cues_track_ == frame->track_number()) {
-      if (!AddCuePoint(frame->timestamp(), cues_track_))
-        return -1;
+      if (!AddCuePoint(frame->timestamp(), cues_track_)) {
+        delete frame;
+        continue;
+      }
     }
 
     if (frame->timestamp() > last_timestamp_) {
@@ -4153,12 +4158,16 @@ bool Segment::WriteFramesLessThan(uint64_t timestamp) {
       const Frame* const frame_prev = frames_[i - 1];
       if (frame_prev->discard_padding() != 0)
         doc_type_version_ = 4;
-      if (!cluster->AddFrame(frame_prev))
-        return false;
+      if (!cluster->AddFrame(frame_prev)) {
+        delete frame_prev;
+        continue;
+      }
 
       if (new_cuepoint_ && cues_track_ == frame_prev->track_number()) {
-        if (!AddCuePoint(frame_prev->timestamp(), cues_track_))
-          return false;
+        if (!AddCuePoint(frame_prev->timestamp(), cues_track_)) {
+          delete frame_prev;
+          continue;
+        }
       }
 
       ++shift_left;
diff --git a/third_party/libwebm/mkvmuxer/mkvmuxer.h b/third_party/libwebm/mkvmuxer/mkvmuxer.h
index f2db377145..8602d82325 100644
--- a/third_party/libwebm/mkvmuxer/mkvmuxer.h
+++ b/third_party/libwebm/mkvmuxer/mkvmuxer.h
@@ -330,7 +330,7 @@ class ContentEncoding {
 
  private:
   // Returns the size in bytes for the encoding elements.
-  uint64_t EncodingSize(uint64_t compresion_size,
+  uint64_t EncodingSize(uint64_t compression_size,
                         uint64_t encryption_size) const;
 
   // Returns the size in bytes for the encryption elements.
@@ -1425,7 +1425,7 @@ class SeekHead {
   bool Write(IMkvWriter* writer);
 
   // We are going to put a cap on the number of Seek Entries.
-  const static int32_t kSeekEntryCount = 5;
+  constexpr static int32_t kSeekEntryCount = 5;
 
  private:
   // Returns the maximum size in bytes of one seek entry.
@@ -1505,8 +1505,8 @@ class Segment {
     kBeforeClusters = 0x1  // Position Cues before Clusters
   };
 
-  static const uint32_t kDefaultDocTypeVersion = 4;
-  static const uint64_t kDefaultMaxClusterDuration = 30000000000ULL;
+  static constexpr uint32_t kDefaultDocTypeVersion = 4;
+  static constexpr uint64_t kDefaultMaxClusterDuration = 30000000000ULL;
 
   Segment();
   ~Segment();
diff --git a/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc b/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
index 6436817c9b..300b155797 100644
--- a/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
+++ b/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
@@ -606,7 +606,7 @@ uint64 WriteVoidElement(IMkvWriter* writer, uint64 size) {
 
 void GetVersion(int32* major, int32* minor, int32* build, int32* revision) {
   *major = 0;
-  *minor = 2;
+  *minor = 3;
   *build = 1;
   *revision = 0;
 }
diff --git a/third_party/libwebm/mkvparser/mkvparser.cc b/third_party/libwebm/mkvparser/mkvparser.cc
index ace65bd595..868afcb3ed 100644
--- a/third_party/libwebm/mkvparser/mkvparser.cc
+++ b/third_party/libwebm/mkvparser/mkvparser.cc
@@ -54,9 +54,9 @@ Type* SafeArrayAlloc(unsigned long long num_elements,
 
 void GetVersion(int& major, int& minor, int& build, int& revision) {
   major = 1;
-  minor = 0;
-  build = 0;
-  revision = 30;
+  minor = 1;
+  build = 1;
+  revision = 0;
 }
 
 long long ReadUInt(IMkvReader* pReader, long long pos, long& len) {
@@ -298,7 +298,7 @@ long UnserializeInt(IMkvReader* pReader, long long pos, long long size,
   if (status < 0)
     return status;
 
-  unsigned long long result = first_byte;
+  unsigned long long result = static_cast<unsigned long long>(first_byte);
   ++pos;
 
   for (long i = 1; i < size; ++i) {
@@ -1502,8 +1502,8 @@ long SeekHead::Parse() {
 
   // first count the seek head entries
 
-  int entry_count = 0;
-  int void_element_count = 0;
+  long long entry_count = 0;
+  long long void_element_count = 0;
 
   while (pos < stop) {
     long long id, size;
@@ -1513,10 +1513,15 @@ long SeekHead::Parse() {
     if (status < 0)  // error
       return status;
 
-    if (id == libwebm::kMkvSeek)
+    if (id == libwebm::kMkvSeek) {
       ++entry_count;
-    else if (id == libwebm::kMkvVoid)
+      if (entry_count > INT_MAX)
+        return E_PARSE_FAILED;
+    } else if (id == libwebm::kMkvVoid) {
       ++void_element_count;
+      if (void_element_count > INT_MAX)
+        return E_PARSE_FAILED;
+    }
 
     pos += size;  // consume payload
 
@@ -1528,14 +1533,15 @@ long SeekHead::Parse() {
     return E_FILE_FORMAT_INVALID;
 
   if (entry_count > 0) {
-    m_entries = new (std::nothrow) Entry[entry_count];
+    m_entries = new (std::nothrow) Entry[static_cast<size_t>(entry_count)];
 
     if (m_entries == NULL)
       return -1;
   }
 
   if (void_element_count > 0) {
-    m_void_elements = new (std::nothrow) VoidElement[void_element_count];
+    m_void_elements =
+        new (std::nothrow) VoidElement[static_cast<size_t>(void_element_count)];
 
     if (m_void_elements == NULL)
       return -1;
@@ -1582,13 +1588,13 @@ long SeekHead::Parse() {
 
   ptrdiff_t count_ = ptrdiff_t(pEntry - m_entries);
   assert(count_ >= 0);
-  assert(count_ <= entry_count);
+  assert(static_cast<long long>(count_) <= entry_count);
 
   m_entry_count = static_cast<int>(count_);
 
   count_ = ptrdiff_t(pVoidElement - m_void_elements);
   assert(count_ >= 0);
-  assert(count_ <= void_element_count);
+  assert(static_cast<long long>(count_) <= void_element_count);
 
   m_void_element_count = static_cast<int>(count_);
 
@@ -2299,7 +2305,7 @@ bool CuePoint::Load(IMkvReader* pReader) {
   long long pos = pos_;
 
   // First count number of track positions
-
+  unsigned long long track_positions_count = 0;
   while (pos < stop) {
     long len;
 
@@ -2323,12 +2329,17 @@ bool CuePoint::Load(IMkvReader* pReader) {
     if (id == libwebm::kMkvCueTime)
       m_timecode = UnserializeUInt(pReader, pos, size);
 
-    else if (id == libwebm::kMkvCueTrackPositions)
-      ++m_track_positions_count;
+    else if (id == libwebm::kMkvCueTrackPositions) {
+      ++track_positions_count;
+      if (track_positions_count > UINT_MAX)
+        return E_PARSE_FAILED;
+    }
 
     pos += size;  // consume payload
   }
 
+  m_track_positions_count = static_cast<size_t>(track_positions_count);
+
   if (m_timecode < 0 || m_track_positions_count <= 0) {
     return false;
   }
@@ -2421,7 +2432,7 @@ bool CuePoint::TrackPosition::Parse(IMkvReader* pReader, long long start_,
     pos += size;  // consume payload
   }
 
-  if ((m_pos < 0) || (m_track <= 0)) {
+  if ((m_pos < 0) || (m_track <= 0) || (m_block < 0) || (m_block > LONG_MAX)) {
     return false;
   }
 
@@ -4194,8 +4205,8 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size,
   const long long stop = start + size;
 
   // Count ContentCompression and ContentEncryption elements.
-  int compression_count = 0;
-  int encryption_count = 0;
+  long long compression_count = 0;
+  long long encryption_count = 0;
 
   while (pos < stop) {
     long long id, size;
@@ -4203,11 +4214,17 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size,
     if (status < 0)  // error
       return status;
 
-    if (id == libwebm::kMkvContentCompression)
+    if (id == libwebm::kMkvContentCompression) {
       ++compression_count;
+      if (compression_count > INT_MAX)
+        return E_PARSE_FAILED;
+    }
 
-    if (id == libwebm::kMkvContentEncryption)
+    if (id == libwebm::kMkvContentEncryption) {
       ++encryption_count;
+      if (encryption_count > INT_MAX)
+        return E_PARSE_FAILED;
+    }
 
     pos += size;  // consume payload
     if (pos > stop)
@@ -4218,16 +4235,16 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size,
     return -1;
 
   if (compression_count > 0) {
-    compression_entries_ =
-        new (std::nothrow) ContentCompression*[compression_count];
+    compression_entries_ = new (std::nothrow)
+        ContentCompression*[static_cast<size_t>(compression_count)];
     if (!compression_entries_)
       return -1;
     compression_entries_end_ = compression_entries_;
   }
 
   if (encryption_count > 0) {
-    encryption_entries_ =
-        new (std::nothrow) ContentEncryption*[encryption_count];
+    encryption_entries_ = new (std::nothrow)
+        ContentEncryption*[static_cast<size_t>(encryption_count)];
     if (!encryption_entries_) {
       delete[] compression_entries_;
       compression_entries_ = NULL;
@@ -4918,7 +4935,7 @@ long Track::ParseContentEncodingsEntry(long long start, long long size) {
   const long long stop = start + size;
 
   // Count ContentEncoding elements.
-  int count = 0;
+  long long count = 0;
   while (pos < stop) {
     long long id, size;
     const long status = ParseElementHeader(pReader, pos, stop, id, size);
@@ -4926,8 +4943,11 @@ long Track::ParseContentEncodingsEntry(long long start, long long size) {
       return status;
 
     // pos now designates start of element
-    if (id == libwebm::kMkvContentEncoding)
+    if (id == libwebm::kMkvContentEncoding) {
       ++count;
+      if (count > INT_MAX)
+        return E_PARSE_FAILED;
+    }
 
     pos += size;  // consume payload
     if (pos > stop)
@@ -4937,7 +4957,8 @@ long Track::ParseContentEncodingsEntry(long long start, long long size) {
   if (count <= 0)
     return -1;
 
-  content_encoding_entries_ = new (std::nothrow) ContentEncoding*[count];
+  content_encoding_entries_ =
+      new (std::nothrow) ContentEncoding*[static_cast<size_t>(count)];
   if (!content_encoding_entries_)
     return -1;
 
@@ -5229,6 +5250,8 @@ bool Projection::Parse(IMkvReader* reader, long long start, long long size,
 
       projection_ptr->type = static_cast<ProjectionType>(projection_type);
     } else if (child_id == libwebm::kMkvProjectionPrivate) {
+      if (projection_ptr->private_data != NULL)
+        return false;
       unsigned char* data = SafeArrayAlloc<unsigned char>(1, child_size);
 
       if (data == NULL)
@@ -5286,6 +5309,7 @@ VideoTrack::VideoTrack(Segment* pSegment, long long element_start,
       m_projection(NULL) {}
 
 VideoTrack::~VideoTrack() {
+  delete[] m_colour_space;
   delete m_colour;
   delete m_projection;
 }
@@ -5307,7 +5331,7 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info,
   long long stereo_mode = 0;
 
   double rate = 0.0;
-  char* colour_space = NULL;
+  std::unique_ptr<char[]> colour_space_ptr;
 
   IMkvReader* const pReader = pSegment->m_pReader;
 
@@ -5384,9 +5408,11 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info,
         projection_ptr.reset(projection);
       }
     } else if (id == libwebm::kMkvColourSpace) {
+      char* colour_space = NULL;
       const long status = UnserializeString(pReader, pos, size, colour_space);
       if (status < 0)
         return status;
+      colour_space_ptr.reset(colour_space);
     }
 
     pos += size;  // consume payload
@@ -5418,7 +5444,7 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info,
   pTrack->m_stereo_mode = stereo_mode;
   pTrack->m_rate = rate;
   pTrack->m_colour = colour_ptr.release();
-  pTrack->m_colour_space = colour_space;
+  pTrack->m_colour_space = colour_space_ptr.release();
   pTrack->m_projection = projection_ptr.release();
 
   pResult = pTrack;
@@ -5648,7 +5674,7 @@ long Tracks::Parse() {
   const long long stop = m_start + m_size;
   IMkvReader* const pReader = m_pSegment->m_pReader;
 
-  int count = 0;
+  long long count = 0;
   long long pos = m_start;
 
   while (pos < stop) {
@@ -5662,8 +5688,11 @@ long Tracks::Parse() {
     if (size == 0)  // weird
       continue;
 
-    if (id == libwebm::kMkvTrackEntry)
+    if (id == libwebm::kMkvTrackEntry) {
       ++count;
+      if (count > INT_MAX)
+        return E_PARSE_FAILED;
+    }
 
     pos += size;  // consume payload
     if (pos > stop)
@@ -5676,7 +5705,7 @@ long Tracks::Parse() {
   if (count <= 0)
     return 0;  // success
 
-  m_trackEntries = new (std::nothrow) Track*[count];
+  m_trackEntries = new (std::nothrow) Track*[static_cast<size_t>(count)];
 
   if (m_trackEntries == NULL)
     return -1;
diff --git a/tools.mk b/tools.mk
index 1d005b2acf..dd2ebeb3d5 100644
--- a/tools.mk
+++ b/tools.mk
@@ -79,6 +79,7 @@ $(1): $($(1:.$(VCPROJ_SFX)=).SRCS) vpx.$(VCPROJ_SFX)
             --ver=$$(CONFIG_VS_VERSION)\
             --proj-guid=$$($$(@:.$(VCPROJ_SFX)=).GUID)\
             --src-path-bare="$(SRC_PATH_BARE)" \
+            --as=$$(AS) \
             $$(if $$(CONFIG_STATIC_MSVCRT),--static-crt) \
             --out=$$@ $$(INTERNAL_CFLAGS) $$(CFLAGS) \
             $$(INTERNAL_LDFLAGS) $$(LDFLAGS) $$^
diff --git a/tools/3D-Reconstruction/MotionEST/Exhaust.py b/tools/3D-Reconstruction/MotionEST/Exhaust.py
index 2d6a4d8114..d763de8562 100644
--- a/tools/3D-Reconstruction/MotionEST/Exhaust.py
+++ b/tools/3D-Reconstruction/MotionEST/Exhaust.py
@@ -83,7 +83,7 @@ def __init__(self, cur_f, ref_f, blk_size, wnd_size, beta, metric=MSE):
     self.beta = beta
     self.metric = metric
     super(ExhaustNeighbor, self).__init__(cur_f, ref_f, blk_size)
-    self.assign = np.zeros((self.num_row, self.num_col), dtype=np.bool)
+    self.assign = np.zeros((self.num_row, self.num_col), dtype=bool)
 
   """
     estimate neighbor loss:
diff --git a/tools/3D-Reconstruction/MotionEST/GroundTruth.py b/tools/3D-Reconstruction/MotionEST/GroundTruth.py
index 12bc53ff73..37305898a7 100644
--- a/tools/3D-Reconstruction/MotionEST/GroundTruth.py
+++ b/tools/3D-Reconstruction/MotionEST/GroundTruth.py
@@ -29,7 +29,7 @@ class GroundTruth(MotionEST):
   def __init__(self, cur_f, ref_f, blk_sz, gt_path, mf=None, mask=None):
     self.name = 'ground truth'
     super(GroundTruth, self).__init__(cur_f, ref_f, blk_sz)
-    self.mask = np.zeros((self.num_row, self.num_col), dtype=np.bool)
+    self.mask = np.zeros((self.num_row, self.num_col), dtype=bool)
     if gt_path:
       with open(gt_path) as gt_file:
         lines = gt_file.readlines()
@@ -42,7 +42,7 @@ def __init__(self, cur_f, ref_f, blk_sz, gt_path, mf=None, mask=None):
               self.mask[i, -j - 1] = True
               continue
             #the order of original file is flipped on the x axis
-            self.mf[i, -j - 1] = np.array([float(y), -float(x)], dtype=np.int)
+            self.mf[i, -j - 1] = np.array([float(y), -float(x)], dtype=int)
     else:
       self.mf = mf
       self.mask = mask
diff --git a/tools/3D-Reconstruction/MotionEST/MotionEST.py b/tools/3D-Reconstruction/MotionEST/MotionEST.py
index 0959530fa0..fc393818d9 100644
--- a/tools/3D-Reconstruction/MotionEST/MotionEST.py
+++ b/tools/3D-Reconstruction/MotionEST/MotionEST.py
@@ -28,8 +28,8 @@ def __init__(self, cur_f, ref_f, blk_sz):
     self.ref_f = ref_f
     self.blk_sz = blk_sz
     #convert RGB to YUV
-    self.cur_yuv = np.array(self.cur_f.convert('YCbCr'), dtype=np.int)
-    self.ref_yuv = np.array(self.ref_f.convert('YCbCr'), dtype=np.int)
+    self.cur_yuv = np.array(self.cur_f.convert('YCbCr'), dtype=int)
+    self.ref_yuv = np.array(self.ref_f.convert('YCbCr'), dtype=int)
     #frame size
     self.width = self.cur_f.size[0]
     self.height = self.cur_f.size[1]
diff --git a/tools/3D-Reconstruction/MotionEST/Util.py b/tools/3D-Reconstruction/MotionEST/Util.py
index 551881cfd7..c2416163be 100644
--- a/tools/3D-Reconstruction/MotionEST/Util.py
+++ b/tools/3D-Reconstruction/MotionEST/Util.py
@@ -18,7 +18,7 @@
 def MSE(blk1, blk2):
   return np.mean(
       LA.norm(
-          np.array(blk1, dtype=np.int) - np.array(blk2, dtype=np.int), axis=2))
+          np.array(blk1, dtype=int) - np.array(blk2, dtype=int), axis=2))
 
 
 def drawMF(img, blk_sz, mf):
diff --git a/tools/cpplint.py b/tools/cpplint.py
index 25fbef73d8..e3ebde2f5a 100755
--- a/tools/cpplint.py
+++ b/tools/cpplint.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # Copyright (c) 2009 Google Inc. All rights reserved.
 #
@@ -51,16 +51,23 @@
 import string
 import sys
 import unicodedata
+import sysconfig
+
+try:
+  xrange          # Python 2
+except NameError:
+  xrange = range  # Python 3
 
 
 _USAGE = """
 Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...]
                    [--counting=total|toplevel|detailed] [--root=subdir]
-                   [--linelength=digits]
+                   [--linelength=digits] [--headers=x,y,...]
+                   [--quiet]
         <file> [file] ...
 
   The style guidelines this tries to follow are those in
-    http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml
+    https://google-styleguide.googlecode.com/svn/trunk/cppguide.xml
 
   Every problem is given a confidence score from 1-5, with 5 meaning we are
   certain of the problem, and 1 meaning it could be a legitimate construct.
@@ -83,6 +90,9 @@
     verbose=#
       Specify a number 0-5 to restrict errors to certain verbosity levels.
 
+    quiet
+      Don't print anything if no errors are found.
+
     filter=-x,+y,...
       Specify a comma-separated list of category-filters to apply: only
       error messages whose category names pass the filters will be printed.
@@ -114,12 +124,13 @@
       ignored.
 
       Examples:
-        Assuing that src/.git exists, the header guard CPP variables for
-        src/chrome/browser/ui/browser.h are:
+        Assuming that top/src/.git exists (and cwd=top/src), the header guard
+        CPP variables for top/src/chrome/browser/ui/browser.h are:
 
         No flag => CHROME_BROWSER_UI_BROWSER_H_
         --root=chrome => BROWSER_UI_BROWSER_H_
         --root=chrome/browser => UI_BROWSER_H_
+        --root=.. => SRC_CHROME_BROWSER_UI_BROWSER_H_
 
     linelength=digits
       This is the allowed line length for the project. The default value is
@@ -133,6 +144,57 @@
 
       Examples:
         --extensions=hpp,cpp
+
+    headers=x,y,...
+      The header extensions that cpplint will treat as .h in checks. Values are
+      automatically added to --extensions list.
+
+      Examples:
+        --headers=hpp,hxx
+        --headers=hpp
+
+    cpplint.py supports per-directory configurations specified in CPPLINT.cfg
+    files. CPPLINT.cfg file can contain a number of key=value pairs.
+    Currently the following options are supported:
+
+      set noparent
+      filter=+filter1,-filter2,...
+      exclude_files=regex
+      linelength=80
+      root=subdir
+      headers=x,y,...
+
+    "set noparent" option prevents cpplint from traversing directory tree
+    upwards looking for more .cfg files in parent directories. This option
+    is usually placed in the top-level project directory.
+
+    The "filter" option is similar in function to --filter flag. It specifies
+    message filters in addition to the |_DEFAULT_FILTERS| and those specified
+    through --filter command-line flag.
+
+    "exclude_files" allows to specify a regular expression to be matched against
+    a file name. If the expression matches, the file is skipped and not run
+    through liner.
+
+    "linelength" allows to specify the allowed line length for the project.
+
+    The "root" option is similar in function to the --root flag (see example
+    above). Paths are relative to the directory of the CPPLINT.cfg.
+
+    The "headers" option is similar in function to the --headers flag
+    (see example above).
+
+    CPPLINT.cfg has an effect on files in the same directory and all
+    sub-directories, unless overridden by a nested configuration file.
+
+      Example file:
+        filter=-build/include_order,+build/include_alpha
+        exclude_files=.*\.cc
+
+    The above example disables build/include_order warning and enables
+    build/include_alpha as well as excludes all .cc from being
+    processed by linter, in the current directory (where the .cfg
+    file is located) and all sub-directories.
 """
 
 # We categorize each error message we print.  Here are the categories.
@@ -140,81 +202,101 @@
 # If you add a new error message with a new category, add it to the list
 # here!  cpplint_unittest.py should tell you if you forget to do this.
 _ERROR_CATEGORIES = [
-  'build/class',
-  'build/deprecated',
-  'build/endif_comment',
-  'build/explicit_make_pair',
-  'build/forward_decl',
-  'build/header_guard',
-  'build/include',
-  'build/include_alpha',
-  'build/include_order',
-  'build/include_what_you_use',
-  'build/namespaces',
-  'build/printf_format',
-  'build/storage_class',
-  'legal/copyright',
-  'readability/alt_tokens',
-  'readability/braces',
-  'readability/casting',
-  'readability/check',
-  'readability/constructors',
-  'readability/fn_size',
-  'readability/function',
-  'readability/multiline_comment',
-  'readability/multiline_string',
-  'readability/namespace',
-  'readability/nolint',
-  'readability/nul',
-  'readability/streams',
-  'readability/todo',
-  'readability/utf8',
-  'runtime/arrays',
-  'runtime/casting',
-  'runtime/explicit',
-  'runtime/int',
-  'runtime/init',
-  'runtime/invalid_increment',
-  'runtime/member_string_references',
-  'runtime/memset',
-  'runtime/operator',
-  'runtime/printf',
-  'runtime/printf_format',
-  'runtime/references',
-  'runtime/sizeof',
-  'runtime/string',
-  'runtime/threadsafe_fn',
-  'runtime/vlog',
-  'whitespace/blank_line',
-  'whitespace/braces',
-  'whitespace/comma',
-  'whitespace/comments',
-  'whitespace/empty_conditional_body',
-  'whitespace/empty_loop_body',
-  'whitespace/end_of_line',
-  'whitespace/ending_newline',
-  'whitespace/forcolon',
-  'whitespace/indent',
-  'whitespace/line_length',
-  'whitespace/newline',
-  'whitespace/operators',
-  'whitespace/parens',
-  'whitespace/semicolon',
-  'whitespace/tab',
-  'whitespace/todo'
-  ]
-
-# The default state of the category filter. This is overrided by the --filter=
+    'build/class',
+    'build/c++11',
+    'build/c++14',
+    'build/c++tr1',
+    'build/deprecated',
+    'build/endif_comment',
+    'build/explicit_make_pair',
+    'build/forward_decl',
+    'build/header_guard',
+    'build/include',
+    'build/include_alpha',
+    'build/include_order',
+    'build/include_what_you_use',
+    'build/namespaces',
+    'build/printf_format',
+    'build/storage_class',
+    'legal/copyright',
+    'readability/alt_tokens',
+    'readability/braces',
+    'readability/casting',
+    'readability/check',
+    'readability/constructors',
+    'readability/fn_size',
+    'readability/inheritance',
+    'readability/multiline_comment',
+    'readability/multiline_string',
+    'readability/namespace',
+    'readability/nolint',
+    'readability/nul',
+    'readability/strings',
+    'readability/todo',
+    'readability/utf8',
+    'runtime/arrays',
+    'runtime/casting',
+    'runtime/explicit',
+    'runtime/int',
+    'runtime/init',
+    'runtime/invalid_increment',
+    'runtime/member_string_references',
+    'runtime/memset',
+    'runtime/indentation_namespace',
+    'runtime/operator',
+    'runtime/printf',
+    'runtime/printf_format',
+    'runtime/references',
+    'runtime/string',
+    'runtime/threadsafe_fn',
+    'runtime/vlog',
+    'whitespace/blank_line',
+    'whitespace/braces',
+    'whitespace/comma',
+    'whitespace/comments',
+    'whitespace/empty_conditional_body',
+    'whitespace/empty_if_body',
+    'whitespace/empty_loop_body',
+    'whitespace/end_of_line',
+    'whitespace/ending_newline',
+    'whitespace/forcolon',
+    'whitespace/indent',
+    'whitespace/line_length',
+    'whitespace/newline',
+    'whitespace/operators',
+    'whitespace/parens',
+    'whitespace/semicolon',
+    'whitespace/tab',
+    'whitespace/todo',
+    ]
+
+# These error categories are no longer enforced by cpplint, but for backwards-
+# compatibility they may still appear in NOLINT comments.
+_LEGACY_ERROR_CATEGORIES = [
+    'readability/streams',
+    'readability/function',
+    ]
+
+# The default state of the category filter. This is overridden by the --filter=
 # flag. By default all errors are on, so only add here categories that should be
 # off by default (i.e., categories that must be enabled by the --filter= flags).
 # All entries here should start with a '-' or '+', as in the --filter= flag.
 _DEFAULT_FILTERS = ['-build/include_alpha']
 
+# The default list of categories suppressed for C (not C++) files.
+_DEFAULT_C_SUPPRESSED_CATEGORIES = [
+    'readability/casting',
+    ]
+
+# The default list of categories suppressed for Linux Kernel files.
+_DEFAULT_KERNEL_SUPPRESSED_CATEGORIES = [
+    'whitespace/tab',
+    ]
+
 # We used to check for high-bit characters, but after much discussion we
 # decided those were OK, as long as they were in UTF-8 and didn't represent
 # hard-coded international strings, which belong in a separate i18n file.
 
-
 # C++ headers
 _CPP_HEADERS = frozenset([
     # Legacy
@@ -304,6 +386,7 @@
     'random',
     'ratio',
     'regex',
+    'scoped_allocator',
     'set',
     'sstream',
     'stack',
@@ -351,15 +434,40 @@
     'cwctype',
     ])
 
+# Type names
+_TYPES = re.compile(
+    r'^(?:'
+    # [dcl.type.simple]
+    r'(char(16_t|32_t)?)|wchar_t|'
+    r'bool|short|int|long|signed|unsigned|float|double|'
+    # [support.types]
+    r'(ptrdiff_t|size_t|max_align_t|nullptr_t)|'
+    # [cstdint.syn]
+    r'(u?int(_fast|_least)?(8|16|32|64)_t)|'
+    r'(u?int(max|ptr)_t)|'
+    r')$')
+
+
+# These headers are excluded from [build/include] and [build/include_order]
+# checks:
+# - Anything not following google file name conventions (containing an
+#   uppercase character, such as Python.h or nsStringAPI.h, for example).
+# - Lua headers.
+_THIRD_PARTY_HEADERS_PATTERN = re.compile(
+    r'^(?:[^/]*[A-Z][^/]*\.h|lua\.h|lauxlib\.h|lualib\.h)$')
+
+# Pattern for matching FileInfo.BaseName() against test file name
+_TEST_FILE_SUFFIX = r'(_test|_unittest|_regtest)$'
+
+# Pattern that matches only complete whitespace, possibly across multiple lines.
+_EMPTY_CONDITIONAL_BODY_PATTERN = re.compile(r'^\s*$', re.DOTALL)
+
 # Assertion macros.  These are defined in base/logging.h and
-# testing/base/gunit.h.  Note that the _M versions need to come first
-# for substring matching to work.
+# testing/base/public/gunit.h.
 _CHECK_MACROS = [
     'DCHECK', 'CHECK',
-    'EXPECT_TRUE_M', 'EXPECT_TRUE',
-    'ASSERT_TRUE_M', 'ASSERT_TRUE',
-    'EXPECT_FALSE_M', 'EXPECT_FALSE',
-    'ASSERT_FALSE_M', 'ASSERT_FALSE',
+    'EXPECT_TRUE', 'ASSERT_TRUE',
+    'EXPECT_FALSE', 'ASSERT_FALSE',
     ]
 
 # Replacement macros for CHECK/DCHECK/EXPECT_TRUE/EXPECT_FALSE
@@ -372,16 +480,12 @@
   _CHECK_REPLACEMENT['CHECK'][op] = 'CHECK_%s' % replacement
   _CHECK_REPLACEMENT['EXPECT_TRUE'][op] = 'EXPECT_%s' % replacement
   _CHECK_REPLACEMENT['ASSERT_TRUE'][op] = 'ASSERT_%s' % replacement
-  _CHECK_REPLACEMENT['EXPECT_TRUE_M'][op] = 'EXPECT_%s_M' % replacement
-  _CHECK_REPLACEMENT['ASSERT_TRUE_M'][op] = 'ASSERT_%s_M' % replacement
 
 for op, inv_replacement in [('==', 'NE'), ('!=', 'EQ'),
                             ('>=', 'LT'), ('>', 'LE'),
                             ('<=', 'GT'), ('<', 'GE')]:
   _CHECK_REPLACEMENT['EXPECT_FALSE'][op] = 'EXPECT_%s' % inv_replacement
   _CHECK_REPLACEMENT['ASSERT_FALSE'][op] = 'ASSERT_%s' % inv_replacement
-  _CHECK_REPLACEMENT['EXPECT_FALSE_M'][op] = 'EXPECT_%s_M' % inv_replacement
-  _CHECK_REPLACEMENT['ASSERT_FALSE_M'][op] = 'ASSERT_%s_M' % inv_replacement
 
 # Alternative tokens and their replacements.  For full list, see section 2.5
 # Alternative tokens [lex.digraph] in the C++ standard.
@@ -430,11 +534,14 @@
                         r'(?:\s+(volatile|__volatile__))?'
                         r'\s*[{(]')
 
+# Match strings that indicate we're working on a C (not C++) file.
+_SEARCH_C_FILE = re.compile(r'\b(?:LINT_C_FILE|'
+                            r'vim?:\s*.*(\s*|:)filetype=c(\s*|:|$))')
 
-_regexp_compile_cache = {}
+# Match string that indicates we're working on a Linux Kernel file.
+_SEARCH_KERNEL_FILE = re.compile(r'\b(?:LINT_KERNEL_FILE)')
 
-# Finds occurrences of NOLINT or NOLINT(...).
-_RE_SUPPRESSION = re.compile(r'\bNOLINT\b(\([^)]*\))?')
+_regexp_compile_cache = {}
 
 # {str, set(int)}: a map from error categories to sets of linenumbers
 # on which those errors are expected and should be suppressed.
@@ -443,6 +550,7 @@
 # The root directory used for deriving header guard CPP variable.
 # This is set by --root flag.
 _root = None
+_root_debug = False
 
 # The allowed line length of files.
 # This is set by --linelength flag.
@@ -452,8 +560,28 @@
 # This is set by --extensions flag.
 _valid_extensions = set(['cc', 'h', 'cpp', 'cu', 'cuh'])
 
+# Treat all headers starting with 'h' equally: .h, .hpp, .hxx etc.
+# This is set by --headers flag.
+_hpp_headers = set(['h'])
+
+# {str, bool}: a map from error categories to booleans which indicate if the
+# category should be suppressed for every line.
+_global_error_suppressions = {}
+
+def ProcessHppHeadersOption(val):
+  global _hpp_headers
+  try:
+    _hpp_headers = set(val.split(','))
+    # Automatically append to extensions list so it does not have to be set 2 times
+    _valid_extensions.update(_hpp_headers)
+  except ValueError:
+    PrintUsage('Header extensions must be comma separated list.')
+
+def IsHeaderExtension(file_extension):
+  return file_extension in _hpp_headers
+
 def ParseNolintSuppressions(filename, raw_line, linenum, error):
-  """Updates the global list of error-suppressions.
+  """Updates the global list of line error-suppressions.
 
   Parses any NOLINT comments on the current line, updating the global
   error_suppressions store.  Reports an error if the NOLINT comment
@@ -465,42 +593,67 @@ def ParseNolintSuppressions(filename, raw_line, linenum, error):
     linenum: int, the number of the current line.
     error: function, an error handler.
   """
-  # FIXME(adonovan): "NOLINT(" is misparsed as NOLINT(*).
-  matched = _RE_SUPPRESSION.search(raw_line)
+  matched = Search(r'\bNOLINT(NEXTLINE)?\b(\([^)]+\))?', raw_line)
   if matched:
-    category = matched.group(1)
+    if matched.group(1):
+      suppressed_line = linenum + 1
+    else:
+      suppressed_line = linenum
+    category = matched.group(2)
     if category in (None, '(*)'):  # => "suppress all"
-      _error_suppressions.setdefault(None, set()).add(linenum)
+      _error_suppressions.setdefault(None, set()).add(suppressed_line)
     else:
       if category.startswith('(') and category.endswith(')'):
         category = category[1:-1]
         if category in _ERROR_CATEGORIES:
-          _error_suppressions.setdefault(category, set()).add(linenum)
-        else:
+          _error_suppressions.setdefault(category, set()).add(suppressed_line)
+        elif category not in _LEGACY_ERROR_CATEGORIES:
           error(filename, linenum, 'readability/nolint', 5,
                 'Unknown NOLINT error category: %s' % category)
 
 
+def ProcessGlobalSuppresions(lines):
+  """Updates the list of global error suppressions.
+
+  Parses any lint directives in the file that have global effect.
+
+  Args:
+    lines: An array of strings, each representing a line of the file, with the
+           last element being empty if the file is terminated with a newline.
+  """
+  for line in lines:
+    if _SEARCH_C_FILE.search(line):
+      for category in _DEFAULT_C_SUPPRESSED_CATEGORIES:
+        _global_error_suppressions[category] = True
+    if _SEARCH_KERNEL_FILE.search(line):
+      for category in _DEFAULT_KERNEL_SUPPRESSED_CATEGORIES:
+        _global_error_suppressions[category] = True
+
+
 def ResetNolintSuppressions():
-  "Resets the set of NOLINT suppressions to empty."
+  """Resets the set of NOLINT suppressions to empty."""
   _error_suppressions.clear()
+  _global_error_suppressions.clear()
 
 
 def IsErrorSuppressedByNolint(category, linenum):
   """Returns true if the specified error category is suppressed on this line.
 
   Consults the global error_suppressions map populated by
-  ParseNolintSuppressions/ResetNolintSuppressions.
+  ParseNolintSuppressions/ProcessGlobalSuppresions/ResetNolintSuppressions.
 
   Args:
     category: str, the category of the error.
     linenum: int, the current line number.
   Returns:
-    bool, True iff the error should be suppressed due to a NOLINT comment.
+    bool, True iff the error should be suppressed due to a NOLINT comment or
+    global suppression.
   """
-  return (linenum in _error_suppressions.get(category, set()) or
+  return (_global_error_suppressions.get(category, False) or
+          linenum in _error_suppressions.get(category, set()) or
           linenum in _error_suppressions.get(None, set()))
 
+
 def Match(pattern, s):
   """Matches the string with the pattern, caching the compiled regexp."""
   # The regexp compilation caching is inlined in both Match and Search for
@@ -536,11 +689,17 @@ def Search(pattern, s):
   return _regexp_compile_cache[pattern].search(s)
 
 
-class _IncludeState(dict):
+def _IsSourceExtension(s):
+  """File extension (excluding dot) matches a source file extension."""
+  return s in ('c', 'cc', 'cpp', 'cxx')
+
+
+class _IncludeState(object):
   """Tracks line numbers for includes, and the order in which includes appear.
 
-  As a dict, an _IncludeState object serves as a mapping between include
-  filename and line number on which that file was included.
+  include_list contains list of lists of (header, line number) pairs.
+  It's a lists of lists rather than just one flat list to make it
+  easier to update across preprocessor boundaries.
 
   Call CheckNextIncludeOrder() once for each header in the file, passing
   in the type constants defined above. Calls in an illegal order will
@@ -571,15 +730,42 @@ class _IncludeState(dict):
       }
 
   def __init__(self):
-    dict.__init__(self)
-    self.ResetSection()
+    self.include_list = [[]]
+    self.ResetSection('')
+
+  def FindHeader(self, header):
+    """Check if a header has already been included.
+
+    Args:
+      header: header to check.
+    Returns:
+      Line number of previous occurrence, or -1 if the header has not
+      been seen before.
+    """
+    for section_list in self.include_list:
+      for f in section_list:
+        if f[0] == header:
+          return f[1]
+    return -1
+
+  def ResetSection(self, directive):
+    """Reset section checking for preprocessor directive.
 
-  def ResetSection(self):
+    Args:
+      directive: preprocessor directive (e.g. "if", "else").
+    """
     # The name of the current section.
     self._section = self._INITIAL_SECTION
     # The path of last found header.
     self._last_header = ''
 
+    # Update list of includes.  Note that we never pop from the
+    # include list.
+    if directive in ('if', 'ifdef', 'ifndef'):
+      self.include_list.append([])
+    elif directive in ('else', 'elif'):
+      self.include_list[-1] = []
+
   def SetLastHeader(self, header_path):
     self._last_header = header_path
 
@@ -615,7 +801,7 @@ def IsInAlphabeticalOrder(self, clean_lines, linenum, header_path):
     # If previous line was a blank line, assume that the headers are
     # intentionally sorted the way they are.
     if (self._last_header > header_path and
-        not Match(r'^\s*$', clean_lines.elided[linenum - 1])):
+        Match(r'^\s*#\s*include\b', clean_lines.elided[linenum - 1])):
       return False
     return True
 
@@ -681,8 +867,11 @@ def __init__(self):
     self.error_count = 0    # global count of reported errors
     # filters to apply when emitting error messages
     self.filters = _DEFAULT_FILTERS[:]
+    # backup of filter list. Used to restore the state after each file.
+    self._filters_backup = self.filters[:]
     self.counting = 'total'  # In what way are we counting errors?
     self.errors_by_category = {}  # string to int dict storing error counts
+    self.quiet = False  # Suppress non-error messagess?
 
     # output format:
     # "emacs" - format that emacs can parse (default)
@@ -693,6 +882,12 @@ def SetOutputFormat(self, output_format):
     """Sets the output format for errors."""
     self.output_format = output_format
 
+  def SetQuiet(self, quiet):
+    """Sets the module's quiet settings, and returns the previous setting."""
+    last_quiet = self.quiet
+    self.quiet = quiet
+    return last_quiet
+
   def SetVerboseLevel(self, level):
     """Sets the module's verbosity, and returns the previous setting."""
     last_verbose_level = self.verbose_level
@@ -719,6 +914,10 @@ def SetFilters(self, filters):
     """
     # Default filters always have less priority than the flag ones.
     self.filters = _DEFAULT_FILTERS[:]
+    self.AddFilters(filters)
+
+  def AddFilters(self, filters):
+    """ Adds more filters to the existing list of error-message filters. """
     for filt in filters.split(','):
       clean_filt = filt.strip()
       if clean_filt:
@@ -728,6 +927,14 @@ def SetFilters(self, filters):
         raise ValueError('Every filter in --filters must start with + or -'
                          ' (%s does not)' % filt)
 
+  def BackupFilters(self):
+    """ Saves the current filter list to backup storage."""
+    self._filters_backup = self.filters[:]
+
+  def RestoreFilters(self):
+    """ Restores filters previously backed up."""
+    self.filters = self._filters_backup[:]
+
   def ResetErrorCounts(self):
     """Sets the module's error statistic back to zero."""
     self.error_count = 0
@@ -748,7 +955,7 @@ def PrintErrorCounts(self):
     for category, count in self.errors_by_category.iteritems():
       sys.stderr.write('Category \'%s\' errors found: %d\n' %
                        (category, count))
-    sys.stderr.write('Total errors found: %d\n' % self.error_count)
+    sys.stdout.write('Total errors found: %d\n' % self.error_count)
 
 _cpplint_state = _CppLintState()
 
@@ -762,6 +969,14 @@ def _SetOutputFormat(output_format):
   """Sets the module's output format."""
   _cpplint_state.SetOutputFormat(output_format)
 
+def _Quiet():
+  """Return's the module's quiet setting."""
+  return _cpplint_state.quiet
+
+def _SetQuiet(quiet):
+  """Set the module's quiet status, and return previous setting."""
+  return _cpplint_state.SetQuiet(quiet)
+
 
 def _VerboseLevel():
   """Returns the module's verbosity setting."""
@@ -795,6 +1010,25 @@ def _SetFilters(filters):
   """
   _cpplint_state.SetFilters(filters)
 
+def _AddFilters(filters):
+  """Adds more filter overrides.
+
+  Unlike _SetFilters, this function does not reset the current list of filters
+  available.
+
+  Args:
+    filters: A string of comma-separated filters (eg "whitespace/indent").
+             Each filter should start with + or -; else we die.
+  """
+  _cpplint_state.AddFilters(filters)
+
+def _BackupFilters():
+  """ Saves the current filter list to backup storage."""
+  _cpplint_state.BackupFilters()
+
+def _RestoreFilters():
+  """ Restores filters previously backed up."""
+  _cpplint_state.RestoreFilters()
 
 class _FunctionState(object):
   """Tracks current function name and the number of lines in its body."""
@@ -830,6 +1064,9 @@ def Check(self, error, filename, linenum):
       filename: The name of the current file.
       linenum: The number of the line to check.
     """
+    if not self.in_a_function:
+      return
+
     if Match(r'T(EST|est)', self.current_function):
       base_trigger = self._TEST_TRIGGER
     else:
@@ -857,7 +1094,7 @@ class _IncludeError(Exception):
   pass
 
 
-class FileInfo:
+class FileInfo(object):
   """Provides utility functions for filenames.
 
   FileInfo provides easy access to the components of a file's path
@@ -900,12 +1137,13 @@ def RepositoryName(self):
 
       # Not SVN <= 1.6? Try to find a git, hg, or svn top level directory by
       # searching up from the current path.
-      root_dir = os.path.dirname(fullname)
-      while (root_dir != os.path.dirname(root_dir) and
-             not os.path.exists(os.path.join(root_dir, ".git")) and
-             not os.path.exists(os.path.join(root_dir, ".hg")) and
-             not os.path.exists(os.path.join(root_dir, ".svn"))):
-        root_dir = os.path.dirname(root_dir)
+      root_dir = current_dir = os.path.dirname(fullname)
+      while current_dir != os.path.dirname(current_dir):
+        if (os.path.exists(os.path.join(current_dir, ".git")) or
+            os.path.exists(os.path.join(current_dir, ".hg")) or
+            os.path.exists(os.path.join(current_dir, ".svn"))):
+          root_dir = current_dir
+        current_dir = os.path.dirname(current_dir)
 
       if (os.path.exists(os.path.join(root_dir, ".git")) or
           os.path.exists(os.path.join(root_dir, ".hg")) or
@@ -944,7 +1182,7 @@ def NoExtension(self):
 
   def IsSource(self):
     """File has a source file extension."""
-    return self.Extension()[1:] in ('c', 'cc', 'cpp', 'cxx')
+    return _IsSourceExtension(self.Extension()[1:])
 
 
 def _ShouldPrintError(category, confidence, linenum):
@@ -955,6 +1193,7 @@ def _ShouldPrintError(category, confidence, linenum):
   # the verbosity level isn't high enough, or the filters filter it out.
   if IsErrorSuppressedByNolint(category, linenum):
     return False
+
   if confidence < _cpplint_state.verbose_level:
     return False
 
@@ -999,8 +1238,8 @@ def Error(filename, linenum, category, confidence, message):
   if _ShouldPrintError(category, confidence, linenum):
     _cpplint_state.IncrementErrorCount(category)
     if _cpplint_state.output_format == 'vs7':
-      sys.stderr.write('%s(%s):  %s  [%s] [%d]\n' % (
-          filename, linenum, message, category, confidence))
+      sys.stderr.write('%s(%s): error cpplint: [%s] %s [%d]\n' % (
+          filename, linenum, category, message, confidence))
     elif _cpplint_state.output_format == 'eclipse':
       sys.stderr.write('%s:%s: warning: %s  [%s] [%d]\n' % (
           filename, linenum, message, category, confidence))
@@ -1012,11 +1251,9 @@ def Error(filename, linenum, category, confidence, message):
 # Matches standard C++ escape sequences per 2.13.2.3 of the C++ standard.
 _RE_PATTERN_CLEANSE_LINE_ESCAPES = re.compile(
     r'\\([abfnrtv?"\\\']|\d+|x[0-9a-fA-F]+)')
-# Matches strings.  Escape codes should already be removed by ESCAPES.
-_RE_PATTERN_CLEANSE_LINE_DOUBLE_QUOTES = re.compile(r'"[^"]*"')
-# Matches characters.  Escape codes should already be removed by ESCAPES.
-_RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES = re.compile(r"'.'")
-# Matches multi-line C++ comments.
+# Match a single C style comment on the same line.
+_RE_PATTERN_C_COMMENTS = r'/\*(?:[^*]|\*(?!/))*\*/'
+# Matches multi-line C style comments.
 # This RE is a little bit more complicated than one might expect, because we
 # have to take care of space removals tools so we can handle comments inside
 # statements better.
@@ -1025,10 +1262,10 @@ def Error(filename, linenum, category, confidence, message):
 # if this doesn't work we try on left side but only if there's a non-character
 # on the right.
 _RE_PATTERN_CLEANSE_LINE_C_COMMENTS = re.compile(
-    r"""(\s*/\*.*\*/\s*$|
-            /\*.*\*/\s+|
-         \s+/\*.*\*/(?=\W)|
-            /\*.*\*/)""", re.VERBOSE)
+    r'(\s*' + _RE_PATTERN_C_COMMENTS + r'\s*$|' +
+    _RE_PATTERN_C_COMMENTS + r'\s+|' +
+    r'\s+' + _RE_PATTERN_C_COMMENTS + r'(?=\W)|' +
+    _RE_PATTERN_C_COMMENTS + r')')
 
 
 def IsCppString(line):
@@ -1083,13 +1320,26 @@ def CleanseRawStrings(raw_lines):
         delimiter = None
       else:
         # Haven't found the end yet, append a blank line.
-        line = ''
+        line = '""'
 
-    else:
+    # Look for beginning of a raw string, and replace them with
+    # empty strings.  This is done in a loop to handle multiple raw
+    # strings on the same line.
+    while delimiter is None:
       # Look for beginning of a raw string.
       # See 2.14.15 [lex.string] for syntax.
-      matched = Match(r'^(.*)\b(?:R|u8R|uR|UR|LR)"([^\s\\()]*)\((.*)$', line)
-      if matched:
+      #
+      # Once we have matched a raw string, we check the prefix of the
+      # line to make sure that the line is not part of a single line
+      # comment.  It's done this way because we remove raw strings
+      # before removing comments as opposed to removing comments
+      # before removing raw strings.  This is because there are some
+      # cpplint checks that requires the comments to be preserved, but
+      # we don't want to check comments that are inside raw strings.
+      matched = Match(r'^(.*?)\b(?:R|u8R|uR|UR|LR)"([^\s\\()]*)\((.*)$', line)
+      if (matched and
+          not Match(r'^([^\'"]|\'(\\.|[^\'])*\'|"(\\.|[^"])*")*//',
+                    matched.group(1))):
         delimiter = ')' + matched.group(2) + '"'
 
         end = matched.group(3).find(delimiter)
@@ -1101,6 +1351,8 @@ def CleanseRawStrings(raw_lines):
         else:
           # Start of a multi-line raw string
           line = matched.group(1) + '""'
+      else:
+        break
 
     lines_without_raw_strings.append(line)
 
@@ -1131,10 +1383,10 @@ def FindNextMultiLineCommentEnd(lines, lineix):
 
 def RemoveMultiLineCommentsFromRange(lines, begin, end):
   """Clears a range of lines for multi-line comments."""
-  # Having // dummy comments makes the lines non-empty, so we will not get
+  # Having // <empty> comments makes the lines non-empty, so we will not get
   # unnecessary blank line warnings later in the code.
   for i in range(begin, end):
-    lines[i] = '// dummy'
+    lines[i] = '/**/'
 
 
 def RemoveMultiLineComments(filename, lines, error):
@@ -1170,12 +1422,14 @@ def CleanseComments(line):
 
 
 class CleansedLines(object):
-  """Holds 3 copies of all lines with different preprocessing applied to them.
+  """Holds 4 copies of all lines with different preprocessing applied to them.
 
-  1) elided member contains lines without strings and comments,
-  2) lines member contains lines without comments, and
+  1) elided member contains lines without strings and comments.
+  2) lines member contains lines without comments.
   3) raw_lines member contains all the lines without processing.
-  All these three members are of <type 'list'>, and of the same length.
+  4) lines_without_raw_strings member is same as raw_lines, but with C++11 raw
+     strings removed.
+  All these members are of <type 'list'>, and of the same length.
   """
 
   def __init__(self, lines):
@@ -1206,38 +1460,138 @@ def _CollapseStrings(elided):
     Returns:
       The line with collapsed strings.
     """
-    if not _RE_PATTERN_INCLUDE.match(elided):
-      # Remove escaped characters first to make quote/single quote collapsing
-      # basic.  Things that look like escaped characters shouldn't occur
-      # outside of strings and chars.
-      elided = _RE_PATTERN_CLEANSE_LINE_ESCAPES.sub('', elided)
-      elided = _RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES.sub("''", elided)
-      elided = _RE_PATTERN_CLEANSE_LINE_DOUBLE_QUOTES.sub('""', elided)
-    return elided
+    if _RE_PATTERN_INCLUDE.match(elided):
+      return elided
+
+    # Remove escaped characters first to make quote/single quote collapsing
+    # basic.  Things that look like escaped characters shouldn't occur
+    # outside of strings and chars.
+    elided = _RE_PATTERN_CLEANSE_LINE_ESCAPES.sub('', elided)
+
+    # Replace quoted strings and digit separators.  Both single quotes
+    # and double quotes are processed in the same loop, otherwise
+    # nested quotes wouldn't work.
+    collapsed = ''
+    while True:
+      # Find the first quote character
+      match = Match(r'^([^\'"]*)([\'"])(.*)$', elided)
+      if not match:
+        collapsed += elided
+        break
+      head, quote, tail = match.groups()
+
+      if quote == '"':
+        # Collapse double quoted strings
+        second_quote = tail.find('"')
+        if second_quote >= 0:
+          collapsed += head + '""'
+          elided = tail[second_quote + 1:]
+        else:
+          # Unmatched double quote, don't bother processing the rest
+          # of the line since this is probably a multiline string.
+          collapsed += elided
+          break
+      else:
+        # Found single quote, check nearby text to eliminate digit separators.
+        #
+        # There is no special handling for floating point here, because
+        # the integer/fractional/exponent parts would all be parsed
+        # correctly as long as there are digits on both sides of the
+        # separator.  So we are fine as long as we don't see something
+        # like "0.'3" (gcc 4.9.0 will not allow this literal).
+        if Search(r'\b(?:0[bBxX]?|[1-9])[0-9a-fA-F]*$', head):
+          match_literal = Match(r'^((?:\'?[0-9a-zA-Z_])*)(.*)$', "'" + tail)
+          collapsed += head + match_literal.group(1).replace("'", '')
+          elided = match_literal.group(2)
+        else:
+          second_quote = tail.find('\'')
+          if second_quote >= 0:
+            collapsed += head + "''"
+            elided = tail[second_quote + 1:]
+          else:
+            # Unmatched single quote
+            collapsed += elided
+            break
 
+    return collapsed
 
-def FindEndOfExpressionInLine(line, startpos, depth, startchar, endchar):
-  """Find the position just after the matching endchar.
+
+def FindEndOfExpressionInLine(line, startpos, stack):
+  """Find the position just after the end of current parenthesized expression.
 
   Args:
     line: a CleansedLines line.
     startpos: start searching at this position.
-    depth: nesting level at startpos.
-    startchar: expression opening character.
-    endchar: expression closing character.
+    stack: nesting stack at startpos.
 
   Returns:
-    On finding matching endchar: (index just after matching endchar, 0)
-    Otherwise: (-1, new depth at end of this line)
+    On finding matching end: (index just after matching end, None)
+    On finding an unclosed expression: (-1, None)
+    Otherwise: (-1, new stack at end of this line)
   """
   for i in xrange(startpos, len(line)):
-    if line[i] == startchar:
-      depth += 1
-    elif line[i] == endchar:
-      depth -= 1
-      if depth == 0:
-        return (i + 1, 0)
-  return (-1, depth)
+    char = line[i]
+    if char in '([{':
+      # Found start of parenthesized expression, push to expression stack
+      stack.append(char)
+    elif char == '<':
+      # Found potential start of template argument list
+      if i > 0 and line[i - 1] == '<':
+        # Left shift operator
+        if stack and stack[-1] == '<':
+          stack.pop()
+          if not stack:
+            return (-1, None)
+      elif i > 0 and Search(r'\boperator\s*$', line[0:i]):
+        # operator<, don't add to stack
+        continue
+      else:
+        # Tentative start of template argument list
+        stack.append('<')
+    elif char in ')]}':
+      # Found end of parenthesized expression.
+      #
+      # If we are currently expecting a matching '>', the pending '<'
+      # must have been an operator.  Remove them from expression stack.
+      while stack and stack[-1] == '<':
+        stack.pop()
+      if not stack:
+        return (-1, None)
+      if ((stack[-1] == '(' and char == ')') or
+          (stack[-1] == '[' and char == ']') or
+          (stack[-1] == '{' and char == '}')):
+        stack.pop()
+        if not stack:
+          return (i + 1, None)
+      else:
+        # Mismatched parentheses
+        return (-1, None)
+    elif char == '>':
+      # Found potential end of template argument list.
+
+      # Ignore "->" and operator functions
+      if (i > 0 and
+          (line[i - 1] == '-' or Search(r'\boperator\s*$', line[0:i - 1]))):
+        continue
+
+      # Pop the stack if there is a matching '<'.  Otherwise, ignore
+      # this '>' since it must be an operator.
+      if stack:
+        if stack[-1] == '<':
+          stack.pop()
+          if not stack:
+            return (i + 1, None)
+    elif char == ';':
+      # Found something that look like end of statements.  If we are currently
+      # expecting a '>', the matching '<' must have been an operator, since
+      # template argument list should not contain statements.
+      while stack and stack[-1] == '<':
+        stack.pop()
+      if not stack:
+        return (-1, None)
+
+  # Did not find end of expression or unbalanced parentheses on this line
+  return (-1, stack)
 
 
 def CloseExpression(clean_lines, linenum, pos):
@@ -1246,6 +1600,11 @@ def CloseExpression(clean_lines, linenum, pos):
   If lines[linenum][pos] points to a '(' or '{' or '[' or '<', finds the
   linenum/pos that correspond to the closing of the expression.
 
+  TODO(unknown): cpplint spends a fair bit of time matching parentheses.
+  Ideally we would want to index all opening and closing parentheses once
+  and have CloseExpression be just a simple lookup, but due to preprocessor
+  tricks, this is not so easy.
+
   Args:
     clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
@@ -1259,35 +1618,28 @@ def CloseExpression(clean_lines, linenum, pos):
   """
 
   line = clean_lines.elided[linenum]
-  startchar = line[pos]
-  if startchar not in '({[<':
+  if (line[pos] not in '({[<') or Match(r'<[<=]', line[pos:]):
     return (line, clean_lines.NumLines(), -1)
-  if startchar == '(': endchar = ')'
-  if startchar == '[': endchar = ']'
-  if startchar == '{': endchar = '}'
-  if startchar == '<': endchar = '>'
 
   # Check first line
-  (end_pos, num_open) = FindEndOfExpressionInLine(
-      line, pos, 0, startchar, endchar)
+  (end_pos, stack) = FindEndOfExpressionInLine(line, pos, [])
   if end_pos > -1:
     return (line, linenum, end_pos)
 
   # Continue scanning forward
-  while linenum < clean_lines.NumLines() - 1:
+  while stack and linenum < clean_lines.NumLines() - 1:
     linenum += 1
     line = clean_lines.elided[linenum]
-    (end_pos, num_open) = FindEndOfExpressionInLine(
-        line, 0, num_open, startchar, endchar)
+    (end_pos, stack) = FindEndOfExpressionInLine(line, 0, stack)
     if end_pos > -1:
       return (line, linenum, end_pos)
 
-  # Did not find endchar before end of file, give up
+  # Did not find end of expression before end of file, give up
   return (line, clean_lines.NumLines(), -1)
 
 
-def FindStartOfExpressionInLine(line, endpos, depth, startchar, endchar):
-  """Find position at the matching startchar.
+def FindStartOfExpressionInLine(line, endpos, stack):
+  """Find position at the matching start of current expression.
 
   This is almost the reverse of FindEndOfExpressionInLine, but note
   that the input position and returned position differs by 1.
@@ -1295,22 +1647,72 @@ def FindStartOfExpressionInLine(line, endpos, depth, startchar, endchar):
   Args:
     line: a CleansedLines line.
     endpos: start searching at this position.
-    depth: nesting level at endpos.
-    startchar: expression opening character.
-    endchar: expression closing character.
+    stack: nesting stack at endpos.
 
   Returns:
-    On finding matching startchar: (index at matching startchar, 0)
-    Otherwise: (-1, new depth at beginning of this line)
+    On finding matching start: (index at matching start, None)
+    On finding an unclosed expression: (-1, None)
+    Otherwise: (-1, new stack at beginning of this line)
   """
-  for i in xrange(endpos, -1, -1):
-    if line[i] == endchar:
-      depth += 1
-    elif line[i] == startchar:
-      depth -= 1
-      if depth == 0:
-        return (i, 0)
-  return (-1, depth)
+  i = endpos
+  while i >= 0:
+    char = line[i]
+    if char in ')]}':
+      # Found end of expression, push to expression stack
+      stack.append(char)
+    elif char == '>':
+      # Found potential end of template argument list.
+      #
+      # Ignore it if it's a "->" or ">=" or "operator>"
+      if (i > 0 and
+          (line[i - 1] == '-' or
+           Match(r'\s>=\s', line[i - 1:]) or
+           Search(r'\boperator\s*$', line[0:i]))):
+        i -= 1
+      else:
+        stack.append('>')
+    elif char == '<':
+      # Found potential start of template argument list
+      if i > 0 and line[i - 1] == '<':
+        # Left shift operator
+        i -= 1
+      else:
+        # If there is a matching '>', we can pop the expression stack.
+        # Otherwise, ignore this '<' since it must be an operator.
+        if stack and stack[-1] == '>':
+          stack.pop()
+          if not stack:
+            return (i, None)
+    elif char in '([{':
+      # Found start of expression.
+      #
+      # If there are any unmatched '>' on the stack, they must be
+      # operators.  Remove those.
+      while stack and stack[-1] == '>':
+        stack.pop()
+      if not stack:
+        return (-1, None)
+      if ((char == '(' and stack[-1] == ')') or
+          (char == '[' and stack[-1] == ']') or
+          (char == '{' and stack[-1] == '}')):
+        stack.pop()
+        if not stack:
+          return (i, None)
+      else:
+        # Mismatched parentheses
+        return (-1, None)
+    elif char == ';':
+      # Found something that look like end of statements.  If we are currently
+      # expecting a '<', the matching '>' must have been an operator, since
+      # template argument list should not contain statements.
+      while stack and stack[-1] == '>':
+        stack.pop()
+      if not stack:
+        return (-1, None)
+
+    i -= 1
+
+  return (-1, stack)
 
 
 def ReverseCloseExpression(clean_lines, linenum, pos):
@@ -1331,30 +1733,23 @@ def ReverseCloseExpression(clean_lines, linenum, pos):
     return is the 'cleansed' line at linenum.
   """
   line = clean_lines.elided[linenum]
-  endchar = line[pos]
-  if endchar not in ')}]>':
+  if line[pos] not in ')}]>':
     return (line, 0, -1)
-  if endchar == ')': startchar = '('
-  if endchar == ']': startchar = '['
-  if endchar == '}': startchar = '{'
-  if endchar == '>': startchar = '<'
 
   # Check last line
-  (start_pos, num_open) = FindStartOfExpressionInLine(
-      line, pos, 0, startchar, endchar)
+  (start_pos, stack) = FindStartOfExpressionInLine(line, pos, [])
   if start_pos > -1:
     return (line, linenum, start_pos)
 
   # Continue scanning backward
-  while linenum > 0:
+  while stack and linenum > 0:
     linenum -= 1
     line = clean_lines.elided[linenum]
-    (start_pos, num_open) = FindStartOfExpressionInLine(
-        line, len(line) - 1, num_open, startchar, endchar)
+    (start_pos, stack) = FindStartOfExpressionInLine(line, len(line) - 1, stack)
     if start_pos > -1:
       return (line, linenum, start_pos)
 
-  # Did not find startchar before beginning of file, give up
+  # Did not find start of expression before beginning of file, give up
   return (line, 0, -1)
 
 
@@ -1362,7 +1757,7 @@ def CheckForCopyright(filename, lines, error):
   """Logs an error if no Copyright message appears at the top of the file."""
 
   # We'll say it should occur by line 10. Don't forget there's a
-  # dummy line at the front.
+  # placeholder line at the front.
   for line in xrange(1, min(len(lines), 11)):
     if re.search(r'Copyright', lines[line], re.I): break
   else:                       # means no copyright line was found
@@ -1371,6 +1766,46 @@ def CheckForCopyright(filename, lines, error):
           'You should have a line: "Copyright [year] <Copyright Owner>"')
 
 
+def GetIndentLevel(line):
+  """Return the number of leading spaces in line.
+
+  Args:
+    line: A string to check.
+
+  Returns:
+    An integer count of leading spaces, possibly zero.
+  """
+  indent = Match(r'^( *)\S', line)
+  if indent:
+    return len(indent.group(1))
+  else:
+    return 0
+
+def PathSplitToList(path):
+  """Returns the path split into a list by the separator.
+
+  Args:
+    path: An absolute or relative path (e.g. '/a/b/c/' or '../a')
+
+  Returns:
+    A list of path components (e.g. ['a', 'b', 'c]).
+  """
+  lst = []
+  while True:
+    (head, tail) = os.path.split(path)
+    if head == path: # absolute paths end
+      lst.append(head)
+      break
+    if tail == path: # relative paths end
+      lst.append(tail)
+      break
+
+    path = head
+    lst.append(tail)
+
+  lst.reverse()
+  return lst
+
 def GetHeaderGuardCPPVariable(filename):
   """Returns the CPP variable that should be used as a header guard.
 
@@ -1387,15 +1822,67 @@ def GetHeaderGuardCPPVariable(filename):
   # flymake.
   filename = re.sub(r'_flymake\.h$', '.h', filename)
   filename = re.sub(r'/\.flymake/([^/]*)$', r'/\1', filename)
+  # Replace 'c++' with 'cpp'.
+  filename = filename.replace('C++', 'cpp').replace('c++', 'cpp')
 
   fileinfo = FileInfo(filename)
   file_path_from_root = fileinfo.RepositoryName()
-  if _root:
-    file_path_from_root = re.sub('^' + _root + os.sep, '', file_path_from_root)
-  return re.sub(r'[-./\s]', '_', file_path_from_root).upper() + '_'
 
+  def FixupPathFromRoot():
+    if _root_debug:
+      sys.stderr.write("\n_root fixup, _root = '%s', repository name = '%s'\n"
+          %(_root, fileinfo.RepositoryName()))
+
+    # Process the file path with the --root flag if it was set.
+    if not _root:
+      if _root_debug:
+        sys.stderr.write("_root unspecified\n")
+      return file_path_from_root
+
+    def StripListPrefix(lst, prefix):
+      # f(['x', 'y'], ['w, z']) -> None  (not a valid prefix)
+      if lst[:len(prefix)] != prefix:
+        return None
+      # f(['a, 'b', 'c', 'd'], ['a', 'b']) -> ['c', 'd']
+      return lst[(len(prefix)):]
+
+    # root behavior:
+    #   --root=subdir , lstrips subdir from the header guard
+    maybe_path = StripListPrefix(PathSplitToList(file_path_from_root),
+                                 PathSplitToList(_root))
+
+    if _root_debug:
+      sys.stderr.write(("_root lstrip (maybe_path=%s, file_path_from_root=%s," +
+          " _root=%s)\n") %(maybe_path, file_path_from_root, _root))
+
+    if maybe_path:
+      return os.path.join(*maybe_path)
+
+    #   --root=.. , will prepend the outer directory to the header guard
+    full_path = fileinfo.FullName()
+    root_abspath = os.path.abspath(_root)
+
+    maybe_path = StripListPrefix(PathSplitToList(full_path),
+                                 PathSplitToList(root_abspath))
+
+    if _root_debug:
+      sys.stderr.write(("_root prepend (maybe_path=%s, full_path=%s, " +
+          "root_abspath=%s)\n") %(maybe_path, full_path, root_abspath))
+
+    if maybe_path:
+      return os.path.join(*maybe_path)
 
-def CheckForHeaderGuard(filename, lines, error):
+    if _root_debug:
+      sys.stderr.write("_root ignore, returning %s\n" %(file_path_from_root))
+
+    #   --root=FAKE_DIR is ignored
+    return file_path_from_root
+
+  file_path_from_root = FixupPathFromRoot()
+  return re.sub(r'[^a-zA-Z0-9]', '_', file_path_from_root).upper() + '_'
+
+
+def CheckForHeaderGuard(filename, clean_lines, error):
   """Checks that the file contains a header guard.
 
   Logs an error if no #ifndef header guard is present.  For other
@@ -1403,18 +1890,29 @@ def CheckForHeaderGuard(filename, lines, error):
 
   Args:
     filename: The name of the C++ header file.
-    lines: An array of strings, each representing a line of the file.
+    clean_lines: A CleansedLines instance containing the file.
     error: The function to call with any errors found.
   """
 
+  # Don't check for header guards if there are error suppression
+  # comments somewhere in this file.
+  #
+  # Because this is silencing a warning for a nonexistent line, we
+  # only support the very specific NOLINT(build/header_guard) syntax,
+  # and not the general NOLINT or NOLINT(*) syntax.
+  raw_lines = clean_lines.lines_without_raw_strings
+  for i in raw_lines:
+    if Search(r'//\s*NOLINT\(build/header_guard\)', i):
+      return
+
   cppvar = GetHeaderGuardCPPVariable(filename)
 
-  ifndef = None
+  ifndef = ''
   ifndef_linenum = 0
-  define = None
-  endif = None
+  define = ''
+  endif = ''
   endif_linenum = 0
-  for linenum, line in enumerate(lines):
+  for linenum, line in enumerate(raw_lines):
     linesplit = line.split()
     if len(linesplit) >= 2:
       # find the first occurrence of #ifndef and #define, save arg
@@ -1429,18 +1927,12 @@ def CheckForHeaderGuard(filename, lines, error):
       endif = line
       endif_linenum = linenum
 
-  if not ifndef:
+  if not ifndef or not define or ifndef != define:
     error(filename, 0, 'build/header_guard', 5,
           'No #ifndef header guard found, suggested CPP variable is: %s' %
           cppvar)
     return
 
-  if not define:
-    error(filename, 0, 'build/header_guard', 5,
-          'No #define header guard found, suggested CPP variable is: %s' %
-          cppvar)
-    return
-
   # The guard should be PATH_FILE_H_, but we also allow PATH_FILE_H__
   # for backward compatibility.
   if ifndef != cppvar:
@@ -1448,26 +1940,69 @@ def CheckForHeaderGuard(filename, lines, error):
     if ifndef != cppvar + '_':
       error_level = 5
 
-    ParseNolintSuppressions(filename, lines[ifndef_linenum], ifndef_linenum,
+    ParseNolintSuppressions(filename, raw_lines[ifndef_linenum], ifndef_linenum,
                             error)
     error(filename, ifndef_linenum, 'build/header_guard', error_level,
           '#ifndef header guard has wrong style, please use: %s' % cppvar)
 
-  if define != ifndef:
-    error(filename, 0, 'build/header_guard', 5,
-          '#ifndef and #define don\'t match, suggested CPP variable is: %s' %
-          cppvar)
+  # Check for "//" comments on endif line.
+  ParseNolintSuppressions(filename, raw_lines[endif_linenum], endif_linenum,
+                          error)
+  match = Match(r'#endif\s*//\s*' + cppvar + r'(_)?\b', endif)
+  if match:
+    if match.group(1) == '_':
+      # Issue low severity warning for deprecated double trailing underscore
+      error(filename, endif_linenum, 'build/header_guard', 0,
+            '#endif line should be "#endif  // %s"' % cppvar)
     return
 
-  if endif != ('#endif  // %s' % cppvar):
-    error_level = 0
-    if endif != ('#endif  // %s' % (cppvar + '_')):
-      error_level = 5
+  # Didn't find the corresponding "//" comment.  If this file does not
+  # contain any "//" comments at all, it could be that the compiler
+  # only wants "/**/" comments, look for those instead.
+  no_single_line_comments = True
+  for i in xrange(1, len(raw_lines) - 1):
+    line = raw_lines[i]
+    if Match(r'^(?:(?:\'(?:\.|[^\'])*\')|(?:"(?:\.|[^"])*")|[^\'"])*//', line):
+      no_single_line_comments = False
+      break
 
-    ParseNolintSuppressions(filename, lines[endif_linenum], endif_linenum,
-                            error)
-    error(filename, endif_linenum, 'build/header_guard', error_level,
-          '#endif line should be "#endif  // %s"' % cppvar)
+  if no_single_line_comments:
+    match = Match(r'#endif\s*/\*\s*' + cppvar + r'(_)?\s*\*/', endif)
+    if match:
+      if match.group(1) == '_':
+        # Low severity warning for double trailing underscore
+        error(filename, endif_linenum, 'build/header_guard', 0,
+              '#endif line should be "#endif  /* %s */"' % cppvar)
+      return
+
+  # Didn't find anything
+  error(filename, endif_linenum, 'build/header_guard', 5,
+        '#endif line should be "#endif  // %s"' % cppvar)
+
+
+def CheckHeaderFileIncluded(filename, include_state, error):
+  """Logs an error if a .cc file does not include its header."""
+
+  # Do not check test files
+  fileinfo = FileInfo(filename)
+  if Search(_TEST_FILE_SUFFIX, fileinfo.BaseName()):
+    return
+
+  headerfile = filename[0:len(filename) - len(fileinfo.Extension())] + '.h'
+  if not os.path.exists(headerfile):
+    return
+  headername = FileInfo(headerfile).RepositoryName()
+  first_include = 0
+  for section_list in include_state.include_list:
+    for f in section_list:
+      if headername in f[0] or f[0] in headername:
+        return
+      if not first_include:
+        first_include = f[1]
+
+  error(filename, first_include, 'build/include', 5,
+        '%s should include its header file %s' % (fileinfo.RepositoryName(),
+                                                  headername))
 
 
 def CheckForBadCharacters(filename, lines, error):
@@ -1551,19 +2086,33 @@ def CheckForMultilineCommentsAndStrings(filename, clean_lines, linenum, error):
           'Use C++11 raw strings or concatenation instead.')
 
 
-threading_list = (
-    ('asctime(', 'asctime_r('),
-    ('ctime(', 'ctime_r('),
-    ('getgrgid(', 'getgrgid_r('),
-    ('getgrnam(', 'getgrnam_r('),
-    ('getlogin(', 'getlogin_r('),
-    ('getpwnam(', 'getpwnam_r('),
-    ('getpwuid(', 'getpwuid_r('),
-    ('gmtime(', 'gmtime_r('),
-    ('localtime(', 'localtime_r('),
-    ('rand(', 'rand_r('),
-    ('strtok(', 'strtok_r('),
-    ('ttyname(', 'ttyname_r('),
+# (non-threadsafe name, thread-safe alternative, validation pattern)
+#
+# The validation pattern is used to eliminate false positives such as:
+#  _rand();               // false positive due to substring match.
+#  ->rand();              // some member function rand().
+#  ACMRandom rand(seed);  // some variable named rand.
+#  ISAACRandom rand();    // another variable named rand.
+#
+# Basically we require the return value of these functions to be used
+# in some expression context on the same line by matching on some
+# operator before the function name.  This eliminates constructors and
+# member function calls.
+_UNSAFE_FUNC_PREFIX = r'(?:[-+*/=%^&|(<]\s*|>\s+)'
+_THREADING_LIST = (
+    ('asctime(', 'asctime_r(', _UNSAFE_FUNC_PREFIX + r'asctime\([^)]+\)'),
+    ('ctime(', 'ctime_r(', _UNSAFE_FUNC_PREFIX + r'ctime\([^)]+\)'),
+    ('getgrgid(', 'getgrgid_r(', _UNSAFE_FUNC_PREFIX + r'getgrgid\([^)]+\)'),
+    ('getgrnam(', 'getgrnam_r(', _UNSAFE_FUNC_PREFIX + r'getgrnam\([^)]+\)'),
+    ('getlogin(', 'getlogin_r(', _UNSAFE_FUNC_PREFIX + r'getlogin\(\)'),
+    ('getpwnam(', 'getpwnam_r(', _UNSAFE_FUNC_PREFIX + r'getpwnam\([^)]+\)'),
+    ('getpwuid(', 'getpwuid_r(', _UNSAFE_FUNC_PREFIX + r'getpwuid\([^)]+\)'),
+    ('gmtime(', 'gmtime_r(', _UNSAFE_FUNC_PREFIX + r'gmtime\([^)]+\)'),
+    ('localtime(', 'localtime_r(', _UNSAFE_FUNC_PREFIX + r'localtime\([^)]+\)'),
+    ('rand(', 'rand_r(', _UNSAFE_FUNC_PREFIX + r'rand\(\)'),
+    ('strtok(', 'strtok_r(',
+     _UNSAFE_FUNC_PREFIX + r'strtok\([^)]+\)'),
+    ('ttyname(', 'ttyname_r(', _UNSAFE_FUNC_PREFIX + r'ttyname\([^)]+\)'),
     )
 
 
@@ -1583,14 +2132,13 @@ def CheckPosixThreading(filename, clean_lines, linenum, error):
     error: The function to call with any errors found.
   """
   line = clean_lines.elided[linenum]
-  for single_thread_function, multithread_safe_function in threading_list:
-    ix = line.find(single_thread_function)
-    # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison
-    if ix >= 0 and (ix == 0 or (not line[ix - 1].isalnum() and
-                                line[ix - 1] not in ('_', '.', '>'))):
+  for single_thread_func, multithread_safe_func, pattern in _THREADING_LIST:
+    # Additional pattern matching check to confirm that this is the
+    # function we are looking for
+    if Search(pattern, line):
       error(filename, linenum, 'runtime/threadsafe_fn', 2,
-            'Consider using ' + multithread_safe_function +
-            '...) instead of ' + single_thread_function +
+            'Consider using ' + multithread_safe_func +
+            '...) instead of ' + single_thread_func +
             '...) for improved thread safety.')
 
 
@@ -1612,7 +2160,6 @@ def CheckVlogArguments(filename, clean_lines, linenum, error):
           'VLOG() should be used with numeric verbosity level.  '
           'Use LOG() if you want symbolic severity levels.')
 
-
 # Matches invalid increment: *count++, which moves pointer instead of
 # incrementing a value.
 _RE_PATTERN_INVALID_INCREMENT = re.compile(
@@ -1641,13 +2188,29 @@ def CheckInvalidIncrement(filename, clean_lines, linenum, error):
           'Changing pointer instead of value (or unused value of operator*).')
 
 
+def IsMacroDefinition(clean_lines, linenum):
+  if Search(r'^#define', clean_lines[linenum]):
+    return True
+
+  if linenum > 0 and Search(r'\\$', clean_lines[linenum - 1]):
+    return True
+
+  return False
+
+
+def IsForwardClassDeclaration(clean_lines, linenum):
+  return Match(r'^\s*(\btemplate\b)*.*class\s+\w+;\s*$', clean_lines[linenum])
+
+
 class _BlockInfo(object):
   """Stores information about a generic block of code."""
 
-  def __init__(self, seen_open_brace):
+  def __init__(self, linenum, seen_open_brace):
+    self.starting_linenum = linenum
     self.seen_open_brace = seen_open_brace
     self.open_parentheses = 0
     self.inline_asm = _NO_ASM
+    self.check_namespace_indentation = False
 
   def CheckBegin(self, filename, clean_lines, linenum, error):
     """Run checks that applies to text up to the opening brace.
@@ -1677,15 +2240,33 @@ def CheckEnd(self, filename, clean_lines, linenum, error):
     """
     pass
 
+  def IsBlockInfo(self):
+    """Returns true if this block is a _BlockInfo.
+
+    This is convenient for verifying that an object is an instance of
+    a _BlockInfo, but not an instance of any of the derived classes.
+
+    Returns:
+      True for this class, False for derived classes.
+    """
+    return self.__class__ == _BlockInfo
+
+
+class _ExternCInfo(_BlockInfo):
+  """Stores information about an 'extern "C"' block."""
+
+  def __init__(self, linenum):
+    _BlockInfo.__init__(self, linenum, True)
+
 
 class _ClassInfo(_BlockInfo):
   """Stores information about a class."""
 
   def __init__(self, name, class_or_struct, clean_lines, linenum):
-    _BlockInfo.__init__(self, False)
+    _BlockInfo.__init__(self, linenum, False)
     self.name = name
-    self.starting_linenum = linenum
     self.is_derived = False
+    self.check_namespace_indentation = True
     if class_or_struct == 'struct':
       self.access = 'public'
       self.is_struct = True
@@ -1695,11 +2276,7 @@ def __init__(self, name, class_or_struct, clean_lines, linenum):
 
     # Remember initial indentation level for this class.  Using raw_lines here
     # instead of elided to account for leading comments.
-    initial_indent = Match(r'^( *)\S', clean_lines.raw_lines[linenum])
-    if initial_indent:
-      self.class_indent = len(initial_indent.group(1))
-    else:
-      self.class_indent = 0
+    self.class_indent = GetIndentLevel(clean_lines.raw_lines[linenum])
 
     # Try to find the end of the class.  This will be confused by things like:
     #   class A {
@@ -1721,6 +2298,23 @@ def CheckBegin(self, filename, clean_lines, linenum, error):
       self.is_derived = True
 
   def CheckEnd(self, filename, clean_lines, linenum, error):
+    # If there is a DISALLOW macro, it should appear near the end of
+    # the class.
+    seen_last_thing_in_class = False
+    for i in xrange(linenum - 1, self.starting_linenum, -1):
+      match = Search(
+          r'\b(DISALLOW_COPY_AND_ASSIGN|DISALLOW_IMPLICIT_CONSTRUCTORS)\(' +
+          self.name + r'\)',
+          clean_lines.elided[i])
+      if match:
+        if seen_last_thing_in_class:
+          error(filename, i, 'readability/constructors', 3,
+                match.group(1) + ' should be the last thing in the class')
+        break
+
+      if not Match(r'^\s*$', clean_lines.elided[i]):
+        seen_last_thing_in_class = True
+
     # Check that closing brace is aligned with beginning of the class.
     # Only do this if the closing brace is indented by only whitespaces.
     # This means we will not check single-line class definitions.
@@ -1738,9 +2332,9 @@ class _NamespaceInfo(_BlockInfo):
   """Stores information about a namespace."""
 
   def __init__(self, name, linenum):
-    _BlockInfo.__init__(self, False)
+    _BlockInfo.__init__(self, linenum, False)
     self.name = name or ''
-    self.starting_linenum = linenum
+    self.check_namespace_indentation = True
 
   def CheckEnd(self, filename, clean_lines, linenum, error):
     """Check end of namespace comments."""
@@ -1758,7 +2352,7 @@ def CheckEnd(self, filename, clean_lines, linenum, error):
     # deciding what these nontrivial things are, so this check is
     # triggered by namespace size only, which works most of the time.
     if (linenum - self.starting_linenum < 10
-        and not Match(r'};*\s*(//|/\*).*\bnamespace\b', line)):
+        and not Match(r'^\s*};*\s*(//|/\*).*\bnamespace\b', line)):
       return
 
     # Look for matching comment at end of namespace.
@@ -1775,17 +2369,24 @@ def CheckEnd(self, filename, clean_lines, linenum, error):
     # expected namespace.
     if self.name:
       # Named namespace
-      if not Match((r'};*\s*(//|/\*).*\bnamespace\s+' + re.escape(self.name) +
-                    r'[\*/\.\\\s]*$'),
+      if not Match((r'^\s*};*\s*(//|/\*).*\bnamespace\s+' +
+                    re.escape(self.name) + r'[\*/\.\\\s]*$'),
                    line):
         error(filename, linenum, 'readability/namespace', 5,
               'Namespace should be terminated with "// namespace %s"' %
               self.name)
     else:
       # Anonymous namespace
-      if not Match(r'};*\s*(//|/\*).*\bnamespace[\*/\.\\\s]*$', line):
-        error(filename, linenum, 'readability/namespace', 5,
-              'Namespace should be terminated with "// namespace"')
+      if not Match(r'^\s*};*\s*(//|/\*).*\bnamespace[\*/\.\\\s]*$', line):
+        # If "// namespace anonymous" or "// anonymous namespace (more text)",
+        # mention "// anonymous namespace" as an acceptable form
+        if Match(r'^\s*}.*\b(namespace anonymous|anonymous namespace)\b', line):
+          error(filename, linenum, 'readability/namespace', 5,
+                'Anonymous namespace should be terminated with "// namespace"'
+                ' or "// anonymous namespace"')
+        else:
+          error(filename, linenum, 'readability/namespace', 5,
+                'Anonymous namespace should be terminated with "// namespace"')
 
 
 class _PreprocessorInfo(object):
@@ -1802,7 +2403,7 @@ def __init__(self, stack_before_if):
     self.seen_else = False
 
 
-class _NestingState(object):
+class NestingState(object):
   """Holds states related to parsing braces."""
 
   def __init__(self):
@@ -1814,6 +2415,17 @@ def __init__(self):
     # - _BlockInfo: some other type of block.
     self.stack = []
 
+    # Top of the previous stack before each Update().
+    #
+    # Because the nesting_stack is updated at the end of each line, we
+    # had to do some convoluted checks to find out what is the current
+    # scope at the beginning of the line.  This check is simplified by
+    # saving the previous top of nesting stack.
+    #
+    # We could save the full stack, but we only need the top.  Copying
+    # the full nesting stack would slow down cpplint by ~10%.
+    self.previous_stack_top = []
+
     # Stack of _PreprocessorInfo objects.
     self.pp_stack = []
 
@@ -1834,6 +2446,82 @@ def InNamespaceBody(self):
     """
     return self.stack and isinstance(self.stack[-1], _NamespaceInfo)
 
+  def InExternC(self):
+    """Check if we are currently one level inside an 'extern "C"' block.
+
+    Returns:
+      True if top of the stack is an extern block, False otherwise.
+    """
+    return self.stack and isinstance(self.stack[-1], _ExternCInfo)
+
+  def InClassDeclaration(self):
+    """Check if we are currently one level inside a class or struct declaration.
+
+    Returns:
+      True if top of the stack is a class/struct, False otherwise.
+    """
+    return self.stack and isinstance(self.stack[-1], _ClassInfo)
+
+  def InAsmBlock(self):
+    """Check if we are currently one level inside an inline ASM block.
+
+    Returns:
+      True if the top of the stack is a block containing inline ASM.
+    """
+    return self.stack and self.stack[-1].inline_asm != _NO_ASM
+
+  def InTemplateArgumentList(self, clean_lines, linenum, pos):
+    """Check if current position is inside template argument list.
+
+    Args:
+      clean_lines: A CleansedLines instance containing the file.
+      linenum: The number of the line to check.
+      pos: position just after the suspected template argument.
+    Returns:
+      True if (linenum, pos) is inside template arguments.
+    """
+    while linenum < clean_lines.NumLines():
+      # Find the earliest character that might indicate a template argument
+      line = clean_lines.elided[linenum]
+      match = Match(r'^[^{};=\[\]\.<>]*(.)', line[pos:])
+      if not match:
+        linenum += 1
+        pos = 0
+        continue
+      token = match.group(1)
+      pos += len(match.group(0))
+
+      # These things do not look like template argument list:
+      #   class Suspect {
+      #   class Suspect x; }
+      if token in ('{', '}', ';'): return False
+
+      # These things look like template argument list:
+      #   template <class Suspect>
+      #   template <class Suspect = default_value>
+      #   template <class Suspect[]>
+      #   template <class Suspect...>
+      if token in ('>', '=', '[', ']', '.'): return True
+
+      # Check if token is an unmatched '<'.
+      # If not, move on to the next character.
+      if token != '<':
+        pos += 1
+        if pos >= len(line):
+          linenum += 1
+          pos = 0
+        continue
+
+      # We can't be sure if we just find a single '<', and need to
+      # find the matching '>'.
+      (_, end_line, end_pos) = CloseExpression(clean_lines, linenum, pos - 1)
+      if end_pos < 0:
+        # Not sure if template argument list or syntax error in file
+        return False
+      linenum = end_line
+      pos = end_pos
+    return False
+
   def UpdatePreprocessor(self, line):
     """Update preprocessor stack.
 
@@ -1890,6 +2578,7 @@ def UpdatePreprocessor(self, line):
         # TODO(unknown): unexpected #endif, issue warning?
         pass
 
+  # TODO(unknown): Update() is too long, but we will refactor later.
   def Update(self, filename, clean_lines, linenum, error):
     """Update nesting state with current line.
 
@@ -1901,7 +2590,17 @@ def Update(self, filename, clean_lines, linenum, error):
     """
     line = clean_lines.elided[linenum]
 
-    # Update pp_stack first
+    # Remember top of the previous nesting stack.
+    #
+    # The stack is always pushed/popped and not modified in place, so
+    # we can just do a shallow copy instead of copy.deepcopy.  Using
+    # deepcopy would slow down cpplint by ~28%.
+    if self.stack:
+      self.previous_stack_top = self.stack[-1]
+    else:
+      self.previous_stack_top = None
+
+    # Update pp_stack
     self.UpdatePreprocessor(line)
 
     # Count parentheses.  This is to avoid adding struct arguments to
@@ -1952,32 +2651,27 @@ def Update(self, filename, clean_lines, linenum, error):
     # such as in:
     #   class LOCKABLE API Object {
     #   };
-    #
-    # Templates with class arguments may confuse the parser, for example:
-    #   template <class T
-    #             class Comparator = less<T>,
-    #             class Vector = vector<T> >
-    #   class HeapQueue {
-    #
-    # Because this parser has no nesting state about templates, by the
-    # time it saw "class Comparator", it may think that it's a new class.
-    # Nested templates have a similar problem:
-    #   template <
-    #       typename ExportedType,
-    #       typename TupleType,
-    #       template <typename, typename> class ImplTemplate>
-    #
-    # To avoid these cases, we ignore classes that are followed by '=' or '>'
     class_decl_match = Match(
-        r'\s*(template\s*<[\w\s<>,:]*>\s*)?'
-        r'(class|struct)\s+([A-Z_]+\s+)*(\w+(?:::\w+)*)'
-        r'(([^=>]|<[^<>]*>|<[^<>]*<[^<>]*>\s*>)*)$', line)
+        r'^(\s*(?:template\s*<[\w\s<>,:]*>\s*)?'
+        r'(class|struct)\s+(?:[A-Z_]+\s+)*(\w+(?:::\w+)*))'
+        r'(.*)$', line)
     if (class_decl_match and
         (not self.stack or self.stack[-1].open_parentheses == 0)):
-      self.stack.append(_ClassInfo(
-          class_decl_match.group(4), class_decl_match.group(2),
-          clean_lines, linenum))
-      line = class_decl_match.group(5)
+      # We do not want to accept classes that are actually template arguments:
+      #   template <class Ignore1,
+      #             class Ignore2 = Default<Args>,
+      #             template <Args> class Ignore3>
+      #   void Function() {};
+      #
+      # To avoid template argument cases, we scan forward and look for
+      # an unmatched '>'.  If we see one, assume we are inside a
+      # template argument list.
+      end_declaration = len(class_decl_match.group(1))
+      if not self.InTemplateArgumentList(clean_lines, linenum, end_declaration):
+        self.stack.append(_ClassInfo(
+            class_decl_match.group(3), class_decl_match.group(2),
+            clean_lines, linenum))
+        line = class_decl_match.group(4)
 
     # If we have not yet seen the opening brace for the innermost block,
     # run checks here.
@@ -2024,10 +2718,13 @@ def Update(self, filename, clean_lines, linenum, error):
         # stack otherwise.
         if not self.SeenOpenBrace():
           self.stack[-1].seen_open_brace = True
+        elif Match(r'^extern\s*"[^"]*"\s*\{', line):
+          self.stack.append(_ExternCInfo(linenum))
         else:
-          self.stack.append(_BlockInfo(True))
+          self.stack.append(_BlockInfo(linenum, True))
           if _MATCH_ASM.match(line):
             self.stack[-1].inline_asm = _BLOCK_ASM
+
       elif token == ';' or token == ')':
         # If we haven't seen an opening brace yet, but we already saw
         # a semicolon, this is probably a forward declaration.  Pop
@@ -2103,7 +2800,7 @@ def CheckForNonStandardConstructs(filename, clean_lines, linenum,
     filename: The name of the current file.
     clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
-    nesting_state: A _NestingState instance which maintains information about
+    nesting_state: A NestingState instance which maintains information about
                    the current stack of nested blocks being parsed.
     error: A callable to which errors are reported, which takes 4 arguments:
            filename, line number, error level, and message
@@ -2136,7 +2833,8 @@ def CheckForNonStandardConstructs(filename, clean_lines, linenum,
             r'\s+(register|static|extern|typedef)\b',
             line):
     error(filename, linenum, 'build/storage_class', 5,
-          'Storage class (static, extern, typedef, etc) should be first.')
+          'Storage-class specifier (static, extern, typedef, etc) should be '
+          'at the beginning of the declaration.')
 
   if Match(r'\s*#\s*endif\s*[^/\s]+', line):
     error(filename, linenum, 'build/endif_comment', 5,
@@ -2176,26 +2874,79 @@ def CheckForNonStandardConstructs(filename, clean_lines, linenum,
 
   # Look for single-argument constructors that aren't marked explicit.
   # Technically a valid construct, but against style.
-  args = Match(r'\s+(?:inline\s+)?%s\s*\(([^,()]+)\)'
-               % re.escape(base_classname),
-               line)
-  if (args and
-      args.group(1) != 'void' and
-      not Match(r'(const\s+)?%s(\s+const)?\s*(?:<\w+>\s*)?&'
-                % re.escape(base_classname), args.group(1).strip())):
-    error(filename, linenum, 'runtime/explicit', 5,
-          'Single-argument constructors should be marked explicit.')
-
-
-def CheckSpacingForFunctionCall(filename, line, linenum, error):
+  explicit_constructor_match = Match(
+      r'\s+(?:(?:inline|constexpr)\s+)*(explicit\s+)?'
+      r'(?:(?:inline|constexpr)\s+)*%s\s*'
+      r'\(((?:[^()]|\([^()]*\))*)\)'
+      % re.escape(base_classname),
+      line)
+
+  if explicit_constructor_match:
+    is_marked_explicit = explicit_constructor_match.group(1)
+
+    if not explicit_constructor_match.group(2):
+      constructor_args = []
+    else:
+      constructor_args = explicit_constructor_match.group(2).split(',')
+
+    # collapse arguments so that commas in template parameter lists and function
+    # argument parameter lists don't split arguments in two
+    i = 0
+    while i < len(constructor_args):
+      constructor_arg = constructor_args[i]
+      while (constructor_arg.count('<') > constructor_arg.count('>') or
+             constructor_arg.count('(') > constructor_arg.count(')')):
+        constructor_arg += ',' + constructor_args[i + 1]
+        del constructor_args[i + 1]
+      constructor_args[i] = constructor_arg
+      i += 1
+
+    defaulted_args = [arg for arg in constructor_args if '=' in arg]
+    noarg_constructor = (not constructor_args or  # empty arg list
+                         # 'void' arg specifier
+                         (len(constructor_args) == 1 and
+                          constructor_args[0].strip() == 'void'))
+    onearg_constructor = ((len(constructor_args) == 1 and  # exactly one arg
+                           not noarg_constructor) or
+                          # all but at most one arg defaulted
+                          (len(constructor_args) >= 1 and
+                           not noarg_constructor and
+                           len(defaulted_args) >= len(constructor_args) - 1))
+    initializer_list_constructor = bool(
+        onearg_constructor and
+        Search(r'\bstd\s*::\s*initializer_list\b', constructor_args[0]))
+    copy_constructor = bool(
+        onearg_constructor and
+        Match(r'(const\s+)?%s(\s*<[^>]*>)?(\s+const)?\s*(?:<\w+>\s*)?&'
+              % re.escape(base_classname), constructor_args[0].strip()))
+
+    if (not is_marked_explicit and
+        onearg_constructor and
+        not initializer_list_constructor and
+        not copy_constructor):
+      if defaulted_args:
+        error(filename, linenum, 'runtime/explicit', 5,
+              'Constructors callable with one argument '
+              'should be marked explicit.')
+      else:
+        error(filename, linenum, 'runtime/explicit', 5,
+              'Single-parameter constructors should be marked explicit.')
+    elif is_marked_explicit and not onearg_constructor:
+      if noarg_constructor:
+        error(filename, linenum, 'runtime/explicit', 5,
+              'Zero-parameter constructors should not be marked explicit.')
+
+
+def CheckSpacingForFunctionCall(filename, clean_lines, linenum, error):
   """Checks for the correctness of various spacing around function calls.
 
   Args:
     filename: The name of the current file.
-    line: The text of the line to check.
+    clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
     error: The function to call with any errors found.
   """
+  line = clean_lines.elided[linenum]
 
   # Since function calls often occur inside if/for/while/switch
   # expressions - which have their own, more liberal conventions - we
@@ -2238,10 +2989,18 @@ def CheckSpacingForFunctionCall(filename, line, linenum, error):
       error(filename, linenum, 'whitespace/parens', 2,
             'Extra space after (')
     if (Search(r'\w\s+\(', fncall) and
-        not Search(r'#\s*define|typedef', fncall) and
-        not Search(r'\w\s+\((\w+::)*\*\w+\)\(', fncall)):
-      error(filename, linenum, 'whitespace/parens', 4,
-            'Extra space before ( in function call')
+        not Search(r'_{0,2}asm_{0,2}\s+_{0,2}volatile_{0,2}\s+\(', fncall) and
+        not Search(r'#\s*define|typedef|using\s+\w+\s*=', fncall) and
+        not Search(r'\w\s+\((\w+::)*\*\w+\)\(', fncall) and
+        not Search(r'\bcase\s+\(', fncall)):
+      # TODO(unknown): Space after an operator function seem to be a common
+      # error, silence those for now by restricting them to highest verbosity.
+      if Search(r'\boperator_*\b', line):
+        error(filename, linenum, 'whitespace/parens', 0,
+              'Extra space before ( in function call')
+      else:
+        error(filename, linenum, 'whitespace/parens', 4,
+              'Extra space before ( in function call')
     # If the ) is followed only by a newline or a { + newline, assume it's
     # part of a control statement (if/while/etc), and don't complain
     if Search(r'[^)]\s+\)\s*[^{\s]', fncall):
@@ -2270,12 +3029,26 @@ def IsBlankLine(line):
   return not line or line.isspace()
 
 
+def CheckForNamespaceIndentation(filename, nesting_state, clean_lines, line,
+                                 error):
+  is_namespace_indent_item = (
+      len(nesting_state.stack) > 1 and
+      nesting_state.stack[-1].check_namespace_indentation and
+      isinstance(nesting_state.previous_stack_top, _NamespaceInfo) and
+      nesting_state.previous_stack_top == nesting_state.stack[-2])
+
+  if ShouldCheckNamespaceIndentation(nesting_state, is_namespace_indent_item,
+                                     clean_lines.elided, line):
+    CheckItemIndentationInNamespace(filename, clean_lines.elided,
+                                    line, error)
+
+
 def CheckForFunctionLengths(filename, clean_lines, linenum,
                             function_state, error):
   """Reports for long function bodies.
 
   For an overview why this is done, see:
-  http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Write_Short_Functions
+  https://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Write_Short_Functions
 
   Uses a simplistic algorithm assuming other style guidelines
   (especially spacing) are followed.
@@ -2295,8 +3068,6 @@ def CheckForFunctionLengths(filename, clean_lines, linenum,
   """
   lines = clean_lines.lines
   line = lines[linenum]
-  raw = clean_lines.raw_lines
-  raw_line = raw[linenum]
   joined_line = ''
 
   starting_func = False
@@ -2343,190 +3114,58 @@ def CheckForFunctionLengths(filename, clean_lines, linenum,
 _RE_PATTERN_TODO = re.compile(r'^//(\s*)TODO(\(.+?\))?:?(\s|$)?')
 
 
-def CheckComment(comment, filename, linenum, error):
-  """Checks for common mistakes in TODO comments.
+def CheckComment(line, filename, linenum, next_line_start, error):
+  """Checks for common mistakes in comments.
 
   Args:
-    comment: The text of the comment from the line in question.
+    line: The line in question.
     filename: The name of the current file.
     linenum: The number of the line to check.
+    next_line_start: The first non-whitespace column of the next line.
     error: The function to call with any errors found.
   """
-  match = _RE_PATTERN_TODO.match(comment)
-  if match:
-    # One whitespace is correct; zero whitespace is handled elsewhere.
-    leading_whitespace = match.group(1)
-    if len(leading_whitespace) > 1:
-      error(filename, linenum, 'whitespace/todo', 2,
-            'Too many spaces before TODO')
-
-    username = match.group(2)
-    if not username:
-      error(filename, linenum, 'readability/todo', 2,
-            'Missing username in TODO; it should look like '
-            '"// TODO(my_username): Stuff."')
-
-    middle_whitespace = match.group(3)
-    # Comparisons made explicit for correctness -- pylint: disable=g-explicit-bool-comparison
-    if middle_whitespace != ' ' and middle_whitespace != '':
-      error(filename, linenum, 'whitespace/todo', 2,
-            'TODO(my_username) should be followed by a space')
-
-def CheckAccess(filename, clean_lines, linenum, nesting_state, error):
-  """Checks for improper use of DISALLOW* macros.
+  commentpos = line.find('//')
+  if commentpos != -1:
+    # Check if the // may be in quotes.  If so, ignore it
+    if re.sub(r'\\.', '', line[0:commentpos]).count('"') % 2 == 0:
+      # Allow one space for new scopes, two spaces otherwise:
+      if (not (Match(r'^.*{ *//', line) and next_line_start == commentpos) and
+          ((commentpos >= 1 and
+            line[commentpos-1] not in string.whitespace) or
+           (commentpos >= 2 and
+            line[commentpos-2] not in string.whitespace))):
+        error(filename, linenum, 'whitespace/comments', 2,
+              'At least two spaces is best between code and comments')
 
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    nesting_state: A _NestingState instance which maintains information about
-                   the current stack of nested blocks being parsed.
-    error: The function to call with any errors found.
-  """
-  line = clean_lines.elided[linenum]  # get rid of comments and strings
-
-  matched = Match((r'\s*(DISALLOW_COPY_AND_ASSIGN|'
-                   r'DISALLOW_EVIL_CONSTRUCTORS|'
-                   r'DISALLOW_IMPLICIT_CONSTRUCTORS)'), line)
-  if not matched:
-    return
-  if nesting_state.stack and isinstance(nesting_state.stack[-1], _ClassInfo):
-    if nesting_state.stack[-1].access != 'private':
-      error(filename, linenum, 'readability/constructors', 3,
-            '%s must be in the private: section' % matched.group(1))
-
-  else:
-    # Found DISALLOW* macro outside a class declaration, or perhaps it
-    # was used inside a function when it should have been part of the
-    # class declaration.  We could issue a warning here, but it
-    # probably resulted in a compiler error already.
-    pass
-
-
-def FindNextMatchingAngleBracket(clean_lines, linenum, init_suffix):
-  """Find the corresponding > to close a template.
-
-  Args:
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: Current line number.
-    init_suffix: Remainder of the current line after the initial <.
-
-  Returns:
-    True if a matching bracket exists.
-  """
-  line = init_suffix
-  nesting_stack = ['<']
-  while True:
-    # Find the next operator that can tell us whether < is used as an
-    # opening bracket or as a less-than operator.  We only want to
-    # warn on the latter case.
-    #
-    # We could also check all other operators and terminate the search
-    # early, e.g. if we got something like this "a<b+c", the "<" is
-    # most likely a less-than operator, but then we will get false
-    # positives for default arguments and other template expressions.
-    match = Search(r'^[^<>(),;\[\]]*([<>(),;\[\]])(.*)$', line)
-    if match:
-      # Found an operator, update nesting stack
-      operator = match.group(1)
-      line = match.group(2)
-
-      if nesting_stack[-1] == '<':
-        # Expecting closing angle bracket
-        if operator in ('<', '(', '['):
-          nesting_stack.append(operator)
-        elif operator == '>':
-          nesting_stack.pop()
-          if not nesting_stack:
-            # Found matching angle bracket
-            return True
-        elif operator == ',':
-          # Got a comma after a bracket, this is most likely a template
-          # argument.  We have not seen a closing angle bracket yet, but
-          # it's probably a few lines later if we look for it, so just
-          # return early here.
-          return True
-        else:
-          # Got some other operator.
-          return False
-
-      else:
-        # Expecting closing parenthesis or closing bracket
-        if operator in ('<', '(', '['):
-          nesting_stack.append(operator)
-        elif operator in (')', ']'):
-          # We don't bother checking for matching () or [].  If we got
-          # something like (] or [), it would have been a syntax error.
-          nesting_stack.pop()
-
-    else:
-      # Scan the next line
-      linenum += 1
-      if linenum >= len(clean_lines.elided):
-        break
-      line = clean_lines.elided[linenum]
-
-  # Exhausted all remaining lines and still no matching angle bracket.
-  # Most likely the input was incomplete, otherwise we should have
-  # seen a semicolon and returned early.
-  return True
-
-
-def FindPreviousMatchingAngleBracket(clean_lines, linenum, init_prefix):
-  """Find the corresponding < that started a template.
-
-  Args:
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: Current line number.
-    init_prefix: Part of the current line before the initial >.
-
-  Returns:
-    True if a matching bracket exists.
-  """
-  line = init_prefix
-  nesting_stack = ['>']
-  while True:
-    # Find the previous operator
-    match = Search(r'^(.*)([<>(),;\[\]])[^<>(),;\[\]]*$', line)
-    if match:
-      # Found an operator, update nesting stack
-      operator = match.group(2)
-      line = match.group(1)
-
-      if nesting_stack[-1] == '>':
-        # Expecting opening angle bracket
-        if operator in ('>', ')', ']'):
-          nesting_stack.append(operator)
-        elif operator == '<':
-          nesting_stack.pop()
-          if not nesting_stack:
-            # Found matching angle bracket
-            return True
-        elif operator == ',':
-          # Got a comma before a bracket, this is most likely a
-          # template argument.  The opening angle bracket is probably
-          # there if we look for it, so just return early here.
-          return True
-        else:
-          # Got some other operator.
-          return False
-
-      else:
-        # Expecting opening parenthesis or opening bracket
-        if operator in ('>', ')', ']'):
-          nesting_stack.append(operator)
-        elif operator in ('(', '['):
-          nesting_stack.pop()
-
-    else:
-      # Scan the previous line
-      linenum -= 1
-      if linenum < 0:
-        break
-      line = clean_lines.elided[linenum]
-
-  # Exhausted all earlier lines and still no matching angle bracket.
-  return False
+      # Checks for common mistakes in TODO comments.
+      comment = line[commentpos:]
+      match = _RE_PATTERN_TODO.match(comment)
+      if match:
+        # One whitespace is correct; zero whitespace is handled elsewhere.
+        leading_whitespace = match.group(1)
+        if len(leading_whitespace) > 1:
+          error(filename, linenum, 'whitespace/todo', 2,
+                'Too many spaces before TODO')
+
+        username = match.group(2)
+        if not username:
+          error(filename, linenum, 'readability/todo', 2,
+                'Missing username in TODO; it should look like '
+                '"// TODO(my_username): Stuff."')
+
+        middle_whitespace = match.group(3)
+        # Comparisons made explicit for correctness -- pylint: disable=g-explicit-bool-comparison
+        if middle_whitespace != ' ' and middle_whitespace != '':
+          error(filename, linenum, 'whitespace/todo', 2,
+                'TODO(my_username) should be followed by a space')
+
+      # If the comment contains an alphanumeric character, there
+      # should be a space somewhere between it and the // unless
+      # it's a /// or //! Doxygen comment.
+      if (Match(r'//[^ ]*\w', comment) and
+          not Match(r'(///|//\!)(\s+|$)', comment)):
+        error(filename, linenum, 'whitespace/comments', 4,
+              'Should have a space between // and comment')
 
 
 def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
@@ -2542,7 +3181,7 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
     filename: The name of the current file.
     clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
-    nesting_state: A _NestingState instance which maintains information about
+    nesting_state: A NestingState instance which maintains information about
                    the current stack of nested blocks being parsed.
     error: The function to call with any errors found.
   """
@@ -2565,7 +3204,12 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
   #   }
   #
   # A warning about missing end of namespace comments will be issued instead.
-  if IsBlankLine(line) and not nesting_state.InNamespaceBody():
+  #
+  # Also skip blank line checks for 'extern "C"' blocks, which are formatted
+  # like namespaces.
+  if (IsBlankLine(line) and
+      not nesting_state.InNamespaceBody() and
+      not nesting_state.InExternC()):
     elided = clean_lines.elided
     prev_line = elided[linenum - 1]
     prevbrace = prev_line.rfind('{')
@@ -2628,54 +3272,64 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
       error(filename, linenum, 'whitespace/blank_line', 3,
             'Do not leave a blank line after "%s:"' % matched.group(1))
 
-  # Next, we complain if there's a comment too near the text
-  commentpos = line.find('//')
-  if commentpos != -1:
-    # Check if the // may be in quotes.  If so, ignore it
-    # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison
-    if (line.count('"', 0, commentpos) -
-        line.count('\\"', 0, commentpos)) % 2 == 0:   # not in quotes
-      # Allow one space for new scopes, two spaces otherwise:
-      if (not Match(r'^\s*{ //', line) and
-          ((commentpos >= 1 and
-            line[commentpos-1] not in string.whitespace) or
-           (commentpos >= 2 and
-            line[commentpos-2] not in string.whitespace))):
-        error(filename, linenum, 'whitespace/comments', 2,
-              'At least two spaces is best between code and comments')
-      # There should always be a space between the // and the comment
-      commentend = commentpos + 2
-      if commentend < len(line) and not line[commentend] == ' ':
-        # but some lines are exceptions -- e.g. if they're big
-        # comment delimiters like:
-        # //----------------------------------------------------------
-        # or are an empty C++ style Doxygen comment, like:
-        # ///
-        # or C++ style Doxygen comments placed after the variable:
-        # ///<  Header comment
-        # //!<  Header comment
-        # or they begin with multiple slashes followed by a space:
-        # //////// Header comment
-        match = (Search(r'[=/-]{4,}\s*$', line[commentend:]) or
-                 Search(r'^/$', line[commentend:]) or
-                 Search(r'^!< ', line[commentend:]) or
-                 Search(r'^/< ', line[commentend:]) or
-                 Search(r'^/+ ', line[commentend:]))
-        if not match:
-          error(filename, linenum, 'whitespace/comments', 4,
-                'Should have a space between // and comment')
-      CheckComment(line[commentpos:], filename, linenum, error)
-
-  line = clean_lines.elided[linenum]  # get rid of comments and strings
-
-  # Don't try to do spacing checks for operator methods
-  line = re.sub(r'operator(==|!=|<|<<|<=|>=|>>|>)\(', 'operator\(', line)
+  # Next, check comments
+  next_line_start = 0
+  if linenum + 1 < clean_lines.NumLines():
+    next_line = raw[linenum + 1]
+    next_line_start = len(next_line) - len(next_line.lstrip())
+  CheckComment(line, filename, linenum, next_line_start, error)
+
+  # get rid of comments and strings
+  line = clean_lines.elided[linenum]
+
+  # You shouldn't have spaces before your brackets, except maybe after
+  # 'delete []', 'return []() {};', or 'auto [abc, ...] = ...;'.
+  if Search(r'\w\s+\[', line) and not Search(r'(?:auto&?|delete|return)\s+\[', line):
+    error(filename, linenum, 'whitespace/braces', 5,
+          'Extra space before [')
+
+  # In range-based for, we wanted spaces before and after the colon, but
+  # not around "::" tokens that might appear.
+  if (Search(r'for *\(.*[^:]:[^: ]', line) or
+      Search(r'for *\(.*[^: ]:[^:]', line)):
+    error(filename, linenum, 'whitespace/forcolon', 2,
+          'Missing space around colon in range-based for loop')
+
+
+def CheckOperatorSpacing(filename, clean_lines, linenum, error):
+  """Checks for horizontal spacing around operators.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+
+  # Don't try to do spacing checks for operator methods.  Do this by
+  # replacing the troublesome characters with something else,
+  # preserving column position for all other characters.
+  #
+  # The replacement is done repeatedly to avoid false positives from
+  # operators that call operators.
+  while True:
+    match = Match(r'^(.*\boperator\b)(\S+)(\s*\(.*)$', line)
+    if match:
+      line = match.group(1) + ('_' * len(match.group(2))) + match.group(3)
+    else:
+      break
 
   # We allow no-spaces around = within an if: "if ( (a=Foo()) == 0 )".
   # Otherwise not.  Note we only check for non-spaces on *both* sides;
   # sometimes people put non-spaces on one side when aligning ='s among
   # many lines (not that this is behavior that I approve of...)
-  if Search(r'[\w.]=[\w.]', line) and not Search(r'\b(if|while) ', line):
+  if ((Search(r'[\w.]=', line) or
+       Search(r'=[\w.]', line))
+      and not Search(r'\b(if|while|for) ', line)
+      # Operators taken from [lex.operators] in C++11 standard.
+      and not Search(r'(>=|<=|==|!=|&=|\^=|\|=|\+=|\*=|\/=|\%=)', line)
+      and not Search(r'operator=', line)):
     error(filename, linenum, 'whitespace/operators', 4,
           'Missing spaces around =')
 
@@ -2687,42 +3341,51 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
   #
   # Check <= and >= first to avoid false positives with < and >, then
   # check non-include lines for spacing around < and >.
-  match = Search(r'[^<>=!\s](==|!=|<=|>=)[^<>=!\s]', line)
+  #
+  # If the operator is followed by a comma, assume it's be used in a
+  # macro context and don't do any checks.  This avoids false
+  # positives.
+  #
+  # Note that && is not included here.  This is because there are too
+  # many false positives due to RValue references.
+  match = Search(r'[^<>=!\s](==|!=|<=|>=|\|\|)[^<>=!\s,;\)]', line)
   if match:
     error(filename, linenum, 'whitespace/operators', 3,
           'Missing spaces around %s' % match.group(1))
-  # We allow no-spaces around << when used like this: 10<<20, but
-  # not otherwise (particularly, not when used as streams)
-  # Also ignore using ns::operator<<;
-  match = Search(r'(operator|\S)(?:L|UL|ULL|l|ul|ull)?<<(\S)', line)
-  if (match and
-      not (match.group(1).isdigit() and match.group(2).isdigit()) and
-      not (match.group(1) == 'operator' and match.group(2) == ';')):
-    error(filename, linenum, 'whitespace/operators', 3,
-          'Missing spaces around <<')
   elif not Match(r'#.*include', line):
-    # Avoid false positives on ->
-    reduced_line = line.replace('->', '')
-
     # Look for < that is not surrounded by spaces.  This is only
     # triggered if both sides are missing spaces, even though
     # technically should should flag if at least one side is missing a
     # space.  This is done to avoid some false positives with shifts.
-    match = Search(r'[^\s<]<([^\s=<].*)', reduced_line)
-    if (match and
-        not FindNextMatchingAngleBracket(clean_lines, linenum, match.group(1))):
-      error(filename, linenum, 'whitespace/operators', 3,
-            'Missing spaces around <')
+    match = Match(r'^(.*[^\s<])<[^\s=<,]', line)
+    if match:
+      (_, _, end_pos) = CloseExpression(
+          clean_lines, linenum, len(match.group(1)))
+      if end_pos <= -1:
+        error(filename, linenum, 'whitespace/operators', 3,
+              'Missing spaces around <')
 
     # Look for > that is not surrounded by spaces.  Similar to the
     # above, we only trigger if both sides are missing spaces to avoid
     # false positives with shifts.
-    match = Search(r'^(.*[^\s>])>[^\s=>]', reduced_line)
-    if (match and
-        not FindPreviousMatchingAngleBracket(clean_lines, linenum,
-                                             match.group(1))):
-      error(filename, linenum, 'whitespace/operators', 3,
-            'Missing spaces around >')
+    match = Match(r'^(.*[^-\s>])>[^\s=>,]', line)
+    if match:
+      (_, _, start_pos) = ReverseCloseExpression(
+          clean_lines, linenum, len(match.group(1)))
+      if start_pos <= -1:
+        error(filename, linenum, 'whitespace/operators', 3,
+              'Missing spaces around >')
+
+  # We allow no-spaces around << when used like this: 10<<20, but
+  # not otherwise (particularly, not when used as streams)
+  #
+  # We also allow operators following an opening parenthesis, since
+  # those tend to be macros that deal with operators.
+  match = Search(r'(operator|[^\s(<])(?:L|UL|LL|ULL|l|ul|ll|ull)?<<([^\s,=<])', line)
+  if (match and not (match.group(1).isdigit() and match.group(2).isdigit()) and
+      not (match.group(1) == 'operator' and match.group(2) == ';')):
+    error(filename, linenum, 'whitespace/operators', 3,
+          'Missing spaces around <<')
 
   # We allow no-spaces around >> for almost anything.  This is because
   # C++11 allows ">>" to close nested templates, which accounts for
@@ -2747,7 +3410,19 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
     error(filename, linenum, 'whitespace/operators', 4,
           'Extra space for operator %s' % match.group(1))
 
-  # A pet peeve of mine: no spaces after an if, while, switch, or for
+
+def CheckParenthesisSpacing(filename, clean_lines, linenum, error):
+  """Checks for horizontal spacing around parentheses.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+
+  # No spaces after an if, while, switch, or for
   match = Search(r' (if\(|for\(|while\(|switch\()', line)
   if match:
     error(filename, linenum, 'whitespace/parens', 5,
@@ -2773,6 +3448,19 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
             'Should have zero or one spaces inside ( and ) in %s' %
             match.group(1))
 
+
+def CheckCommaSpacing(filename, clean_lines, linenum, error):
+  """Checks for horizontal spacing near commas and semicolons.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  raw = clean_lines.lines_without_raw_strings
+  line = clean_lines.elided[linenum]
+
   # You should always have a space after a comma (either as fn arg or operator)
   #
   # This does not apply when the non-space character following the
@@ -2783,7 +3471,8 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
   # verify that lines contain missing whitespaces, second pass on raw
   # lines to confirm that those missing whitespaces are not due to
   # elided comments.
-  if Search(r',[^,\s]', line) and Search(r',[^,\s]', raw[linenum]):
+  if (Search(r',[^,\s]', ReplaceAll(r'\boperator\s*,\s*\(', 'F(', line)) and
+      Search(r',[^,\s]', raw[linenum])):
     error(filename, linenum, 'whitespace/comma', 3,
           'Missing space after ,')
 
@@ -2795,14 +3484,91 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
     error(filename, linenum, 'whitespace/semicolon', 3,
           'Missing space after ;')
 
-  # Next we will look for issues with function calls.
-  CheckSpacingForFunctionCall(filename, line, linenum, error)
+
+def _IsType(clean_lines, nesting_state, expr):
+  """Check if expression looks like a type name, returns true if so.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    nesting_state: A NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    expr: The expression to check.
+  Returns:
+    True, if token looks like a type.
+  """
+  # Keep only the last token in the expression
+  last_word = Match(r'^.*(\b\S+)$', expr)
+  if last_word:
+    token = last_word.group(1)
+  else:
+    token = expr
+
+  # Match native types and stdint types
+  if _TYPES.match(token):
+    return True
+
+  # Try a bit harder to match templated types.  Walk up the nesting
+  # stack until we find something that resembles a typename
+  # declaration for what we are looking for.
+  typename_pattern = (r'\b(?:typename|class|struct)\s+' + re.escape(token) +
+                      r'\b')
+  block_index = len(nesting_state.stack) - 1
+  while block_index >= 0:
+    if isinstance(nesting_state.stack[block_index], _NamespaceInfo):
+      return False
+
+    # Found where the opening brace is.  We want to scan from this
+    # line up to the beginning of the function, minus a few lines.
+    #   template <typename Type1,  // stop scanning here
+    #             ...>
+    #   class C
+    #     : public ... {  // start scanning here
+    last_line = nesting_state.stack[block_index].starting_linenum
+
+    next_block_start = 0
+    if block_index > 0:
+      next_block_start = nesting_state.stack[block_index - 1].starting_linenum
+    first_line = last_line
+    while first_line >= next_block_start:
+      if clean_lines.elided[first_line].find('template') >= 0:
+        break
+      first_line -= 1
+    if first_line < next_block_start:
+      # Didn't find any "template" keyword before reaching the next block,
+      # there are probably no template things to check for this block
+      block_index -= 1
+      continue
+
+    # Look for typename in the specified range
+    for i in xrange(first_line, last_line + 1, 1):
+      if Search(typename_pattern, clean_lines.elided[i]):
+        return True
+    block_index -= 1
+
+  return False
+
+
+def CheckBracesSpacing(filename, clean_lines, linenum, nesting_state, error):
+  """Checks for horizontal spacing near commas.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    nesting_state: A NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
 
   # Except after an opening paren, or after another opening brace (in case of
   # an initializer list, for instance), you should have spaces before your
-  # braces. And since you should never have braces at the beginning of a line,
-  # this is an easy test.
-  match = Match(r'^(.*[^ ({]){', line)
+  # braces when they are delimiting blocks, classes, namespaces etc.
+  # And since you should never have braces at the beginning of a line,
+  # this is an easy test.  Except that braces used for initialization don't
+  # follow the same rule; we often don't want spaces before those.
+  match = Match(r'^(.*[^ ({>]){', line)
+
   if match:
     # Try a bit harder to check for brace initialization.  This
     # happens in one of the following forms:
@@ -2813,10 +3579,12 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
     #   LastArgument(..., type{});
     #   LOG(INFO) << type{} << " ...";
     #   map_of_type[{...}] = ...;
+    #   ternary = expr ? new type{} : nullptr;
+    #   OuterTemplate<InnerTemplateConstructor<Type>{}>
     #
     # We check for the character following the closing brace, and
     # silence the warning if it's one of those listed above, i.e.
-    # "{.;,)<]".
+    # "{.;,)<>]:".
     #
     # To account for nested initializer list, we allow any number of
     # closing braces up to "{;,)<".  We can't simply silence the
@@ -2830,6 +3598,7 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
     # There is a false negative with this approach if people inserted
     # spurious semicolons, e.g. "if (cond){};", but we will catch the
     # spurious semicolon with a separate check.
+    leading_text = match.group(1)
     (endline, endlinenum, endpos) = CloseExpression(
         clean_lines, linenum, len(match.group(1)))
     trailing_text = ''
@@ -2838,7 +3607,11 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
     for offset in xrange(endlinenum + 1,
                          min(endlinenum + 3, clean_lines.NumLines() - 1)):
       trailing_text += clean_lines.elided[offset]
-    if not Match(r'^[\s}]*[{.;,)<\]]', trailing_text):
+    # We also suppress warnings for `uint64_t{expression}` etc., as the style
+    # guide recommends brace initialization for integral types to avoid
+    # overflow/truncation.
+    if (not Match(r'^[\s}]*[{.;,)<>\]:]', trailing_text)
+        and not _IsType(clean_lines, nesting_state, leading_text)):
       error(filename, linenum, 'whitespace/braces', 5,
             'Missing space before {')
 
@@ -2847,12 +3620,6 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
     error(filename, linenum, 'whitespace/braces', 5,
           'Missing space before else')
 
-  # You shouldn't have spaces before your brackets, except maybe after
-  # 'delete []' or 'new char * []'.
-  if Search(r'\w\s+\[', line) and not Search(r'delete\s+\[', line):
-    error(filename, linenum, 'whitespace/braces', 5,
-          'Extra space before [')
-
   # You shouldn't have a space before a semicolon at the end of the line.
   # There's a special case for "for" since the style guide allows space before
   # the semicolon there.
@@ -2869,12 +3636,23 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
           'Extra space before last semicolon. If this should be an empty '
           'statement, use {} instead.')
 
-  # In range-based for, we wanted spaces before and after the colon, but
-  # not around "::" tokens that might appear.
-  if (Search('for *\(.*[^:]:[^: ]', line) or
-      Search('for *\(.*[^: ]:[^:]', line)):
-    error(filename, linenum, 'whitespace/forcolon', 2,
-          'Missing space around colon in range-based for loop')
+
+def IsDecltype(clean_lines, linenum, column):
+  """Check if the token ending on (linenum, column) is decltype().
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: the number of the line to check.
+    column: end column of the token to check.
+  Returns:
+    True if this token is decltype() expression, False otherwise.
+  """
+  (text, _, start_col) = ReverseCloseExpression(clean_lines, linenum, column)
+  if start_col < 0:
+    return False
+  if Search(r'\bdecltype\s*$', text[0:start_col]):
+    return True
+  return False
 
 
 def CheckSectionSpacing(filename, clean_lines, class_info, linenum, error):
@@ -2974,15 +3752,18 @@ def CheckBraces(filename, clean_lines, linenum, error):
     # used for brace initializers inside function calls.  We don't detect this
     # perfectly: we just don't complain if the last non-whitespace character on
     # the previous non-blank line is ',', ';', ':', '(', '{', or '}', or if the
-    # previous line starts a preprocessor block.
+    # previous line starts a preprocessor block. We also allow a brace on the
+    # following line if it is part of an array initialization and would not fit
+    # within the 80 character limit of the preceding line.
     prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0]
     if (not Search(r'[,;:}{(]\s*$', prevline) and
-        not Match(r'\s*#', prevline)):
+        not Match(r'\s*#', prevline) and
+        not (GetLineWidth(prevline) > _line_length - 2 and '[]' in prevline)):
       error(filename, linenum, 'whitespace/braces', 4,
             '{ should almost always be at the end of the previous line')
 
   # An else clause should be on the same line as the preceding closing brace.
-  if Match(r'\s*else\s*', line):
+  if Match(r'\s*else\b\s*(?:if\b|\{|$)', line):
     prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0]
     if Match(r'\s*}\s*$', prevline):
       error(filename, linenum, 'whitespace/newline', 4,
@@ -2990,19 +3771,20 @@ def CheckBraces(filename, clean_lines, linenum, error):
 
   # If braces come on one side of an else, they should be on both.
   # However, we have to worry about "else if" that spans multiple lines!
-  if Search(r'}\s*else[^{]*$', line) or Match(r'[^}]*else\s*{', line):
-    if Search(r'}\s*else if([^{]*)$', line):       # could be multi-line if
-      # find the ( after the if
-      pos = line.find('else if')
-      pos = line.find('(', pos)
-      if pos > 0:
-        (endline, _, endpos) = CloseExpression(clean_lines, linenum, pos)
-        if endline[endpos:].find('{') == -1:    # must be brace after if
-          error(filename, linenum, 'readability/braces', 5,
-                'If an else has a brace on one side, it should have it on both')
-    else:            # common case: else not followed by a multi-line if
-      error(filename, linenum, 'readability/braces', 5,
-            'If an else has a brace on one side, it should have it on both')
+  if Search(r'else if\s*\(', line):       # could be multi-line if
+    brace_on_left = bool(Search(r'}\s*else if\s*\(', line))
+    # find the ( after the if
+    pos = line.find('else if')
+    pos = line.find('(', pos)
+    if pos > 0:
+      (endline, _, endpos) = CloseExpression(clean_lines, linenum, pos)
+      brace_on_right = endline[endpos:].find('{') != -1
+      if brace_on_left != brace_on_right:    # must be brace after if
+        error(filename, linenum, 'readability/braces', 5,
+              'If an else has a brace on one side, it should have it on both')
+  elif Search(r'}\s*else[^{]*$', line) or Match(r'[^}]*else\s*{', line):
+    error(filename, linenum, 'readability/braces', 5,
+          'If an else has a brace on one side, it should have it on both')
 
   # Likewise, an else should never have the else clause on the same line
   if Search(r'\belse [^\s{]', line) and not Search(r'\belse if\b', line):
@@ -3014,11 +3796,79 @@ def CheckBraces(filename, clean_lines, linenum, error):
     error(filename, linenum, 'whitespace/newline', 4,
           'do/while clauses should not be on a single line')
 
+  # Check single-line if/else bodies. The style guide says 'curly braces are not
+  # required for single-line statements'. We additionally allow multi-line,
+  # single statements, but we reject anything with more than one semicolon in
+  # it. This means that the first semicolon after the if should be at the end of
+  # its line, and the line after that should have an indent level equal to or
+  # lower than the if. We also check for ambiguous if/else nesting without
+  # braces.
+  if_else_match = Search(r'\b(if\s*\(|else\b)', line)
+  if if_else_match and not Match(r'\s*#', line):
+    if_indent = GetIndentLevel(line)
+    endline, endlinenum, endpos = line, linenum, if_else_match.end()
+    if_match = Search(r'\bif\s*\(', line)
+    if if_match:
+      # This could be a multiline if condition, so find the end first.
+      pos = if_match.end() - 1
+      (endline, endlinenum, endpos) = CloseExpression(clean_lines, linenum, pos)
+    # Check for an opening brace, either directly after the if or on the next
+    # line. If found, this isn't a single-statement conditional.
+    if (not Match(r'\s*{', endline[endpos:])
+        and not (Match(r'\s*$', endline[endpos:])
+                 and endlinenum < (len(clean_lines.elided) - 1)
+                 and Match(r'\s*{', clean_lines.elided[endlinenum + 1]))):
+      while (endlinenum < len(clean_lines.elided)
+             and ';' not in clean_lines.elided[endlinenum][endpos:]):
+        endlinenum += 1
+        endpos = 0
+      if endlinenum < len(clean_lines.elided):
+        endline = clean_lines.elided[endlinenum]
+        # We allow a mix of whitespace and closing braces (e.g. for one-liner
+        # methods) and a single \ after the semicolon (for macros)
+        endpos = endline.find(';')
+        if not Match(r';[\s}]*(\\?)$', endline[endpos:]):
+          # Semicolon isn't the last character, there's something trailing.
+          # Output a warning if the semicolon is not contained inside
+          # a lambda expression.
+          if not Match(r'^[^{};]*\[[^\[\]]*\][^{}]*\{[^{}]*\}\s*\)*[;,]\s*$',
+                       endline):
+            error(filename, linenum, 'readability/braces', 4,
+                  'If/else bodies with multiple statements require braces')
+        elif endlinenum < len(clean_lines.elided) - 1:
+          # Make sure the next line is dedented
+          next_line = clean_lines.elided[endlinenum + 1]
+          next_indent = GetIndentLevel(next_line)
+          # With ambiguous nested if statements, this will error out on the
+          # if that *doesn't* match the else, regardless of whether it's the
+          # inner one or outer one.
+          if (if_match and Match(r'\s*else\b', next_line)
+              and next_indent != if_indent):
+            error(filename, linenum, 'readability/braces', 4,
+                  'Else clause should be indented at the same level as if. '
+                  'Ambiguous nested if/else chains require braces.')
+          elif next_indent > if_indent:
+            error(filename, linenum, 'readability/braces', 4,
+                  'If/else bodies with multiple statements require braces')
+
+
+def CheckTrailingSemicolon(filename, clean_lines, linenum, error):
+  """Looks for redundant trailing semicolon.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+
+  line = clean_lines.elided[linenum]
+
   # Block bodies should not be followed by a semicolon.  Due to C++11
   # brace initialization, there are more places where semicolons are
-  # required than not, so we use a whitelist approach to check these
-  # rather than a blacklist.  These are the places where "};" should
-  # be replaced by just "}":
+  # required than not, so we explicitly list the allowed rules rather
+  # than listing the disallowed ones.  These are the places where "};"
+  # should be replaced by just "}":
   # 1. Some flavor of block following closing parenthesis:
   #    for (;;) {};
   #    while (...) {};
@@ -3074,28 +3924,40 @@ def CheckBraces(filename, clean_lines, linenum, error):
     #  - INTERFACE_DEF
     #  - EXCLUSIVE_LOCKS_REQUIRED, SHARED_LOCKS_REQUIRED, LOCKS_EXCLUDED:
     #
-    # We implement a whitelist of safe macros instead of a blacklist of
+    # We implement a list of safe macros instead of a list of
     # unsafe macros, even though the latter appears less frequently in
     # google code and would have been easier to implement.  This is because
-    # the downside for getting the whitelist wrong means some extra
-    # semicolons, while the downside for getting the blacklist wrong
+    # the downside for getting the allowed checks wrong means some extra
+    # semicolons, while the downside for getting disallowed checks wrong
     # would result in compile errors.
     #
-    # In addition to macros, we also don't want to warn on compound
-    # literals.
+    # In addition to macros, we also don't want to warn on
+    #  - Compound literals
+    #  - Lambdas
+    #  - alignas specifier with anonymous structs
+    #  - decltype
     closing_brace_pos = match.group(1).rfind(')')
     opening_parenthesis = ReverseCloseExpression(
         clean_lines, linenum, closing_brace_pos)
     if opening_parenthesis[2] > -1:
       line_prefix = opening_parenthesis[0][0:opening_parenthesis[2]]
-      macro = Search(r'\b([A-Z_]+)\s*$', line_prefix)
+      macro = Search(r'\b([A-Z_][A-Z0-9_]*)\s*$', line_prefix)
+      func = Match(r'^(.*\])\s*$', line_prefix)
       if ((macro and
            macro.group(1) not in (
                'TEST', 'TEST_F', 'MATCHER', 'MATCHER_P', 'TYPED_TEST',
                'EXCLUSIVE_LOCKS_REQUIRED', 'SHARED_LOCKS_REQUIRED',
                'LOCKS_EXCLUDED', 'INTERFACE_DEF')) or
+          (func and not Search(r'\boperator\s*\[\s*\]', func.group(1))) or
+          Search(r'\b(?:struct|union)\s+alignas\s*$', line_prefix) or
+          Search(r'\bdecltype$', line_prefix) or
           Search(r'\s+=\s*$', line_prefix)):
         match = None
+    if (match and
+        opening_parenthesis[1] > 1 and
+        Search(r'\]\s*$', clean_lines.elided[opening_parenthesis[1] - 1])):
+      # Multi-line lambda-expression
+      match = None
 
   else:
     # Try matching cases 2-3.
@@ -3125,6 +3987,14 @@ def CheckBraces(filename, clean_lines, linenum, error):
       # outputting warnings for the matching closing brace, if there are
       # nested blocks with trailing semicolons, we will get the error
       # messages in reversed order.
+
+      # We need to check the line forward for NOLINT
+      raw_lines = clean_lines.raw_lines
+      ParseNolintSuppressions(filename, raw_lines[endlinenum-1], endlinenum-1,
+                              error)
+      ParseNolintSuppressions(filename, raw_lines[endlinenum], endlinenum,
+                              error)
+
       error(filename, endlinenum, 'readability/braces', 4,
             "You don't need a ; after a }")
 
@@ -3148,7 +4018,7 @@ def CheckEmptyBlockBody(filename, clean_lines, linenum, error):
   line = clean_lines.elided[linenum]
   matched = Match(r'\s*(for|while|if)\s*\(', line)
   if matched:
-    # Find the end of the conditional expression
+    # Find the end of the conditional expression.
     (end_line, end_linenum, end_pos) = CloseExpression(
         clean_lines, linenum, line.find('('))
 
@@ -3163,6 +4033,98 @@ def CheckEmptyBlockBody(filename, clean_lines, linenum, error):
         error(filename, end_linenum, 'whitespace/empty_loop_body', 5,
               'Empty loop bodies should use {} or continue')
 
+    # Check for if statements that have completely empty bodies (no comments)
+    # and no else clauses.
+    if end_pos >= 0 and matched.group(1) == 'if':
+      # Find the position of the opening { for the if statement.
+      # Return without logging an error if it has no brackets.
+      opening_linenum = end_linenum
+      opening_line_fragment = end_line[end_pos:]
+      # Loop until EOF or find anything that's not whitespace or opening {.
+      while not Search(r'^\s*\{', opening_line_fragment):
+        if Search(r'^(?!\s*$)', opening_line_fragment):
+          # Conditional has no brackets.
+          return
+        opening_linenum += 1
+        if opening_linenum == len(clean_lines.elided):
+          # Couldn't find conditional's opening { or any code before EOF.
+          return
+        opening_line_fragment = clean_lines.elided[opening_linenum]
+      # Set opening_line (opening_line_fragment may not be entire opening line).
+      opening_line = clean_lines.elided[opening_linenum]
+
+      # Find the position of the closing }.
+      opening_pos = opening_line_fragment.find('{')
+      if opening_linenum == end_linenum:
+        # We need to make opening_pos relative to the start of the entire line.
+        opening_pos += end_pos
+      (closing_line, closing_linenum, closing_pos) = CloseExpression(
+          clean_lines, opening_linenum, opening_pos)
+      if closing_pos < 0:
+        return
+
+      # Now construct the body of the conditional. This consists of the portion
+      # of the opening line after the {, all lines until the closing line,
+      # and the portion of the closing line before the }.
+      if (clean_lines.raw_lines[opening_linenum] !=
+          CleanseComments(clean_lines.raw_lines[opening_linenum])):
+        # Opening line ends with a comment, so conditional isn't empty.
+        return
+      if closing_linenum > opening_linenum:
+        # Opening line after the {. Ignore comments here since we checked above.
+        body = list(opening_line[opening_pos+1:])
+        # All lines until closing line, excluding closing line, with comments.
+        body.extend(clean_lines.raw_lines[opening_linenum+1:closing_linenum])
+        # Closing line before the }. Won't (and can't) have comments.
+        body.append(clean_lines.elided[closing_linenum][:closing_pos-1])
+        body = '\n'.join(body)
+      else:
+        # If statement has brackets and fits on a single line.
+        body = opening_line[opening_pos+1:closing_pos-1]
+
+      # Check if the body is empty
+      if not _EMPTY_CONDITIONAL_BODY_PATTERN.search(body):
+        return
+      # The body is empty. Now make sure there's not an else clause.
+      current_linenum = closing_linenum
+      current_line_fragment = closing_line[closing_pos:]
+      # Loop until EOF or find anything that's not whitespace or else clause.
+      while Search(r'^\s*$|^(?=\s*else)', current_line_fragment):
+        if Search(r'^(?=\s*else)', current_line_fragment):
+          # Found an else clause, so don't log an error.
+          return
+        current_linenum += 1
+        if current_linenum == len(clean_lines.elided):
+          break
+        current_line_fragment = clean_lines.elided[current_linenum]
+
+      # The body is empty and there's no else clause until EOF or other code.
+      error(filename, end_linenum, 'whitespace/empty_if_body', 4,
+            ('If statement had no body and no else clause'))
+
+
+def FindCheckMacro(line):
+  """Find a replaceable CHECK-like macro.
+
+  Args:
+    line: line to search on.
+  Returns:
+    (macro name, start position), or (None, -1) if no replaceable
+    macro is found.
+  """
+  for macro in _CHECK_MACROS:
+    i = line.find(macro)
+    if i >= 0:
+      # Find opening parenthesis.  Do a regular expression match here
+      # to make sure that we are matching the expected CHECK macro, as
+      # opposed to some other macro that happens to contain the CHECK
+      # substring.
+      matched = Match(r'^(.*\b' + macro + r'\s*)\(', line)
+      if not matched:
+        continue
+      return (macro, len(matched.group(1)))
+  return (None, -1)
+
 
 def CheckCheck(filename, clean_lines, linenum, error):
   """Checks the use of CHECK and EXPECT macros.
@@ -3176,24 +4138,8 @@ def CheckCheck(filename, clean_lines, linenum, error):
 
   # Decide the set of replacement macros that should be suggested
   lines = clean_lines.elided
-  check_macro = None
-  start_pos = -1
-  for macro in _CHECK_MACROS:
-    i = lines[linenum].find(macro)
-    if i >= 0:
-      check_macro = macro
-
-      # Find opening parenthesis.  Do a regular expression match here
-      # to make sure that we are matching the expected CHECK macro, as
-      # opposed to some other macro that happens to contain the CHECK
-      # substring.
-      matched = Match(r'^(.*\b' + check_macro + r'\s*)\(', lines[linenum])
-      if not matched:
-        continue
-      start_pos = len(matched.group(1))
-      break
-  if not check_macro or start_pos < 0:
-    # Don't waste time here if line doesn't contain 'CHECK' or 'EXPECT'
+  (check_macro, start_pos) = FindCheckMacro(lines[linenum])
+  if not check_macro:
     return
 
   # Find end of the boolean expression by matching parentheses
@@ -3201,6 +4147,13 @@ def CheckCheck(filename, clean_lines, linenum, error):
       clean_lines, linenum, start_pos)
   if end_pos < 0:
     return
+
+  # If the check macro is followed by something other than a
+  # semicolon, assume users will log their own custom error messages
+  # and don't suggest any replacements.
+  if not Match(r'\s*;', last_line[end_pos:]):
+    return
+
   if linenum == end_line:
     expression = lines[linenum][start_pos + 1:end_pos - 1]
   else:
@@ -3223,7 +4176,7 @@ def CheckCheck(filename, clean_lines, linenum, error):
       if token == '(':
         # Parenthesized operand
         expression = matched.group(2)
-        (end, _) = FindEndOfExpressionInLine(expression, 0, 1, '(', ')')
+        (end, _) = FindEndOfExpressionInLine(expression, 0, ['('])
         if end < 0:
           return  # Unmatched parenthesis
         lhs += '(' + expression[0:end]
@@ -3339,6 +4292,16 @@ def GetLineWidth(line):
       if unicodedata.east_asian_width(uc) in ('W', 'F'):
         width += 2
       elif not unicodedata.combining(uc):
+        # Issue 337
+        # https://mail.python.org/pipermail/python-list/2012-August/628809.html
+        if (sys.version_info.major, sys.version_info.minor) <= (3, 2):
+          # https://github.com/python/cpython/blob/2.7/Include/unicodeobject.h#L81
+          is_wide_build = sysconfig.get_config_var("Py_UNICODE_SIZE") >= 4
+          # https://github.com/python/cpython/blob/2.7/Objects/unicodeobject.c#L564
+          is_low_surrogate = 0xDC00 <= ord(uc) <= 0xDFFF
+          if not is_wide_build and is_low_surrogate:
+            width -= 1
+
         width += 1
     return width
   else:
@@ -3358,7 +4321,7 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state,
     clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
     file_extension: The extension (without the dot) of the filename.
-    nesting_state: A _NestingState instance which maintains information about
+    nesting_state: A NestingState instance which maintains information about
                    the current stack of nested blocks being parsed.
     error: The function to call with any errors found.
   """
@@ -3368,6 +4331,7 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state,
   # raw strings,
   raw_lines = clean_lines.lines_without_raw_strings
   line = raw_lines[linenum]
+  prev = raw_lines[linenum - 1] if linenum > 0 else ''
 
   if line.find('\t') != -1:
     error(filename, linenum, 'whitespace/tab', 1,
@@ -3385,23 +4349,33 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state,
   # if(match($0, " <<")) complain = 0;
   # if(match(prev, " +for \\(")) complain = 0;
   # if(prevodd && match(prevprev, " +for \\(")) complain = 0;
+  scope_or_label_pattern = r'\s*\w+\s*:\s*\\?$'
+  classinfo = nesting_state.InnermostClass()
   initial_spaces = 0
   cleansed_line = clean_lines.elided[linenum]
   while initial_spaces < len(line) and line[initial_spaces] == ' ':
     initial_spaces += 1
-  if line and line[-1].isspace():
-    error(filename, linenum, 'whitespace/end_of_line', 4,
-          'Line ends in whitespace.  Consider deleting these extra spaces.')
-  # There are certain situations we allow one space, notably for section labels
-  elif ((initial_spaces == 1 or initial_spaces == 3) and
-        not Match(r'\s*\w+\s*:\s*$', cleansed_line)):
+  # There are certain situations we allow one space, notably for
+  # section labels, and also lines containing multi-line raw strings.
+  # We also don't check for lines that look like continuation lines
+  # (of lines ending in double quotes, commas, equals, or angle brackets)
+  # because the rules for how to indent those are non-trivial.
+  if (not Search(r'[",=><] *$', prev) and
+      (initial_spaces == 1 or initial_spaces == 3) and
+      not Match(scope_or_label_pattern, cleansed_line) and
+      not (clean_lines.raw_lines[linenum] != line and
+           Match(r'^\s*""', line))):
     error(filename, linenum, 'whitespace/indent', 3,
           'Weird number of spaces at line-start.  '
           'Are you using a 2-space indent?')
 
+  if line and line[-1].isspace():
+    error(filename, linenum, 'whitespace/end_of_line', 4,
+          'Line ends in whitespace.  Consider deleting these extra spaces.')
+
   # Check if the line is a header guard.
   is_header_guard = False
-  if file_extension == 'h':
+  if IsHeaderExtension(file_extension):
     cppvar = GetHeaderGuardCPPVariable(filename)
     if (line.startswith('#ifndef %s' % cppvar) or
         line.startswith('#define %s' % cppvar) or
@@ -3417,14 +4391,10 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state,
   # developers fault.
   if (not line.startswith('#include') and not is_header_guard and
       not Match(r'^\s*//.*http(s?)://\S*$', line) and
+      not Match(r'^\s*//\s*[^\s]*$', line) and
       not Match(r'^// \$Id:.*#[0-9]+ \$$', line)):
     line_width = GetLineWidth(line)
-    extended_length = int((_line_length * 1.25))
-    if line_width > extended_length:
-      error(filename, linenum, 'whitespace/line_length', 4,
-            'Lines should very rarely be longer than %i characters' %
-            extended_length)
-    elif line_width > _line_length:
+    if line_width > _line_length:
       error(filename, linenum, 'whitespace/line_length', 2,
             'Lines should be <= %i characters long' % _line_length)
 
@@ -3442,9 +4412,14 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state,
 
   # Some more style checks
   CheckBraces(filename, clean_lines, linenum, error)
+  CheckTrailingSemicolon(filename, clean_lines, linenum, error)
   CheckEmptyBlockBody(filename, clean_lines, linenum, error)
-  CheckAccess(filename, clean_lines, linenum, nesting_state, error)
   CheckSpacing(filename, clean_lines, linenum, nesting_state, error)
+  CheckOperatorSpacing(filename, clean_lines, linenum, error)
+  CheckParenthesisSpacing(filename, clean_lines, linenum, error)
+  CheckCommaSpacing(filename, clean_lines, linenum, error)
+  CheckBracesSpacing(filename, clean_lines, linenum, nesting_state, error)
+  CheckSpacingForFunctionCall(filename, clean_lines, linenum, error)
   CheckCheck(filename, clean_lines, linenum, error)
   CheckAltTokens(filename, clean_lines, linenum, error)
   classinfo = nesting_state.InnermostClass()
@@ -3452,7 +4427,6 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state,
     CheckSectionSpacing(filename, clean_lines, classinfo, linenum, error)
 
 
-_RE_PATTERN_INCLUDE_NEW_STYLE = re.compile(r'#include +"[^/]+\.h"')
 _RE_PATTERN_INCLUDE = re.compile(r'^\s*#\s*include\s*([<"])([^>"]*)[>"].*$')
 # Matches the first component of a filename delimited by -s and _s. That is:
 #  _RE_FIRST_COMPONENT.match('foo').group(0) == 'foo'
@@ -3489,23 +4463,6 @@ def _DropCommonSuffixes(filename):
   return os.path.splitext(filename)[0]
 
 
-def _IsTestFilename(filename):
-  """Determines if the given filename has a suffix that identifies it as a test.
-
-  Args:
-    filename: The input filename.
-
-  Returns:
-    True if 'filename' looks like a test, False otherwise.
-  """
-  if (filename.endswith('_test.cc') or
-      filename.endswith('_unittest.cc') or
-      filename.endswith('_regtest.cc')):
-    return True
-  else:
-    return False
-
-
 def _ClassifyInclude(fileinfo, include, is_system):
   """Figures out what kind of header 'include' is.
 
@@ -3581,11 +4538,17 @@ def CheckIncludeLine(filename, clean_lines, linenum, include_state, error):
     error: The function to call with any errors found.
   """
   fileinfo = FileInfo(filename)
-
   line = clean_lines.lines[linenum]
 
   # "include" should use the new style "foo/bar.h" instead of just "bar.h"
-  if _RE_PATTERN_INCLUDE_NEW_STYLE.search(line):
+  # Only do this check if the included header follows google naming
+  # conventions.  If not, assume that it's a 3rd party API that
+  # requires special include conventions.
+  #
+  # We also make an exception for Lua headers, which follow google
+  # naming convention but not the include convention.
+  match = Match(r'#include\s*"([^/]+\.h)"', line)
+  if match and not _THIRD_PARTY_HEADERS_PATTERN.match(match.group(1)):
     error(filename, linenum, 'build/include', 4,
           'Include the directory when naming .h files')
 
@@ -3596,12 +4559,17 @@ def CheckIncludeLine(filename, clean_lines, linenum, include_state, error):
   if match:
     include = match.group(2)
     is_system = (match.group(1) == '<')
-    if include in include_state:
+    duplicate_line = include_state.FindHeader(include)
+    if duplicate_line >= 0:
       error(filename, linenum, 'build/include', 4,
             '"%s" already included at %s:%s' %
-            (include, filename, include_state[include]))
-    else:
-      include_state[include] = linenum
+            (include, filename, duplicate_line))
+    elif (include.endswith('.cc') and
+          os.path.dirname(fileinfo.RepositoryName()) != os.path.dirname(include)):
+      error(filename, linenum, 'build/include', 4,
+            'Do not include .cc files from other packages')
+    elif not _THIRD_PARTY_HEADERS_PATTERN.match(include):
+      include_state.include_list[-1].append((include, linenum))
 
       # We want to ensure that headers appear in the right order:
       # 1) for foo.cc, foo.h  (preferred location)
@@ -3627,15 +4595,6 @@ def CheckIncludeLine(filename, clean_lines, linenum, include_state, error):
               'Include "%s" not in alphabetical order' % include)
       include_state.SetLastHeader(canonical_include)
 
-  # Look for any of the stream classes that are part of standard C++.
-  match = _RE_PATTERN_INCLUDE.match(line)
-  if match:
-    include = match.group(2)
-    if Match(r'(f|ind|io|i|o|parse|pf|stdio|str|)?stream$', include):
-      # Many unit tests use cout, so we exempt them.
-      if not _IsTestFilename(filename):
-        error(filename, linenum, 'readability/streams', 3,
-              'Streams are highly discouraged.')
 
 
 def _GetTextInside(text, start_pattern):
@@ -3658,7 +4617,7 @@ def _GetTextInside(text, start_pattern):
     The extracted text.
     None if either the opening string or ending punctuation could not be found.
   """
-  # TODO(sugawarayu): Audit cpplint.py to see what places could be profitably
+  # TODO(unknown): Audit cpplint.py to see what places could be profitably
   # rewritten to use _GetTextInside (and use inferior regexp matching today).
 
   # Give opening punctuations to get the matching close-punctuations.
@@ -3718,6 +4677,9 @@ def _GetTextInside(text, start_pattern):
 _RE_PATTERN_CONST_REF_PARAM = (
     r'(?:.*\s*\bconst\s*&\s*' + _RE_PATTERN_IDENT +
     r'|const\s+' + _RE_PATTERN_TYPE + r'\s*&\s*' + _RE_PATTERN_IDENT + r')')
+# Stream types.
+_RE_PATTERN_REF_STREAM_PARAM = (
+    r'(?:.*stream\s*&\s*' + _RE_PATTERN_IDENT + r')')
 
 
 def CheckLanguage(filename, clean_lines, linenum, file_extension,
@@ -3733,7 +4695,7 @@ def CheckLanguage(filename, clean_lines, linenum, file_extension,
     linenum: The number of the line to check.
     file_extension: The extension (without the dot) of the filename.
     include_state: An _IncludeState instance in which the headers are inserted.
-    nesting_state: A _NestingState instance which maintains information about
+    nesting_state: A NestingState instance which maintains information about
                    the current stack of nested blocks being parsed.
     error: The function to call with any errors found.
   """
@@ -3750,129 +4712,23 @@ def CheckLanguage(filename, clean_lines, linenum, file_extension,
 
   # Reset include state across preprocessor directives.  This is meant
   # to silence warnings for conditional includes.
-  if Match(r'^\s*#\s*(?:ifdef|elif|else|endif)\b', line):
-    include_state.ResetSection()
+  match = Match(r'^\s*#\s*(if|ifdef|ifndef|elif|else|endif)\b', line)
+  if match:
+    include_state.ResetSection(match.group(1))
 
   # Make Windows paths like Unix.
   fullname = os.path.abspath(filename).replace('\\', '/')
 
-  # TODO(unknown): figure out if they're using default arguments in fn proto.
+  # Perform other checks now that we are sure that this is not an include line
+  CheckCasts(filename, clean_lines, linenum, error)
+  CheckGlobalStatic(filename, clean_lines, linenum, error)
+  CheckPrintf(filename, clean_lines, linenum, error)
 
-  # Check to see if they're using an conversion function cast.
-  # I just try to capture the most common basic types, though there are more.
-  # Parameterless conversion functions, such as bool(), are allowed as they are
-  # probably a member operator declaration or default constructor.
-  match = Search(
-      r'(\bnew\s+)?\b'  # Grab 'new' operator, if it's there
-      r'(int|float|double|bool|char|int32|uint32|int64|uint64)'
-      r'(\([^)].*)', line)
-  if match:
-    matched_new = match.group(1)
-    matched_type = match.group(2)
-    matched_funcptr = match.group(3)
-
-    # gMock methods are defined using some variant of MOCK_METHODx(name, type)
-    # where type may be float(), int(string), etc.  Without context they are
-    # virtually indistinguishable from int(x) casts. Likewise, gMock's
-    # MockCallback takes a template parameter of the form return_type(arg_type),
-    # which looks much like the cast we're trying to detect.
-    #
-    # std::function<> wrapper has a similar problem.
-    #
-    # Return types for function pointers also look like casts if they
-    # don't have an extra space.
-    if (matched_new is None and  # If new operator, then this isn't a cast
-        not (Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(', line) or
-             Search(r'\bMockCallback<.*>', line) or
-             Search(r'\bstd::function<.*>', line)) and
-        not (matched_funcptr and
-             Match(r'\((?:[^() ]+::\s*\*\s*)?[^() ]+\)\s*\(',
-                   matched_funcptr))):
-      # Try a bit harder to catch gmock lines: the only place where
-      # something looks like an old-style cast is where we declare the
-      # return type of the mocked method, and the only time when we
-      # are missing context is if MOCK_METHOD was split across
-      # multiple lines.  The missing MOCK_METHOD is usually one or two
-      # lines back, so scan back one or two lines.
-      #
-      # It's not possible for gmock macros to appear in the first 2
-      # lines, since the class head + section name takes up 2 lines.
-      if (linenum < 2 or
-          not (Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\((?:\S+,)?\s*$',
-                     clean_lines.elided[linenum - 1]) or
-               Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\(\s*$',
-                     clean_lines.elided[linenum - 2]))):
-        error(filename, linenum, 'readability/casting', 4,
-              'Using deprecated casting style.  '
-              'Use static_cast<%s>(...) instead' %
-              matched_type)
-
-  CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum],
-                  'static_cast',
-                  r'\((int|float|double|bool|char|u?int(16|32|64))\)', error)
-
-  # This doesn't catch all cases. Consider (const char * const)"hello".
-  #
-  # (char *) "foo" should always be a const_cast (reinterpret_cast won't
-  # compile).
-  if CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum],
-                     'const_cast', r'\((char\s?\*+\s?)\)\s*"', error):
-    pass
-  else:
-    # Check pointer casts for other than string constants
-    CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum],
-                    'reinterpret_cast', r'\((\w+\s?\*+\s?)\)', error)
-
-  # In addition, we look for people taking the address of a cast.  This
-  # is dangerous -- casts can assign to temporaries, so the pointer doesn't
-  # point where you think.
-  match = Search(
-      r'(?:&\(([^)]+)\)[\w(])|'
-      r'(?:&(static|dynamic|down|reinterpret)_cast\b)', line)
-  if match and match.group(1) != '*':
-    error(filename, linenum, 'runtime/casting', 4,
-          ('Are you taking an address of a cast?  '
-           'This is dangerous: could be a temp var.  '
-           'Take the address before doing the cast, rather than after'))
-
-  # Create an extended_line, which is the concatenation of the current and
-  # next lines, for more effective checking of code that may span more than one
-  # line.
-  if linenum + 1 < clean_lines.NumLines():
-    extended_line = line + clean_lines.elided[linenum + 1]
-  else:
-    extended_line = line
-
-  # Check for people declaring static/global STL strings at the top level.
-  # This is dangerous because the C++ language does not guarantee that
-  # globals with constructors are initialized before the first access.
-  match = Match(
-      r'((?:|static +)(?:|const +))string +([a-zA-Z0-9_:]+)\b(.*)',
-      line)
-  # Make sure it's not a function.
-  # Function template specialization looks like: "string foo<Type>(...".
-  # Class template definitions look like: "string Foo<Type>::Method(...".
-  #
-  # Also ignore things that look like operators.  These are matched separately
-  # because operator names cross non-word boundaries.  If we change the pattern
-  # above, we would decrease the accuracy of matching identifiers.
-  if (match and
-      not Search(r'\boperator\W', line) and
-      not Match(r'\s*(<.*>)?(::[a-zA-Z0-9_]+)?\s*\(([^"]|$)', match.group(3))):
-    error(filename, linenum, 'runtime/string', 4,
-          'For a static/global string constant, use a C style string instead: '
-          '"%schar %s[]".' %
-          (match.group(1), match.group(2)))
-
-  if Search(r'\b([A-Za-z0-9_]*_)\(\1\)', line):
-    error(filename, linenum, 'runtime/init', 4,
-          'You seem to be initializing a member variable with itself.')
-
-  if file_extension == 'h':
+  if IsHeaderExtension(file_extension):
     # TODO(unknown): check that 1-arg constructors are explicit.
     #                How to tell it's a constructor?
     #                (handled in CheckForNonStandardConstructs for now)
-    # TODO(unknown): check that classes have DISALLOW_EVIL_CONSTRUCTORS
+    # TODO(unknown): check that classes declare or disable copy/assign
     #                (level 1 error)
     pass
 
@@ -3888,23 +4744,6 @@ def CheckLanguage(filename, clean_lines, linenum, file_extension,
       error(filename, linenum, 'runtime/int', 4,
             'Use int16/int64/etc, rather than the C type %s' % match.group(1))
 
-  # When snprintf is used, the second argument shouldn't be a literal.
-  match = Search(r'snprintf\s*\(([^,]*),\s*([0-9]*)\s*,', line)
-  if match and match.group(2) != '0':
-    # If 2nd arg is zero, snprintf is used to calculate size.
-    error(filename, linenum, 'runtime/printf', 3,
-          'If you can, use sizeof(%s) instead of %s as the 2nd arg '
-          'to snprintf.' % (match.group(1), match.group(2)))
-
-  # Check if some verboten C functions are being used.
-  if Search(r'\bsprintf\b', line):
-    error(filename, linenum, 'runtime/printf', 5,
-          'Never use sprintf.  Use snprintf instead.')
-  match = Search(r'\b(strcpy|strcat)\b', line)
-  if match:
-    error(filename, linenum, 'runtime/printf', 4,
-          'Almost always, snprintf is better than %s' % match.group(1))
-
   # Check if some verboten operator overloading is going on
   # TODO(unknown): catch out-of-line unary operator&:
   #   class X {};
@@ -3924,7 +4763,7 @@ def CheckLanguage(filename, clean_lines, linenum, file_extension,
   # Check for potential format string bugs like printf(foo).
   # We constrain the pattern not to pick things like DocidForPrintf(foo).
   # Not perfect but it can catch printf(foo.c_str()) and printf(foo->c_str())
-  # TODO(sugawarayu): Catch the following case. Need to change the calling
+  # TODO(unknown): Catch the following case. Need to change the calling
   # convention of the whole function to process multiple line to handle it.
   #   printf(
   #       boy_this_is_a_really_long_variable_that_cannot_fit_on_the_prev_line);
@@ -3989,37 +4828,188 @@ def CheckLanguage(filename, clean_lines, linenum, file_extension,
             'Do not use variable-length arrays.  Use an appropriately named '
             "('k' followed by CamelCase) compile-time constant for the size.")
 
-  # If DISALLOW_EVIL_CONSTRUCTORS, DISALLOW_COPY_AND_ASSIGN, or
-  # DISALLOW_IMPLICIT_CONSTRUCTORS is present, then it should be the last thing
-  # in the class declaration.
-  match = Match(
-      (r'\s*'
-       r'(DISALLOW_(EVIL_CONSTRUCTORS|COPY_AND_ASSIGN|IMPLICIT_CONSTRUCTORS))'
-       r'\(.*\);$'),
-      line)
-  if match and linenum + 1 < clean_lines.NumLines():
-    next_line = clean_lines.elided[linenum + 1]
-    # We allow some, but not all, declarations of variables to be present
-    # in the statement that defines the class.  The [\w\*,\s]* fragment of
-    # the regular expression below allows users to declare instances of
-    # the class or pointers to instances, but not less common types such
-    # as function pointers or arrays.  It's a tradeoff between allowing
-    # reasonable code and avoiding trying to parse more C++ using regexps.
-    if not Search(r'^\s*}[\w\*,\s]*;', next_line):
-      error(filename, linenum, 'readability/constructors', 3,
-            match.group(1) + ' should be the last thing in the class')
-
   # Check for use of unnamed namespaces in header files.  Registration
   # macros are typically OK, so we allow use of "namespace {" on lines
   # that end with backslashes.
-  if (file_extension == 'h'
+  if (IsHeaderExtension(file_extension)
       and Search(r'\bnamespace\s*{', line)
       and line[-1] != '\\'):
     error(filename, linenum, 'build/namespaces', 4,
           'Do not use unnamed namespaces in header files.  See '
-          'http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Namespaces'
+          'https://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Namespaces'
           ' for more information.')
 
+
+def CheckGlobalStatic(filename, clean_lines, linenum, error):
+  """Check for unsafe global or static objects.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+
+  # Match two lines at a time to support multiline declarations
+  if linenum + 1 < clean_lines.NumLines() and not Search(r'[;({]', line):
+    line += clean_lines.elided[linenum + 1].strip()
+
+  # Check for people declaring static/global STL strings at the top level.
+  # This is dangerous because the C++ language does not guarantee that
+  # globals with constructors are initialized before the first access, and
+  # also because globals can be destroyed when some threads are still running.
+  # TODO(unknown): Generalize this to also find static unique_ptr instances.
+  # TODO(unknown): File bugs for clang-tidy to find these.
+  match = Match(
+      r'((?:|static +)(?:|const +))(?::*std::)?string( +const)? +'
+      r'([a-zA-Z0-9_:]+)\b(.*)',
+      line)
+
+  # Remove false positives:
+  # - String pointers (as opposed to values).
+  #    string *pointer
+  #    const string *pointer
+  #    string const *pointer
+  #    string *const pointer
+  #
+  # - Functions and template specializations.
+  #    string Function<Type>(...
+  #    string Class<Type>::Method(...
+  #
+  # - Operators.  These are matched separately because operator names
+  #   cross non-word boundaries, and trying to match both operators
+  #   and functions at the same time would decrease accuracy of
+  #   matching identifiers.
+  #    string Class::operator*()
+  if (match and
+      not Search(r'\bstring\b(\s+const)?\s*[\*\&]\s*(const\s+)?\w', line) and
+      not Search(r'\boperator\W', line) and
+      not Match(r'\s*(<.*>)?(::[a-zA-Z0-9_]+)*\s*\(([^"]|$)', match.group(4))):
+    if Search(r'\bconst\b', line):
+      error(filename, linenum, 'runtime/string', 4,
+            'For a static/global string constant, use a C style string '
+            'instead: "%schar%s %s[]".' %
+            (match.group(1), match.group(2) or '', match.group(3)))
+    else:
+      error(filename, linenum, 'runtime/string', 4,
+            'Static/global string variables are not permitted.')
+
+  if (Search(r'\b([A-Za-z0-9_]*_)\(\1\)', line) or
+      Search(r'\b([A-Za-z0-9_]*_)\(CHECK_NOTNULL\(\1\)\)', line)):
+    error(filename, linenum, 'runtime/init', 4,
+          'You seem to be initializing a member variable with itself.')
+
+
+def CheckPrintf(filename, clean_lines, linenum, error):
+  """Check for printf related issues.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+
+  # When snprintf is used, the second argument shouldn't be a literal.
+  match = Search(r'snprintf\s*\(([^,]*),\s*([0-9]*)\s*,', line)
+  if match and match.group(2) != '0':
+    # If 2nd arg is zero, snprintf is used to calculate size.
+    error(filename, linenum, 'runtime/printf', 3,
+          'If you can, use sizeof(%s) instead of %s as the 2nd arg '
+          'to snprintf.' % (match.group(1), match.group(2)))
+
+  # Check if some verboten C functions are being used.
+  if Search(r'\bsprintf\s*\(', line):
+    error(filename, linenum, 'runtime/printf', 5,
+          'Never use sprintf. Use snprintf instead.')
+  match = Search(r'\b(strcpy|strcat)\s*\(', line)
+  if match:
+    error(filename, linenum, 'runtime/printf', 4,
+          'Almost always, snprintf is better than %s' % match.group(1))
+
+
+def IsDerivedFunction(clean_lines, linenum):
+  """Check if current line contains an inherited function.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+  Returns:
+    True if current line contains a function with "override"
+    virt-specifier.
+  """
+  # Scan back a few lines for start of current function
+  for i in xrange(linenum, max(-1, linenum - 10), -1):
+    match = Match(r'^([^()]*\w+)\(', clean_lines.elided[i])
+    if match:
+      # Look for "override" after the matching closing parenthesis
+      line, _, closing_paren = CloseExpression(
+          clean_lines, i, len(match.group(1)))
+      return (closing_paren >= 0 and
+              Search(r'\boverride\b', line[closing_paren:]))
+  return False
+
+
+def IsOutOfLineMethodDefinition(clean_lines, linenum):
+  """Check if current line contains an out-of-line method definition.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+  Returns:
+    True if current line contains an out-of-line method definition.
+  """
+  # Scan back a few lines for start of current function
+  for i in xrange(linenum, max(-1, linenum - 10), -1):
+    if Match(r'^([^()]*\w+)\(', clean_lines.elided[i]):
+      return Match(r'^[^()]*\w+::\w+\(', clean_lines.elided[i]) is not None
+  return False
+
+
+def IsInitializerList(clean_lines, linenum):
+  """Check if current line is inside constructor initializer list.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+  Returns:
+    True if current line appears to be inside constructor initializer
+    list, False otherwise.
+  """
+  for i in xrange(linenum, 1, -1):
+    line = clean_lines.elided[i]
+    if i == linenum:
+      remove_function_body = Match(r'^(.*)\{\s*$', line)
+      if remove_function_body:
+        line = remove_function_body.group(1)
+
+    if Search(r'\s:\s*\w+[({]', line):
+      # A lone colon tend to indicate the start of a constructor
+      # initializer list.  It could also be a ternary operator, which
+      # also tend to appear in constructor initializer lists as
+      # opposed to parameter lists.
+      return True
+    if Search(r'\}\s*,\s*$', line):
+      # A closing brace followed by a comma is probably the end of a
+      # brace-initialized member in constructor initializer list.
+      return True
+    if Search(r'[{};]\s*$', line):
+      # Found one of the following:
+      # - A closing brace or semicolon, probably the end of the previous
+      #   function.
+      # - An opening brace, probably the start of current class or namespace.
+      #
+      # Current line is probably not inside an initializer list since
+      # we saw one of those things without seeing the starting colon.
+      return False
+
+  # Got to the beginning of the file without seeing the start of
+  # constructor initializer list.
+  return False
+
+
 def CheckForNonConstReference(filename, clean_lines, linenum,
                               nesting_state, error):
   """Check for non-const references.
@@ -4031,7 +5021,7 @@ def CheckForNonConstReference(filename, clean_lines, linenum,
     filename: The name of the current file.
     clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
-    nesting_state: A _NestingState instance which maintains information about
+    nesting_state: A NestingState instance which maintains information about
                    the current stack of nested blocks being parsed.
     error: The function to call with any errors found.
   """
@@ -4040,6 +5030,17 @@ def CheckForNonConstReference(filename, clean_lines, linenum,
   if '&' not in line:
     return
 
+  # If a function is inherited, current function doesn't have much of
+  # a choice, so any non-const references should not be blamed on
+  # derived function.
+  if IsDerivedFunction(clean_lines, linenum):
+    return
+
+  # Don't warn on out-of-line method definitions, as we would warn on the
+  # in-line declaration, if it isn't marked with 'override'.
+  if IsOutOfLineMethodDefinition(clean_lines, linenum):
+    return
+
   # Long type names may be broken across multiple lines, usually in one
   # of these forms:
   #   LongType
@@ -4088,60 +5089,192 @@ def CheckForNonConstReference(filename, clean_lines, linenum,
   #   inside declarators: reference parameter
   # We will exclude the first two cases by checking that we are not inside a
   # function body, including one that was just introduced by a trailing '{'.
-  # TODO(unknwon): Doesn't account for preprocessor directives.
   # TODO(unknown): Doesn't account for 'catch(Exception& e)' [rare].
-  check_params = False
-  if not nesting_state.stack:
-    check_params = True  # top level
-  elif (isinstance(nesting_state.stack[-1], _ClassInfo) or
-        isinstance(nesting_state.stack[-1], _NamespaceInfo)):
-    check_params = True  # within class or namespace
-  elif Match(r'.*{\s*$', line):
-    if (len(nesting_state.stack) == 1 or
-        isinstance(nesting_state.stack[-2], _ClassInfo) or
-        isinstance(nesting_state.stack[-2], _NamespaceInfo)):
-      check_params = True  # just opened global/class/namespace block
+  if (nesting_state.previous_stack_top and
+      not (isinstance(nesting_state.previous_stack_top, _ClassInfo) or
+           isinstance(nesting_state.previous_stack_top, _NamespaceInfo))):
+    # Not at toplevel, not within a class, and not within a namespace
+    return
+
+  # Avoid initializer lists.  We only need to scan back from the
+  # current line for something that starts with ':'.
+  #
+  # We don't need to check the current line, since the '&' would
+  # appear inside the second set of parentheses on the current line as
+  # opposed to the first set.
+  if linenum > 0:
+    for i in xrange(linenum - 1, max(0, linenum - 10), -1):
+      previous_line = clean_lines.elided[i]
+      if not Search(r'[),]\s*$', previous_line):
+        break
+      if Match(r'^\s*:\s+\S', previous_line):
+        return
+
+  # Avoid preprocessors
+  if Search(r'\\\s*$', line):
+    return
+
+  # Avoid constructor initializer lists
+  if IsInitializerList(clean_lines, linenum):
+    return
+
   # We allow non-const references in a few standard places, like functions
   # called "swap()" or iostream operators like "<<" or ">>".  Do not check
   # those function parameters.
   #
   # We also accept & in static_assert, which looks like a function but
   # it's actually a declaration expression.
-  whitelisted_functions = (r'(?:[sS]wap(?:<\w:+>)?|'
+  allowed_functions = (r'(?:[sS]wap(?:<\w:+>)?|'
                            r'operator\s*[<>][<>]|'
                            r'static_assert|COMPILE_ASSERT'
                            r')\s*\(')
-  if Search(whitelisted_functions, line):
-    check_params = False
+  if Search(allowed_functions, line):
+    return
   elif not Search(r'\S+\([^)]*$', line):
-    # Don't see a whitelisted function on this line.  Actually we
+    # Don't see an allowed function on this line.  Actually we
     # didn't see any function name on this line, so this is likely a
     # multi-line parameter list.  Try a bit harder to catch this case.
     for i in xrange(2):
       if (linenum > i and
-          Search(whitelisted_functions, clean_lines.elided[linenum - i - 1])):
-        check_params = False
-        break
+          Search(allowed_functions, clean_lines.elided[linenum - i - 1])):
+        return
 
-  if check_params:
-    decls = ReplaceAll(r'{[^}]*}', ' ', line)  # exclude function body
-    for parameter in re.findall(_RE_PATTERN_REF_PARAM, decls):
-      if not Match(_RE_PATTERN_CONST_REF_PARAM, parameter):
-        error(filename, linenum, 'runtime/references', 2,
-              'Is this a non-const reference? '
-              'If so, make const or use a pointer: ' +
-              ReplaceAll(' *<', '<', parameter))
+  decls = ReplaceAll(r'{[^}]*}', ' ', line)  # exclude function body
+  for parameter in re.findall(_RE_PATTERN_REF_PARAM, decls):
+    if (not Match(_RE_PATTERN_CONST_REF_PARAM, parameter) and
+        not Match(_RE_PATTERN_REF_STREAM_PARAM, parameter)):
+      error(filename, linenum, 'runtime/references', 2,
+            'Is this a non-const reference? '
+            'If so, make const or use a pointer: ' +
+            ReplaceAll(' *<', '<', parameter))
 
 
-def CheckCStyleCast(filename, linenum, line, raw_line, cast_type, pattern,
-                    error):
+def CheckCasts(filename, clean_lines, linenum, error):
+  """Various cast related checks.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+
+  # Check to see if they're using an conversion function cast.
+  # I just try to capture the most common basic types, though there are more.
+  # Parameterless conversion functions, such as bool(), are allowed as they are
+  # probably a member operator declaration or default constructor.
+  match = Search(
+      r'(\bnew\s+(?:const\s+)?|\S<\s*(?:const\s+)?)?\b'
+      r'(int|float|double|bool|char|int32|uint32|int64|uint64)'
+      r'(\([^)].*)', line)
+  expecting_function = ExpectingFunctionArgs(clean_lines, linenum)
+  if match and not expecting_function:
+    matched_type = match.group(2)
+
+    # matched_new_or_template is used to silence two false positives:
+    # - New operators
+    # - Template arguments with function types
+    #
+    # For template arguments, we match on types immediately following
+    # an opening bracket without any spaces.  This is a fast way to
+    # silence the common case where the function type is the first
+    # template argument.  False negative with less-than comparison is
+    # avoided because those operators are usually followed by a space.
+    #
+    #   function<double(double)>   // bracket + no space = false positive
+    #   value < double(42)         // bracket + space = true positive
+    matched_new_or_template = match.group(1)
+
+    # Avoid arrays by looking for brackets that come after the closing
+    # parenthesis.
+    if Match(r'\([^()]+\)\s*\[', match.group(3)):
+      return
+
+    # Other things to ignore:
+    # - Function pointers
+    # - Casts to pointer types
+    # - Placement new
+    # - Alias declarations
+    matched_funcptr = match.group(3)
+    if (matched_new_or_template is None and
+        not (matched_funcptr and
+             (Match(r'\((?:[^() ]+::\s*\*\s*)?[^() ]+\)\s*\(',
+                    matched_funcptr) or
+              matched_funcptr.startswith('(*)'))) and
+        not Match(r'\s*using\s+\S+\s*=\s*' + matched_type, line) and
+        not Search(r'new\(\S+\)\s*' + matched_type, line)):
+      error(filename, linenum, 'readability/casting', 4,
+            'Using deprecated casting style.  '
+            'Use static_cast<%s>(...) instead' %
+            matched_type)
+
+  if not expecting_function:
+    CheckCStyleCast(filename, clean_lines, linenum, 'static_cast',
+                    r'\((int|float|double|bool|char|u?int(16|32|64))\)', error)
+
+  # This doesn't catch all cases. Consider (const char * const)"hello".
+  #
+  # (char *) "foo" should always be a const_cast (reinterpret_cast won't
+  # compile).
+  if CheckCStyleCast(filename, clean_lines, linenum, 'const_cast',
+                     r'\((char\s?\*+\s?)\)\s*"', error):
+    pass
+  else:
+    # Check pointer casts for other than string constants
+    CheckCStyleCast(filename, clean_lines, linenum, 'reinterpret_cast',
+                    r'\((\w+\s?\*+\s?)\)', error)
+
+  # In addition, we look for people taking the address of a cast.  This
+  # is dangerous -- casts can assign to temporaries, so the pointer doesn't
+  # point where you think.
+  #
+  # Some non-identifier character is required before the '&' for the
+  # expression to be recognized as a cast.  These are casts:
+  #   expression = &static_cast<int*>(temporary());
+  #   function(&(int*)(temporary()));
+  #
+  # This is not a cast:
+  #   reference_type&(int* function_param);
+  match = Search(
+      r'(?:[^\w]&\(([^)*][^)]*)\)[\w(])|'
+      r'(?:[^\w]&(static|dynamic|down|reinterpret)_cast\b)', line)
+  if match:
+    # Try a better error message when the & is bound to something
+    # dereferenced by the casted pointer, as opposed to the casted
+    # pointer itself.
+    parenthesis_error = False
+    match = Match(r'^(.*&(?:static|dynamic|down|reinterpret)_cast\b)<', line)
+    if match:
+      _, y1, x1 = CloseExpression(clean_lines, linenum, len(match.group(1)))
+      if x1 >= 0 and clean_lines.elided[y1][x1] == '(':
+        _, y2, x2 = CloseExpression(clean_lines, y1, x1)
+        if x2 >= 0:
+          extended_line = clean_lines.elided[y2][x2:]
+          if y2 < clean_lines.NumLines() - 1:
+            extended_line += clean_lines.elided[y2 + 1]
+          if Match(r'\s*(?:->|\[)', extended_line):
+            parenthesis_error = True
+
+    if parenthesis_error:
+      error(filename, linenum, 'readability/casting', 4,
+            ('Are you taking an address of something dereferenced '
+             'from a cast?  Wrapping the dereferenced expression in '
+             'parentheses will make the binding more obvious'))
+    else:
+      error(filename, linenum, 'runtime/casting', 4,
+            ('Are you taking an address of a cast?  '
+             'This is dangerous: could be a temp var.  '
+             'Take the address before doing the cast, rather than after'))
+
+
+def CheckCStyleCast(filename, clean_lines, linenum, cast_type, pattern, error):
   """Checks for a C-style cast by looking for the pattern.
 
   Args:
     filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
-    line: The line of code to check.
-    raw_line: The raw line of code to check, with comments.
     cast_type: The string for the C++ cast to recommend.  This is either
       reinterpret_cast, static_cast, or const_cast, depending.
     pattern: The regular expression used to find C-style casts.
@@ -4151,75 +5284,34 @@ def CheckCStyleCast(filename, linenum, line, raw_line, cast_type, pattern,
     True if an error was emitted.
     False otherwise.
   """
+  line = clean_lines.elided[linenum]
   match = Search(pattern, line)
   if not match:
     return False
 
-  # e.g., sizeof(int)
-  sizeof_match = Match(r'.*sizeof\s*$', line[0:match.start(1) - 1])
-  if sizeof_match:
-    error(filename, linenum, 'runtime/sizeof', 1,
-          'Using sizeof(type).  Use sizeof(varname) instead if possible')
-    return True
+  # Exclude lines with keywords that tend to look like casts
+  context = line[0:match.start(1) - 1]
+  if Match(r'.*\b(?:sizeof|alignof|alignas|[_A-Z][_A-Z0-9]*)\s*$', context):
+    return False
+
+  # Try expanding current context to see if we one level of
+  # parentheses inside a macro.
+  if linenum > 0:
+    for i in xrange(linenum - 1, max(0, linenum - 5), -1):
+      context = clean_lines.elided[i] + context
+  if Match(r'.*\b[_A-Z][_A-Z0-9]*\s*\((?:\([^()]*\)|[^()])*$', context):
+    return False
 
   # operator++(int) and operator--(int)
-  if (line[0:match.start(1) - 1].endswith(' operator++') or
-      line[0:match.start(1) - 1].endswith(' operator--')):
+  if context.endswith(' operator++') or context.endswith(' operator--'):
     return False
 
-  # A single unnamed argument for a function tends to look like old
-  # style cast.  If we see those, don't issue warnings for deprecated
-  # casts, instead issue warnings for unnamed arguments where
-  # appropriate.
-  #
-  # These are things that we want warnings for, since the style guide
-  # explicitly require all parameters to be named:
-  #   Function(int);
-  #   Function(int) {
-  #   ConstMember(int) const;
-  #   ConstMember(int) const {
-  #   ExceptionMember(int) throw (...);
-  #   ExceptionMember(int) throw (...) {
-  #   PureVirtual(int) = 0;
-  #
-  # These are functions of some sort, where the compiler would be fine
-  # if they had named parameters, but people often omit those
-  # identifiers to reduce clutter:
-  #   (FunctionPointer)(int);
-  #   (FunctionPointer)(int) = value;
-  #   Function((function_pointer_arg)(int))
-  #   <TemplateArgument(int)>;
-  #   <(FunctionPointerTemplateArgument)(int)>;
+  # A single unnamed argument for a function tends to look like old style cast.
+  # If we see those, don't issue warnings for deprecated casts.
   remainder = line[match.end(0):]
-  if Match(r'^\s*(?:;|const\b|throw\b|=|>|\{|\))', remainder):
-    # Looks like an unnamed parameter.
-
-    # Don't warn on any kind of template arguments.
-    if Match(r'^\s*>', remainder):
-      return False
-
-    # Don't warn on assignments to function pointers, but keep warnings for
-    # unnamed parameters to pure virtual functions.  Note that this pattern
-    # will also pass on assignments of "0" to function pointers, but the
-    # preferred values for those would be "nullptr" or "NULL".
-    matched_zero = Match(r'^\s=\s*(\S+)\s*;', remainder)
-    if matched_zero and matched_zero.group(1) != '0':
-      return False
-
-    # Don't warn on function pointer declarations.  For this we need
-    # to check what came before the "(type)" string.
-    if Match(r'.*\)\s*$', line[0:match.start(0)]):
-      return False
-
-    # Don't warn if the parameter is named with block comments, e.g.:
-    #  Function(int /*unused_param*/);
-    if '/*' in raw_line:
-      return False
-
-    # Passed all filters, issue warning here.
-    error(filename, linenum, 'readability/function', 3,
-          'All parameters should be named in a function')
-    return True
+  if Match(r'^\s*(?:;|const\b|throw\b|final\b|override\b|[=>{),]|->)',
+           remainder):
+    return False
 
   # At this point, all that should be left is actual casts.
   error(filename, linenum, 'readability/casting', 4,
@@ -4229,6 +5321,28 @@ def CheckCStyleCast(filename, linenum, line, raw_line, cast_type, pattern,
   return True
 
 
+def ExpectingFunctionArgs(clean_lines, linenum):
+  """Checks whether where function type arguments are expected.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+
+  Returns:
+    True if the line at 'linenum' is inside something that expects arguments
+    of function types.
+  """
+  line = clean_lines.elided[linenum]
+  return (Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(', line) or
+          (linenum >= 2 and
+           (Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\((?:\S+,)?\s*$',
+                  clean_lines.elided[linenum - 1]) or
+            Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\(\s*$',
+                  clean_lines.elided[linenum - 2]) or
+            Search(r'\bstd::m?function\s*\<\s*$',
+                   clean_lines.elided[linenum - 1]))))
+
+
 _HEADERS_CONTAINING_TEMPLATES = (
     ('<deque>', ('deque',)),
     ('<functional>', ('unary_function', 'binary_function',
@@ -4251,11 +5365,15 @@ def CheckCStyleCast(filename, linenum, line, raw_line, cast_type, pattern,
     ('<limits>', ('numeric_limits',)),
     ('<list>', ('list',)),
     ('<map>', ('map', 'multimap',)),
-    ('<memory>', ('allocator',)),
+    ('<memory>', ('allocator', 'make_shared', 'make_unique', 'shared_ptr',
+                  'unique_ptr', 'weak_ptr')),
     ('<queue>', ('queue', 'priority_queue',)),
     ('<set>', ('set', 'multiset',)),
     ('<stack>', ('stack',)),
     ('<string>', ('char_traits', 'basic_string',)),
+    ('<tuple>', ('tuple',)),
+    ('<unordered_map>', ('unordered_map', 'unordered_multimap')),
+    ('<unordered_set>', ('unordered_set', 'unordered_multiset')),
     ('<utility>', ('pair',)),
     ('<vector>', ('vector',)),
 
@@ -4266,18 +5384,26 @@ def CheckCStyleCast(filename, linenum, line, raw_line, cast_type, pattern,
     ('<slist>', ('slist',)),
     )
 
-_RE_PATTERN_STRING = re.compile(r'\bstring\b')
+_HEADERS_MAYBE_TEMPLATES = (
+    ('<algorithm>', ('copy', 'max', 'min', 'min_element', 'sort',
+                     'transform',
+                    )),
+    ('<utility>', ('forward', 'make_pair', 'move', 'swap')),
+    )
 
-_re_pattern_algorithm_header = []
-for _template in ('copy', 'max', 'min', 'min_element', 'sort', 'swap',
-                  'transform'):
-  # Match max<type>(..., ...), max(..., ...), but not foo->max, foo.max or
-  # type::max().
-  _re_pattern_algorithm_header.append(
-      (re.compile(r'[^>.]\b' + _template + r'(<.*?>)?\([^\)]'),
-       _template,
-       '<algorithm>'))
+_RE_PATTERN_STRING = re.compile(r'\bstring\b')
 
+_re_pattern_headers_maybe_templates = []
+for _header, _templates in _HEADERS_MAYBE_TEMPLATES:
+  for _template in _templates:
+    # Match max<type>(..., ...), max(..., ...), but not foo->max, foo.max or
+    # type::max().
+    _re_pattern_headers_maybe_templates.append(
+        (re.compile(r'[^>.]\b' + _template + r'(<.*?>)?\([^\)]'),
+            _template,
+            _header))
+
+# Other scripts may reach in and modify this pattern.
 _re_pattern_templates = []
 for _header, _templates in _HEADERS_CONTAINING_TEMPLATES:
   for _template in _templates:
@@ -4317,13 +5443,13 @@ def FilesBelongToSameModule(filename_cc, filename_h):
     string: the additional prefix needed to open the header file.
   """
 
-  if not filename_cc.endswith('.cc'):
+  fileinfo = FileInfo(filename_cc)
+  if not fileinfo.IsSource():
     return (False, '')
-  filename_cc = filename_cc[:-len('.cc')]
-  if filename_cc.endswith('_unittest'):
-    filename_cc = filename_cc[:-len('_unittest')]
-  elif filename_cc.endswith('_test'):
-    filename_cc = filename_cc[:-len('_test')]
+  filename_cc = filename_cc[:-len(fileinfo.Extension())]
+  matched_test_suffix = Search(_TEST_FILE_SUFFIX, fileinfo.BaseName())
+  if matched_test_suffix:
+    filename_cc = filename_cc[:-len(matched_test_suffix.group(1))]
   filename_cc = filename_cc.replace('/public/', '/')
   filename_cc = filename_cc.replace('/internal/', '/')
 
@@ -4342,16 +5468,16 @@ def FilesBelongToSameModule(filename_cc, filename_h):
   return files_belong_to_same_module, common_path
 
 
-def UpdateIncludeState(filename, include_state, io=codecs):
-  """Fill up the include_state with new includes found from the file.
+def UpdateIncludeState(filename, include_dict, io=codecs):
+  """Fill up the include_dict with new includes found from the file.
 
   Args:
     filename: the name of the header to read.
-    include_state: an _IncludeState instance in which the headers are inserted.
+    include_dict: a dictionary in which the headers are inserted.
     io: The io factory to use to read the file. Provided for testability.
 
   Returns:
-    True if a header was succesfully added. False otherwise.
+    True if a header was successfully added. False otherwise.
   """
   headerfile = None
   try:
@@ -4365,9 +5491,7 @@ def UpdateIncludeState(filename, include_state, io=codecs):
     match = _RE_PATTERN_INCLUDE.search(clean_line)
     if match:
       include = match.group(2)
-      # The value formatting is cute, but not really used right now.
-      # What matters here is that the key is in include_state.
-      include_state.setdefault(include, '%s:%d' % (filename, linenum))
+      include_dict.setdefault(include, linenum)
   return True
 
 
@@ -4406,7 +5530,7 @@ def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error,
       if prefix.endswith('std::') or not prefix.endswith('::'):
         required['<string>'] = (linenum, 'string')
 
-    for pattern, template, header in _re_pattern_algorithm_header:
+    for pattern, template, header in _re_pattern_headers_maybe_templates:
       if pattern.search(line):
         required[header] = (linenum, template)
 
@@ -4415,15 +5539,21 @@ def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error,
       continue
 
     for pattern, template, header in _re_pattern_templates:
-      if pattern.search(line):
-        required[header] = (linenum, template)
+      matched = pattern.search(line)
+      if matched:
+        # Don't warn about IWYU in non-STL namespaces:
+        # (We check only the first match per line; good enough.)
+        prefix = line[:matched.start()]
+        if prefix.endswith('std::') or not prefix.endswith('::'):
+          required[header] = (linenum, template)
 
   # The policy is that if you #include something in foo.h you don't need to
   # include it again in foo.cc. Here, we will look at possible includes.
-  # Let's copy the include_state so it is only messed up within this function.
-  include_state = include_state.copy()
+  # Let's flatten the include_state include_list and copy it into a dictionary.
+  include_dict = dict([item for sublist in include_state.include_list
+                       for item in sublist])
 
-  # Did we find the header for this file (if any) and succesfully load it?
+  # Did we find the header for this file (if any) and successfully load it?
   header_found = False
 
   # Use the absolute path so that matching works properly.
@@ -4438,13 +5568,13 @@ def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error,
   # instead of 'foo_flymake.h'
   abs_filename = re.sub(r'_flymake\.cc$', '.cc', abs_filename)
 
-  # include_state is modified during iteration, so we iterate over a copy of
+  # include_dict is modified during iteration, so we iterate over a copy of
   # the keys.
-  header_keys = include_state.keys()
+  header_keys = include_dict.keys()
   for header in header_keys:
     (same_module, common_path) = FilesBelongToSameModule(abs_filename, header)
     fullpath = common_path + header
-    if same_module and UpdateIncludeState(fullpath, include_state, io):
+    if same_module and UpdateIncludeState(fullpath, include_dict, io):
       header_found = True
 
   # If we can't find the header file for a .cc, assume it's because we don't
@@ -4458,7 +5588,7 @@ def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error,
   # All the lines have been processed, report the errors found.
   for required_header_unstripped in required:
     template = required[required_header_unstripped][1]
-    if required_header_unstripped.strip('<>"') not in include_state:
+    if required_header_unstripped.strip('<>"') not in include_dict:
       error(filename, required[required_header_unstripped][0],
             'build/include_what_you_use', 4,
             'Add #include ' + required_header_unstripped + ' for ' + template)
@@ -4470,7 +5600,7 @@ def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error,
 def CheckMakePairUsesDeduction(filename, clean_lines, linenum, error):
   """Check that make_pair's template arguments are deduced.
 
-  G++ 4.6 in C++0x mode fails badly if make_pair's template arguments are
+  G++ 4.6 in C++11 mode fails badly if make_pair's template arguments are
   specified explicitly, and such use isn't intended in any case.
 
   Args:
@@ -4488,6 +5618,165 @@ def CheckMakePairUsesDeduction(filename, clean_lines, linenum, error):
           ' OR use pair directly OR if appropriate, construct a pair directly')
 
 
+def CheckRedundantVirtual(filename, clean_lines, linenum, error):
+  """Check if line contains a redundant "virtual" function-specifier.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  # Look for "virtual" on current line.
+  line = clean_lines.elided[linenum]
+  virtual = Match(r'^(.*)(\bvirtual\b)(.*)$', line)
+  if not virtual: return
+
+  # Ignore "virtual" keywords that are near access-specifiers.  These
+  # are only used in class base-specifier and do not apply to member
+  # functions.
+  if (Search(r'\b(public|protected|private)\s+$', virtual.group(1)) or
+      Match(r'^\s+(public|protected|private)\b', virtual.group(3))):
+    return
+
+  # Ignore the "virtual" keyword from virtual base classes.  Usually
+  # there is a column on the same line in these cases (virtual base
+  # classes are rare in google3 because multiple inheritance is rare).
+  if Match(r'^.*[^:]:[^:].*$', line): return
+
+  # Look for the next opening parenthesis.  This is the start of the
+  # parameter list (possibly on the next line shortly after virtual).
+  # TODO(unknown): doesn't work if there are virtual functions with
+  # decltype() or other things that use parentheses, but csearch suggests
+  # that this is rare.
+  end_col = -1
+  end_line = -1
+  start_col = len(virtual.group(2))
+  for start_line in xrange(linenum, min(linenum + 3, clean_lines.NumLines())):
+    line = clean_lines.elided[start_line][start_col:]
+    parameter_list = Match(r'^([^(]*)\(', line)
+    if parameter_list:
+      # Match parentheses to find the end of the parameter list
+      (_, end_line, end_col) = CloseExpression(
+          clean_lines, start_line, start_col + len(parameter_list.group(1)))
+      break
+    start_col = 0
+
+  if end_col < 0:
+    return  # Couldn't find end of parameter list, give up
+
+  # Look for "override" or "final" after the parameter list
+  # (possibly on the next few lines).
+  for i in xrange(end_line, min(end_line + 3, clean_lines.NumLines())):
+    line = clean_lines.elided[i][end_col:]
+    match = Search(r'\b(override|final)\b', line)
+    if match:
+      error(filename, linenum, 'readability/inheritance', 4,
+            ('"virtual" is redundant since function is '
+             'already declared as "%s"' % match.group(1)))
+
+    # Set end_col to check whole lines after we are done with the
+    # first line.
+    end_col = 0
+    if Search(r'[^\w]\s*$', line):
+      break
+
+
+def CheckRedundantOverrideOrFinal(filename, clean_lines, linenum, error):
+  """Check if line contains a redundant "override" or "final" virt-specifier.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  # Look for closing parenthesis nearby.  We need one to confirm where
+  # the declarator ends and where the virt-specifier starts to avoid
+  # false positives.
+  line = clean_lines.elided[linenum]
+  declarator_end = line.rfind(')')
+  if declarator_end >= 0:
+    fragment = line[declarator_end:]
+  else:
+    if linenum > 1 and clean_lines.elided[linenum - 1].rfind(')') >= 0:
+      fragment = line
+    else:
+      return
+
+  # Check that at most one of "override" or "final" is present, not both
+  if Search(r'\boverride\b', fragment) and Search(r'\bfinal\b', fragment):
+    error(filename, linenum, 'readability/inheritance', 4,
+          ('"override" is redundant since function is '
+           'already declared as "final"'))
+
+
+
+
+# Returns true if we are at a new block, and it is directly
+# inside of a namespace.
+def IsBlockInNameSpace(nesting_state, is_forward_declaration):
+  """Checks that the new block is directly in a namespace.
+
+  Args:
+    nesting_state: The _NestingState object that contains info about our state.
+    is_forward_declaration: If the class is a forward declared class.
+  Returns:
+    Whether or not the new block is directly in a namespace.
+  """
+  if is_forward_declaration:
+    if len(nesting_state.stack) >= 1 and (
+        isinstance(nesting_state.stack[-1], _NamespaceInfo)):
+      return True
+    else:
+      return False
+
+  return (len(nesting_state.stack) > 1 and
+          nesting_state.stack[-1].check_namespace_indentation and
+          isinstance(nesting_state.stack[-2], _NamespaceInfo))
+
+
+def ShouldCheckNamespaceIndentation(nesting_state, is_namespace_indent_item,
+                                    raw_lines_no_comments, linenum):
+  """This method determines if we should apply our namespace indentation check.
+
+  Args:
+    nesting_state: The current nesting state.
+    is_namespace_indent_item: If we just put a new class on the stack, True.
+      If the top of the stack is not a class, or we did not recently
+      add the class, False.
+    raw_lines_no_comments: The lines without the comments.
+    linenum: The current line number we are processing.
+
+  Returns:
+    True if we should apply our namespace indentation check. Currently, it
+    only works for classes and namespaces inside of a namespace.
+  """
+
+  is_forward_declaration = IsForwardClassDeclaration(raw_lines_no_comments,
+                                                     linenum)
+
+  if not (is_namespace_indent_item or is_forward_declaration):
+    return False
+
+  # If we are in a macro, we do not want to check the namespace indentation.
+  if IsMacroDefinition(raw_lines_no_comments, linenum):
+    return False
+
+  return IsBlockInNameSpace(nesting_state, is_forward_declaration)
+
+
+# Call this method if the line is directly inside of a namespace.
+# If the line above is blank (excluding comments) or the start of
+# an inner namespace, it cannot be indented.
+def CheckItemIndentationInNamespace(filename, raw_lines_no_comments, linenum,
+                                    error):
+  line = raw_lines_no_comments[linenum]
+  if Match(r'^\s+', line):
+    error(filename, linenum, 'runtime/indentation_namespace', 4,
+          'Do not indent within a namespace')
+
+
 def ProcessLine(filename, file_extension, clean_lines, line,
                 include_state, function_state, nesting_state, error,
                 extra_check_functions=[]):
@@ -4501,7 +5790,7 @@ def ProcessLine(filename, file_extension, clean_lines, line,
     line: Number of line being processed.
     include_state: An _IncludeState instance in which the headers are inserted.
     function_state: A _FunctionState instance which counts function lines, etc.
-    nesting_state: A _NestingState instance which maintains information about
+    nesting_state: A NestingState instance which maintains information about
                    the current stack of nested blocks being parsed.
     error: A callable to which errors are reported, which takes 4 arguments:
            filename, line number, error level, and message
@@ -4512,8 +5801,9 @@ def ProcessLine(filename, file_extension, clean_lines, line,
   raw_lines = clean_lines.raw_lines
   ParseNolintSuppressions(filename, raw_lines[line], line, error)
   nesting_state.Update(filename, clean_lines, line, error)
-  if nesting_state.stack and nesting_state.stack[-1].inline_asm != _NO_ASM:
-    return
+  CheckForNamespaceIndentation(filename, nesting_state, clean_lines, line,
+                               error)
+  if nesting_state.InAsmBlock(): return
   CheckForFunctionLengths(filename, clean_lines, line, function_state, error)
   CheckForMultilineCommentsAndStrings(filename, clean_lines, line, error)
   CheckStyle(filename, clean_lines, line, file_extension, nesting_state, error)
@@ -4526,9 +5816,82 @@ def ProcessLine(filename, file_extension, clean_lines, line,
   CheckPosixThreading(filename, clean_lines, line, error)
   CheckInvalidIncrement(filename, clean_lines, line, error)
   CheckMakePairUsesDeduction(filename, clean_lines, line, error)
+  CheckRedundantVirtual(filename, clean_lines, line, error)
+  CheckRedundantOverrideOrFinal(filename, clean_lines, line, error)
   for check_fn in extra_check_functions:
     check_fn(filename, clean_lines, line, error)
 
+def FlagCxx11Features(filename, clean_lines, linenum, error):
+  """Flag those c++11 features that we only allow in certain places.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+
+  include = Match(r'\s*#\s*include\s+[<"]([^<"]+)[">]', line)
+
+  # Flag unapproved C++ TR1 headers.
+  if include and include.group(1).startswith('tr1/'):
+    error(filename, linenum, 'build/c++tr1', 5,
+          ('C++ TR1 headers such as <%s> are unapproved.') % include.group(1))
+
+  # Flag unapproved C++11 headers.
+  if include and include.group(1) in ('cfenv',
+                                      'condition_variable',
+                                      'fenv.h',
+                                      'future',
+                                      'mutex',
+                                      'thread',
+                                      'chrono',
+                                      'ratio',
+                                      'regex',
+                                      'system_error',
+                                     ):
+    error(filename, linenum, 'build/c++11', 5,
+          ('<%s> is an unapproved C++11 header.') % include.group(1))
+
+  # The only place where we need to worry about C++11 keywords and library
+  # features in preprocessor directives is in macro definitions.
+  if Match(r'\s*#', line) and not Match(r'\s*#\s*define\b', line): return
+
+  # These are classes and free functions.  The classes are always
+  # mentioned as std::*, but we only catch the free functions if
+  # they're not found by ADL.  They're alphabetical by header.
+  for top_name in (
+      # type_traits
+      'alignment_of',
+      'aligned_union',
+      ):
+    if Search(r'\bstd::%s\b' % top_name, line):
+      error(filename, linenum, 'build/c++11', 5,
+            ('std::%s is an unapproved C++11 class or function.  Send c-style '
+             'an example of where it would make your code more readable, and '
+             'they may let you use it.') % top_name)
+
+
+def FlagCxx14Features(filename, clean_lines, linenum, error):
+  """Flag those C++14 features that we restrict.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+
+  include = Match(r'\s*#\s*include\s+[<"]([^<"]+)[">]', line)
+
+  # Flag unapproved C++14 headers.
+  if include and include.group(1) in ('scoped_allocator', 'shared_mutex'):
+    error(filename, linenum, 'build/c++14', 5,
+          ('<%s> is an unapproved C++14 header.') % include.group(1))
+
+
 def ProcessFileData(filename, file_extension, lines, error,
                     extra_check_functions=[]):
   """Performs lint checks and reports any errors to the given error function.
@@ -4549,31 +5912,122 @@ def ProcessFileData(filename, file_extension, lines, error,
 
   include_state = _IncludeState()
   function_state = _FunctionState()
-  nesting_state = _NestingState()
+  nesting_state = NestingState()
 
   ResetNolintSuppressions()
 
   CheckForCopyright(filename, lines, error)
-
-  if file_extension == 'h':
-    CheckForHeaderGuard(filename, lines, error)
-
+  ProcessGlobalSuppresions(lines)
   RemoveMultiLineComments(filename, lines, error)
   clean_lines = CleansedLines(lines)
+
+  if IsHeaderExtension(file_extension):
+    CheckForHeaderGuard(filename, clean_lines, error)
+
   for line in xrange(clean_lines.NumLines()):
     ProcessLine(filename, file_extension, clean_lines, line,
                 include_state, function_state, nesting_state, error,
                 extra_check_functions)
+    FlagCxx11Features(filename, clean_lines, line, error)
   nesting_state.CheckCompletedBlocks(filename, error)
 
   CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error)
 
+  # Check that the .cc file has included its header if it exists.
+  if _IsSourceExtension(file_extension):
+    CheckHeaderFileIncluded(filename, include_state, error)
+
   # We check here rather than inside ProcessLine so that we see raw
   # lines rather than "cleaned" lines.
   CheckForBadCharacters(filename, lines, error)
 
   CheckForNewlineAtEOF(filename, lines, error)
 
+def ProcessConfigOverrides(filename):
+  """ Loads the configuration files and processes the config overrides.
+
+  Args:
+    filename: The name of the file being processed by the linter.
+
+  Returns:
+    False if the current |filename| should not be processed further.
+  """
+
+  abs_filename = os.path.abspath(filename)
+  cfg_filters = []
+  keep_looking = True
+  while keep_looking:
+    abs_path, base_name = os.path.split(abs_filename)
+    if not base_name:
+      break  # Reached the root directory.
+
+    cfg_file = os.path.join(abs_path, "CPPLINT.cfg")
+    abs_filename = abs_path
+    if not os.path.isfile(cfg_file):
+      continue
+
+    try:
+      with open(cfg_file) as file_handle:
+        for line in file_handle:
+          line, _, _ = line.partition('#')  # Remove comments.
+          if not line.strip():
+            continue
+
+          name, _, val = line.partition('=')
+          name = name.strip()
+          val = val.strip()
+          if name == 'set noparent':
+            keep_looking = False
+          elif name == 'filter':
+            cfg_filters.append(val)
+          elif name == 'exclude_files':
+            # When matching exclude_files pattern, use the base_name of
+            # the current file name or the directory name we are processing.
+            # For example, if we are checking for lint errors in /foo/bar/baz.cc
+            # and we found the .cfg file at /foo/CPPLINT.cfg, then the config
+            # file's "exclude_files" filter is meant to be checked against "bar"
+            # and not "baz" nor "bar/baz.cc".
+            if base_name:
+              pattern = re.compile(val)
+              if pattern.match(base_name):
+                if _cpplint_state.quiet:
+                  # Suppress "Ignoring file" warning when using --quiet.
+                  return False
+                sys.stderr.write('Ignoring "%s": file excluded by "%s". '
+                                 'File path component "%s" matches '
+                                 'pattern "%s"\n' %
+                                 (filename, cfg_file, base_name, val))
+                return False
+          elif name == 'linelength':
+            global _line_length
+            try:
+                _line_length = int(val)
+            except ValueError:
+                sys.stderr.write('Line length must be numeric.')
+          elif name == 'root':
+            global _root
+            # root directories are specified relative to CPPLINT.cfg dir.
+            _root = os.path.join(os.path.dirname(cfg_file), val)
+          elif name == 'headers':
+            ProcessHppHeadersOption(val)
+          else:
+            sys.stderr.write(
+                'Invalid configuration option (%s) in file %s\n' %
+                (name, cfg_file))
+
+    except IOError:
+      sys.stderr.write(
+          "Skipping config file '%s': Can't open for reading\n" % cfg_file)
+      keep_looking = False
+
+  # Apply all the accumulated filters in reverse order (top-level directory
+  # config options having the least priority).
+  for filter in reversed(cfg_filters):
+     _AddFilters(filter)
+
+  return True
+
+
 def ProcessFile(filename, vlevel, extra_check_functions=[]):
   """Does google-lint on a single file.
 
@@ -4589,7 +6043,15 @@ def ProcessFile(filename, vlevel, extra_check_functions=[]):
   """
 
   _SetVerboseLevel(vlevel)
+  _BackupFilters()
+  old_errors = _cpplint_state.error_count
+
+  if not ProcessConfigOverrides(filename):
+    _RestoreFilters()
+    return
 
+  lf_lines = []
+  crlf_lines = []
   try:
     # Support the UNIX convention of using "-" for stdin.  Note that
     # we are not opening the file with universal newline support
@@ -4597,10 +6059,7 @@ def ProcessFile(filename, vlevel, extra_check_functions=[]):
     # contain trailing '\r' characters if we are reading a file that
     # has CRLF endings.
     # If after the split a trailing '\r' is present, it is removed
-    # below. If it is not expected to be present (i.e. os.linesep !=
-    # '\r\n' as in Windows), a warning is issued below if this file
-    # is processed.
-
+    # below.
     if filename == '-':
       lines = codecs.StreamReaderWriter(sys.stdin,
                                         codecs.getreader('utf8'),
@@ -4609,16 +6068,19 @@ def ProcessFile(filename, vlevel, extra_check_functions=[]):
     else:
       lines = codecs.open(filename, 'r', 'utf8', 'replace').read().split('\n')
 
-    carriage_return_found = False
     # Remove trailing '\r'.
-    for linenum in range(len(lines)):
+    # The -1 accounts for the extra trailing blank line we get from split()
+    for linenum in range(len(lines) - 1):
       if lines[linenum].endswith('\r'):
         lines[linenum] = lines[linenum].rstrip('\r')
-        carriage_return_found = True
+        crlf_lines.append(linenum + 1)
+      else:
+        lf_lines.append(linenum + 1)
 
   except IOError:
     sys.stderr.write(
         "Skipping input '%s': Can't open for reading\n" % filename)
+    _RestoreFilters()
     return
 
   # Note, if no dot is found, this will give the entire filename as the ext.
@@ -4632,14 +6094,30 @@ def ProcessFile(filename, vlevel, extra_check_functions=[]):
   else:
     ProcessFileData(filename, file_extension, lines, Error,
                     extra_check_functions)
-    if carriage_return_found and os.linesep != '\r\n':
-      # Use 0 for linenum since outputting only one error for potentially
-      # several lines.
-      Error(filename, 0, 'whitespace/newline', 1,
-            'One or more unexpected \\r (^M) found;'
-            'better to use only a \\n')
 
-  sys.stderr.write('Done processing %s\n' % filename)
+    # If end-of-line sequences are a mix of LF and CR-LF, issue
+    # warnings on the lines with CR.
+    #
+    # Don't issue any warnings if all lines are uniformly LF or CR-LF,
+    # since critique can handle these just fine, and the style guide
+    # doesn't dictate a particular end of line sequence.
+    #
+    # We can't depend on os.linesep to determine what the desired
+    # end-of-line sequence should be, since that will return the
+    # server-side end-of-line sequence.
+    if lf_lines and crlf_lines:
+      # Warn on every line with CR.  An alternative approach might be to
+      # check whether the file is mostly CRLF or just LF, and warn on the
+      # minority, we bias toward LF here since most tools prefer LF.
+      for linenum in crlf_lines:
+        Error(filename, linenum, 'whitespace/newline', 1,
+              'Unexpected \\r (^M) found; better to use only \\n')
+
+  # Suppress printing anything if --quiet was passed unless the error
+  # count has increased after processing this file.
+  if not _cpplint_state.quiet or old_errors != _cpplint_state.error_count:
+    sys.stdout.write('Done processing %s\n' % filename)
+  _RestoreFilters()
 
 
 def PrintUsage(message):
@@ -4681,13 +6159,16 @@ def ParseArguments(args):
                                                  'filter=',
                                                  'root=',
                                                  'linelength=',
-                                                 'extensions='])
+                                                 'extensions=',
+                                                 'headers=',
+                                                 'quiet'])
   except getopt.GetoptError:
     PrintUsage('Invalid arguments.')
 
   verbosity = _VerboseLevel()
   output_format = _OutputFormat()
   filters = ''
+  quiet = _Quiet()
   counting_style = ''
 
   for (opt, val) in opts:
@@ -4697,6 +6178,8 @@ def ParseArguments(args):
       if val not in ('emacs', 'vs7', 'eclipse'):
         PrintUsage('The only allowed output formats are emacs, vs7 and eclipse.')
       output_format = val
+    elif opt == '--quiet':
+      quiet = True
     elif opt == '--verbose':
       verbosity = int(val)
     elif opt == '--filter':
@@ -4721,12 +6204,15 @@ def ParseArguments(args):
       try:
           _valid_extensions = set(val.split(','))
       except ValueError:
-          PrintUsage('Extensions must be comma seperated list.')
+          PrintUsage('Extensions must be comma separated list.')
+    elif opt == '--headers':
+      ProcessHppHeadersOption(val)
 
   if not filenames:
     PrintUsage('No files were specified.')
 
   _SetOutputFormat(output_format)
+  _SetQuiet(quiet)
   _SetVerboseLevel(verbosity)
   _SetFilters(filters)
   _SetCountingStyle(counting_style)
@@ -4747,7 +6233,9 @@ def main():
   _cpplint_state.ResetErrorCounts()
   for filename in filenames:
     ProcessFile(filename, _cpplint_state.verbose_level)
-  _cpplint_state.PrintErrorCounts()
+  # If --quiet is passed, suppress printing error count unless there are errors.
+  if not _cpplint_state.quiet or _cpplint_state.error_count > 0:
+    _cpplint_state.PrintErrorCounts()
 
   sys.exit(_cpplint_state.error_count > 0)
 
diff --git a/tools/diff.py b/tools/diff.py
index a96c7db851..860a6b051b 100644
--- a/tools/diff.py
+++ b/tools/diff.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 ##  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 ##
 ##  Use of this source code is governed by a BSD-style license
diff --git a/tools/intersect-diffs.py b/tools/intersect-diffs.py
index 4dbafa90b7..590e687b47 100755
--- a/tools/intersect-diffs.py
+++ b/tools/intersect-diffs.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 ##  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 ##
 ##  Use of this source code is governed by a BSD-style license
@@ -69,7 +69,7 @@ def main():
                 break
 
     if out_hunks:
-        print FormatDiffHunks(out_hunks)
+        print(FormatDiffHunks(out_hunks))
         sys.exit(1)
 
 if __name__ == "__main__":
diff --git a/tools/lint-hunks.py b/tools/lint-hunks.py
index 6e25d93624..0a94afebb9 100755
--- a/tools/lint-hunks.py
+++ b/tools/lint-hunks.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python3
 ##  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 ##
 ##  Use of this source code is governed by a BSD-style license
@@ -10,7 +10,7 @@
 """Performs style checking on each diff hunk."""
 import getopt
 import os
-import StringIO
+import io
 import subprocess
 import sys
 
@@ -63,21 +63,21 @@ def main(argv=None):
     try:
         try:
             opts, args = getopt.getopt(argv[1:], SHORT_OPTIONS, LONG_OPTIONS)
-        except getopt.error, msg:
+        except getopt.error as msg:
             raise Usage(msg)
 
         # process options
         for o, _ in opts:
             if o in ("-h", "--help"):
-                print __doc__
+                print(__doc__)
                 sys.exit(0)
 
         if args and len(args) > 1:
-            print __doc__
+            print(__doc__)
             sys.exit(0)
 
         # Find the fully qualified path to the root of the tree
-        tl = Subprocess(TOPLEVEL_CMD, stdout=subprocess.PIPE)
+        tl = Subprocess(TOPLEVEL_CMD, stdout=subprocess.PIPE, text=True)
         tl = tl.communicate()[0].strip()
 
         # See if we're working on the index or not.
@@ -93,9 +93,9 @@ def main(argv=None):
 
         # Get a list of all affected lines
         file_affected_line_map = {}
-        p = Subprocess(diff_cmd, stdout=subprocess.PIPE)
+        p = Subprocess(diff_cmd, stdout=subprocess.PIPE, text=True)
         stdout = p.communicate()[0]
-        for hunk in diff.ParseDiffHunks(StringIO.StringIO(stdout)):
+        for hunk in diff.ParseDiffHunks(io.StringIO(stdout)):
             filename = hunk.right.filename[2:]
             if filename not in file_affected_line_map:
                 file_affected_line_map[filename] = set()
@@ -103,21 +103,25 @@ def main(argv=None):
 
         # Run each affected file through cpplint
         lint_failed = False
-        for filename, affected_lines in file_affected_line_map.iteritems():
+        for filename, affected_lines in file_affected_line_map.items():
             if filename.split(".")[-1] not in ("c", "h", "cc"):
                 continue
+            if filename.startswith("third_party"):
+                continue
 
             if args:
                 # File contents come from git
                 show_cmd = SHOW_CMD + [args[0] + ":" + filename]
-                show = Subprocess(show_cmd, stdout=subprocess.PIPE)
+                show = Subprocess(show_cmd, stdout=subprocess.PIPE, text=True)
                 lint = Subprocess(cpplint_cmd, expected_returncode=(0, 1),
-                                  stdin=show.stdout, stderr=subprocess.PIPE)
+                                  stdin=show.stdout, stderr=subprocess.PIPE,
+                                  text=True)
                 lint_out = lint.communicate()[1]
             else:
                 # File contents come from the working tree
                 lint = Subprocess(cpplint_cmd, expected_returncode=(0, 1),
-                                  stdin=subprocess.PIPE, stderr=subprocess.PIPE)
+                                  stdin=subprocess.PIPE, stderr=subprocess.PIPE,
+                                  text=True)
                 stdin = open(os.path.join(tl, filename)).read()
                 lint_out = lint.communicate(stdin)[1]
 
@@ -127,17 +131,17 @@ def main(argv=None):
                     continue
                 warning_line_num = int(fields[1])
                 if warning_line_num in affected_lines:
-                    print "%s:%d:%s"%(filename, warning_line_num,
-                                      ":".join(fields[2:]))
+                    print("%s:%d:%s"%(filename, warning_line_num,
+                                      ":".join(fields[2:])))
                     lint_failed = True
 
         # Set exit code if any relevant lint errors seen
         if lint_failed:
             return 1
 
-    except Usage, err:
-        print >>sys.stderr, err
-        print >>sys.stderr, "for help use --help"
+    except Usage as err:
+        print(err, file=sys.stderr)
+        print("for help use --help", file=sys.stderr)
         return 2
 
 if __name__ == "__main__":
diff --git a/tools/tiny_ssim.c b/tools/tiny_ssim.c
index ff4634ade4..8fba814621 100644
--- a/tools/tiny_ssim.c
+++ b/tools/tiny_ssim.c
@@ -425,20 +425,24 @@ int main(int argc, char *argv[]) {
       break;
     }
 #if CONFIG_VP9_HIGHBITDEPTH
-#define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h)                            \
-  if (bit_depth < 9) {                                                         \
-    ssim = ssim2(buf0, buf1, w, w, w, h);                                      \
-    psnr = calc_plane_error(buf0, w, buf1, w, w, h);                           \
-  } else {                                                                     \
-    ssim = highbd_ssim2(CONVERT_TO_BYTEPTR(buf0), CONVERT_TO_BYTEPTR(buf1), w, \
-                        w, w, h, bit_depth);                                   \
-    psnr = calc_plane_error16(CAST_TO_SHORTPTR(buf0), w,                       \
-                              CAST_TO_SHORTPTR(buf1), w, w, h);                \
-  }
+#define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h)                           \
+  do {                                                                        \
+    if (bit_depth < 9) {                                                      \
+      ssim = ssim2(buf0, buf1, w, w, w, h);                                   \
+      psnr = calc_plane_error(buf0, w, buf1, w, w, h);                        \
+    } else {                                                                  \
+      ssim = highbd_ssim2(CONVERT_TO_BYTEPTR(buf0), CONVERT_TO_BYTEPTR(buf1), \
+                          w, w, w, h, bit_depth);                             \
+      psnr = calc_plane_error16(CAST_TO_SHORTPTR(buf0), w,                    \
+                                CAST_TO_SHORTPTR(buf1), w, w, h);             \
+    }                                                                         \
+  } while (0)
 #else
-#define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h) \
-  ssim = ssim2(buf0, buf1, w, w, w, h);             \
-  psnr = calc_plane_error(buf0, w, buf1, w, w, h);
+#define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h)  \
+  do {                                               \
+    ssim = ssim2(buf0, buf1, w, w, w, h);            \
+    psnr = calc_plane_error(buf0, w, buf1, w, w, h); \
+  } while (0)
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
     if (n_frames == allocated_frames) {
@@ -449,6 +453,10 @@ int main(int argc, char *argv[]) {
       psnry = realloc(psnry, allocated_frames * sizeof(*psnry));
       psnru = realloc(psnru, allocated_frames * sizeof(*psnru));
       psnrv = realloc(psnrv, allocated_frames * sizeof(*psnrv));
+      if (!(ssimy && ssimu && ssimv && psnry && psnru && psnrv)) {
+        fprintf(stderr, "Error allocating SSIM/PSNR data.\n");
+        exit(EXIT_FAILURE);
+      }
     }
     psnr_and_ssim(ssimy[n_frames], psnry[n_frames], y[0], y[1], w, h);
     psnr_and_ssim(ssimu[n_frames], psnru[n_frames], u[0], u[1], (w + 1) / 2,
diff --git a/tools/wrap-commit-msg.py b/tools/wrap-commit-msg.py
index d5b4b046b1..ba3fa58732 100755
--- a/tools/wrap-commit-msg.py
+++ b/tools/wrap-commit-msg.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 ##  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 ##
 ##  Use of this source code is governed by a BSD-style license
diff --git a/tools_common.c b/tools_common.c
index cbecfbb419..5c13781513 100644
--- a/tools_common.c
+++ b/tools_common.c
@@ -24,6 +24,8 @@
 #include "vpx/vp8dx.h"
 #endif
 
+#include "vpx/vpx_codec.h"
+
 #if defined(_WIN32) || defined(__OS2__)
 #include <io.h>
 #include <fcntl.h>
@@ -77,8 +79,8 @@ void warn(const char *fmt, ...) { LOG_ERROR("Warning"); }
 void die_codec(vpx_codec_ctx_t *ctx, const char *s) {
   const char *detail = vpx_codec_error_detail(ctx);
 
-  printf("%s: %s\n", s, vpx_codec_error(ctx));
-  if (detail) printf("    %s\n", detail);
+  fprintf(stderr, "%s: %s\n", s, vpx_codec_error(ctx));
+  if (detail) fprintf(stderr, "    %s\n", detail);
   exit(EXIT_FAILURE);
 }
 
@@ -375,7 +377,7 @@ static void highbd_img_upshift(vpx_image_t *dst, vpx_image_t *src,
     case VPX_IMG_FMT_I42216:
     case VPX_IMG_FMT_I44416:
     case VPX_IMG_FMT_I44016: break;
-    default: fatal("Unsupported image conversion"); break;
+    default: fatal("Unsupported image conversion");
   }
   for (plane = 0; plane < 3; plane++) {
     int w = src->d_w;
@@ -411,7 +413,7 @@ static void lowbd_img_upshift(vpx_image_t *dst, vpx_image_t *src,
     case VPX_IMG_FMT_I422:
     case VPX_IMG_FMT_I444:
     case VPX_IMG_FMT_I440: break;
-    default: fatal("Unsupported image conversion"); break;
+    default: fatal("Unsupported image conversion");
   }
   for (plane = 0; plane < 3; plane++) {
     int w = src->d_w;
@@ -452,7 +454,7 @@ void vpx_img_truncate_16_to_8(vpx_image_t *dst, vpx_image_t *src) {
     case VPX_IMG_FMT_I422:
     case VPX_IMG_FMT_I444:
     case VPX_IMG_FMT_I440: break;
-    default: fatal("Unsupported image conversion"); break;
+    default: fatal("Unsupported image conversion");
   }
   for (plane = 0; plane < 3; plane++) {
     int w = src->d_w;
@@ -487,7 +489,7 @@ static void highbd_img_downshift(vpx_image_t *dst, vpx_image_t *src,
     case VPX_IMG_FMT_I42216:
     case VPX_IMG_FMT_I44416:
     case VPX_IMG_FMT_I44016: break;
-    default: fatal("Unsupported image conversion"); break;
+    default: fatal("Unsupported image conversion");
   }
   for (plane = 0; plane < 3; plane++) {
     int w = src->d_w;
@@ -521,7 +523,7 @@ static void lowbd_img_downshift(vpx_image_t *dst, vpx_image_t *src,
     case VPX_IMG_FMT_I422:
     case VPX_IMG_FMT_I444:
     case VPX_IMG_FMT_I440: break;
-    default: fatal("Unsupported image conversion"); break;
+    default: fatal("Unsupported image conversion");
   }
   for (plane = 0; plane < 3; plane++) {
     int w = src->d_w;
diff --git a/tools_common.h b/tools_common.h
index 4526d9f165..e2942d04b8 100644
--- a/tools_common.h
+++ b/tools_common.h
@@ -32,7 +32,12 @@ typedef int64_t FileOffset;
 #define fseeko fseeko64
 #define ftello ftello64
 typedef off64_t FileOffset;
-#elif CONFIG_OS_SUPPORT
+#elif CONFIG_OS_SUPPORT &&                                                  \
+    !(defined(__ANDROID__) && __ANDROID_API__ < 24 && !defined(__LP64__) && \
+      defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS == 64)
+/* POSIX.1 has fseeko and ftello. fseeko and ftello are not available before
+ * Android API level 24. See
+ * https://android.googlesource.com/platform/bionic/+/main/docs/32-bit-abi.md */
 #include <sys/types.h> /* NOLINT */
 typedef off_t FileOffset;
 /* Use 32-bit file operations in WebM file format when building ARM
@@ -110,30 +115,44 @@ extern "C" {
 
 #if defined(__GNUC__)
 #define VPX_NO_RETURN __attribute__((noreturn))
+#elif defined(_MSC_VER)
+#define VPX_NO_RETURN __declspec(noreturn)
 #else
 #define VPX_NO_RETURN
 #endif
 
+// Tells the compiler to perform `printf` format string checking if the
+// compiler supports it; see the 'format' attribute in
+// <https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html>.
+#define VPX_TOOLS_FORMAT_PRINTF(string_index, first_to_check)
+#if defined(__has_attribute)
+#if __has_attribute(format)
+#undef VPX_TOOLS_FORMAT_PRINTF
+#define VPX_TOOLS_FORMAT_PRINTF(string_index, first_to_check) \
+  __attribute__((__format__(__printf__, string_index, first_to_check)))
+#endif
+#endif
+
 /* Sets a stdio stream into binary mode */
 FILE *set_binary_mode(FILE *stream);
 
-void die(const char *fmt, ...) VPX_NO_RETURN;
-void fatal(const char *fmt, ...) VPX_NO_RETURN;
-void warn(const char *fmt, ...);
+VPX_NO_RETURN void die(const char *fmt, ...) VPX_TOOLS_FORMAT_PRINTF(1, 2);
+VPX_NO_RETURN void fatal(const char *fmt, ...) VPX_TOOLS_FORMAT_PRINTF(1, 2);
+void warn(const char *fmt, ...) VPX_TOOLS_FORMAT_PRINTF(1, 2);
 
-void die_codec(vpx_codec_ctx_t *ctx, const char *s) VPX_NO_RETURN;
+VPX_NO_RETURN void die_codec(vpx_codec_ctx_t *ctx, const char *s);
 
 /* The tool including this file must define usage_exit() */
-void usage_exit(void) VPX_NO_RETURN;
+VPX_NO_RETURN void usage_exit(void);
 
 #undef VPX_NO_RETURN
 
 int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame);
 
 typedef struct VpxInterface {
-  const char *const name;
-  const uint32_t fourcc;
-  vpx_codec_iface_t *(*const codec_interface)();
+  const char *name;
+  uint32_t fourcc;
+  vpx_codec_iface_t *(*codec_interface)(void);
 } VpxInterface;
 
 int get_vpx_encoder_count(void);
diff --git a/vp8/common/arm/neon/sixtappredict_neon.c b/vp8/common/arm/neon/sixtappredict_neon.c
index 48e86d3278..ee3c281f0f 100644
--- a/vp8/common/arm/neon/sixtappredict_neon.c
+++ b/vp8/common/arm/neon/sixtappredict_neon.c
@@ -16,7 +16,7 @@
 #include "vpx_ports/mem.h"
 
 static const int8_t vp8_sub_pel_filters[8][8] = {
-  { 0, 0, 128, 0, 0, 0, 0, 0 },     /* note that 1/8 pel positionyys are */
+  { 0, 0, 128, 0, 0, 0, 0, 0 },     /* note that 1/8 pel positions are */
   { 0, -6, 123, 12, -1, 0, 0, 0 },  /*    just as per alpha -0.5 bicubic */
   { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */
   { 0, -9, 93, 50, -6, 0, 0, 0 },
@@ -781,7 +781,6 @@ void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
   vst1_u8(dst_ptr, d8u8);
   dst_ptr += dst_pitch;
   vst1_u8(dst_ptr, d9u8);
-  return;
 }
 
 void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line,
@@ -1250,7 +1249,6 @@ void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line,
     vst1_u8(dst_ptr, d9u8);
     dst_ptr += dst_pitch;
   }
-  return;
 }
 
 void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr,
@@ -1504,7 +1502,9 @@ void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr,
     src += src_pixels_per_line;
     d12u8 = vld1_u8(src);
     d13u8 = vld1_u8(src + 8);
-    d14u8 = vld1_u8(src + 16);
+    // Only 5 pixels are needed, avoid a potential out of bounds read.
+    d14u8 = vld1_u8(src + 13);
+    d14u8 = vext_u8(d14u8, d14u8, 3);
     src += src_pixels_per_line;
 
     __builtin_prefetch(src);
@@ -1726,5 +1726,4 @@ void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr,
       dst += dst_pitch;
     }
   }
-  return;
 }
diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h
index 02abe053cb..8300aad941 100644
--- a/vp8/common/blockd.h
+++ b/vp8/common/blockd.h
@@ -58,7 +58,7 @@ typedef struct {
 extern const unsigned char vp8_block2left[25];
 extern const unsigned char vp8_block2above[25];
 
-#define VP8_COMBINEENTROPYCONTEXTS(Dest, A, B) Dest = (A) + (B);
+#define VP8_COMBINEENTROPYCONTEXTS(Dest, A, B) Dest = (A) + (B)
 
 typedef enum { KEY_FRAME = 0, INTER_FRAME = 1 } FRAME_TYPE;
 
@@ -251,7 +251,7 @@ typedef struct macroblockd {
   unsigned char update_mb_segmentation_data;
 
   /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */
-  unsigned char mb_segement_abs_delta;
+  unsigned char mb_segment_abs_delta;
 
   /* Per frame flags that define which MB level features (such as quantizer or
    * loop filter level) */
diff --git a/vp8/common/common.h b/vp8/common/common.h
index 2c30e8d6c5..562569f9ab 100644
--- a/vp8/common/common.h
+++ b/vp8/common/common.h
@@ -24,22 +24,22 @@ extern "C" {
 /* Only need this for fixed-size arrays, for structs just assign. */
 
 #define vp8_copy(Dest, Src)              \
-  {                                      \
+  do {                                   \
     assert(sizeof(Dest) == sizeof(Src)); \
     memcpy(Dest, Src, sizeof(Src));      \
-  }
+  } while (0)
 
 /* Use this for variably-sized arrays. */
 
 #define vp8_copy_array(Dest, Src, N)           \
-  {                                            \
+  do {                                         \
     assert(sizeof(*(Dest)) == sizeof(*(Src))); \
     memcpy(Dest, Src, (N) * sizeof(*(Src)));   \
-  }
+  } while (0)
 
-#define vp8_zero(Dest) memset(&(Dest), 0, sizeof(Dest));
+#define vp8_zero(Dest) memset(&(Dest), 0, sizeof(Dest))
 
-#define vp8_zero_array(Dest, N) memset(Dest, 0, (N) * sizeof(*(Dest)));
+#define vp8_zero_array(Dest, N) memset(Dest, 0, (N) * sizeof(*(Dest)))
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp8/common/findnearmv.c b/vp8/common/findnearmv.c
index 6889fdedde..3b31923621 100644
--- a/vp8/common/findnearmv.c
+++ b/vp8/common/findnearmv.c
@@ -105,9 +105,9 @@ void vp8_find_near_mvs(MACROBLOCKD *xd, const MODE_INFO *here, int_mv *nearest,
     tmp = near_mv_ref_cnts[CNT_NEAREST];
     near_mv_ref_cnts[CNT_NEAREST] = near_mv_ref_cnts[CNT_NEAR];
     near_mv_ref_cnts[CNT_NEAR] = tmp;
-    tmp = near_mvs[CNT_NEAREST].as_int;
+    tmp = (int)near_mvs[CNT_NEAREST].as_int;
     near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int;
-    near_mvs[CNT_NEAR].as_int = tmp;
+    near_mvs[CNT_NEAR].as_int = (uint32_t)tmp;
   }
 
   /* Use near_mvs[0] to store the "best" MV */
diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c
index 75ce7ef359..71529bdfd8 100644
--- a/vp8/common/generic/systemdependent.c
+++ b/vp8/common/generic/systemdependent.c
@@ -16,6 +16,10 @@
 #include "vpx_ports/x86.h"
 #elif VPX_ARCH_PPC
 #include "vpx_ports/ppc.h"
+#elif VPX_ARCH_MIPS
+#include "vpx_ports/mips.h"
+#elif VPX_ARCH_LOONGARCH
+#include "vpx_ports/loongarch.h"
 #endif
 #include "vp8/common/onyxc_int.h"
 #include "vp8/common/systemdependent.h"
@@ -96,6 +100,10 @@ void vp8_machine_specific_config(VP8_COMMON *ctx) {
   ctx->cpu_caps = x86_simd_caps();
 #elif VPX_ARCH_PPC
   ctx->cpu_caps = ppc_simd_caps();
+#elif VPX_ARCH_MIPS
+  ctx->cpu_caps = mips_cpu_caps();
+#elif VPX_ARCH_LOONGARCH
+  ctx->cpu_caps = loongarch_cpu_caps();
 #else
   // generic-gnu targets.
   ctx->cpu_caps = 0;
diff --git a/vp8/common/loongarch/idct_lsx.c b/vp8/common/loongarch/idct_lsx.c
new file mode 100644
index 0000000000..eee871eec4
--- /dev/null
+++ b/vp8/common/loongarch/idct_lsx.c
@@ -0,0 +1,322 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/blockd.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+static const int32_t cospi8sqrt2minus1 = 20091;
+static const int32_t sinpi8sqrt2 = 35468;
+
+#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3)    \
+  do {                                                                    \
+    __m128i tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
+                                                                          \
+    DUP2_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, tmp0_m, tmp1_m);         \
+    DUP2_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, tmp2_m, tmp3_m);         \
+    DUP2_ARG2(__lsx_vilvl_w, tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \
+    DUP2_ARG2(__lsx_vilvh_w, tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \
+  } while (0)
+
+#define TRANSPOSE_TWO_4x4_H(in0, in1, in2, in3, out0, out1, out2, out3) \
+  do {                                                                  \
+    __m128i s4_m, s5_m, s6_m, s7_m;                                     \
+                                                                        \
+    TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, s4_m, s5_m, s6_m, s7_m);     \
+    DUP2_ARG2(__lsx_vilvl_d, s6_m, s4_m, s7_m, s5_m, out0, out2);       \
+    out1 = __lsx_vilvh_d(s6_m, s4_m);                                   \
+    out3 = __lsx_vilvh_d(s7_m, s5_m);                                   \
+  } while (0)
+
+#define EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in0, in1)   \
+  do {                                                        \
+    __m128i zero_m = __lsx_vldi(0);                           \
+    __m128i tmp1_m, tmp2_m;                                   \
+    __m128i sinpi8_sqrt2_m = __lsx_vreplgr2vr_w(sinpi8sqrt2); \
+                                                              \
+    tmp1_m = __lsx_vilvl_h(in0, zero_m);                      \
+    tmp2_m = __lsx_vilvh_h(in0, zero_m);                      \
+    tmp1_m = __lsx_vsrai_w(tmp1_m, 16);                       \
+    tmp2_m = __lsx_vsrai_w(tmp2_m, 16);                       \
+    tmp1_m = __lsx_vmul_w(tmp1_m, sinpi8_sqrt2_m);            \
+    tmp1_m = __lsx_vsrai_w(tmp1_m, 16);                       \
+    tmp2_m = __lsx_vmul_w(tmp2_m, sinpi8_sqrt2_m);            \
+    tmp2_m = __lsx_vsrai_w(tmp2_m, 16);                       \
+    in1 = __lsx_vpickev_h(tmp2_m, tmp1_m);                    \
+  } while (0)
+
+#define VP8_IDCT_1D_H(in0, in1, in2, in3, out0, out1, out2, out3)      \
+  do {                                                                 \
+    __m128i a1_m, b1_m, c1_m, d1_m;                                    \
+    __m128i c_tmp1_m, c_tmp2_m;                                        \
+    __m128i d_tmp1_m, d_tmp2_m;                                        \
+    __m128i const_cospi8sqrt2minus1_m;                                 \
+                                                                       \
+    const_cospi8sqrt2minus1_m = __lsx_vreplgr2vr_h(cospi8sqrt2minus1); \
+    a1_m = __lsx_vadd_h(in0, in2);                                     \
+    b1_m = __lsx_vsub_h(in0, in2);                                     \
+    EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in1, c_tmp1_m);          \
+                                                                       \
+    c_tmp2_m = __lsx_vmuh_h(in3, const_cospi8sqrt2minus1_m);           \
+    c_tmp2_m = __lsx_vslli_h(c_tmp2_m, 1);                             \
+    c_tmp2_m = __lsx_vsrai_h(c_tmp2_m, 1);                             \
+    c_tmp2_m = __lsx_vadd_h(in3, c_tmp2_m);                            \
+    c1_m = __lsx_vsub_h(c_tmp1_m, c_tmp2_m);                           \
+                                                                       \
+    d_tmp1_m = __lsx_vmuh_h(in1, const_cospi8sqrt2minus1_m);           \
+    d_tmp1_m = __lsx_vslli_h(d_tmp1_m, 1);                             \
+    d_tmp1_m = __lsx_vsrai_h(d_tmp1_m, 1);                             \
+    d_tmp1_m = __lsx_vadd_h(in1, d_tmp1_m);                            \
+    EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in3, d_tmp2_m);          \
+    d1_m = __lsx_vadd_h(d_tmp1_m, d_tmp2_m);                           \
+    LSX_BUTTERFLY_4_H(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \
+  } while (0)
+
+#define VP8_IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3)      \
+  do {                                                                 \
+    __m128i a1_m, b1_m, c1_m, d1_m;                                    \
+    __m128i c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m;                    \
+    __m128i const_cospi8sqrt2minus1_m, sinpi8_sqrt2_m;                 \
+                                                                       \
+    const_cospi8sqrt2minus1_m = __lsx_vreplgr2vr_w(cospi8sqrt2minus1); \
+    sinpi8_sqrt2_m = __lsx_vreplgr2vr_w(sinpi8sqrt2);                  \
+    a1_m = __lsx_vadd_w(in0, in2);                                     \
+    b1_m = __lsx_vsub_w(in0, in2);                                     \
+    c_tmp1_m = __lsx_vmul_w(in1, sinpi8_sqrt2_m);                      \
+    c_tmp1_m = __lsx_vsrai_w(c_tmp1_m, 16);                            \
+    c_tmp2_m = __lsx_vmul_w(in3, const_cospi8sqrt2minus1_m);           \
+    c_tmp2_m = __lsx_vsrai_w(c_tmp2_m, 16);                            \
+    c_tmp2_m = __lsx_vadd_w(in3, c_tmp2_m);                            \
+    c1_m = __lsx_vsub_w(c_tmp1_m, c_tmp2_m);                           \
+    d_tmp1_m = __lsx_vmul_w(in1, const_cospi8sqrt2minus1_m);           \
+    d_tmp1_m = __lsx_vsrai_w(d_tmp1_m, 16);                            \
+    d_tmp1_m = __lsx_vadd_w(in1, d_tmp1_m);                            \
+    d_tmp2_m = __lsx_vmul_w(in3, sinpi8_sqrt2_m);                      \
+    d_tmp2_m = __lsx_vsrai_w(d_tmp2_m, 16);                            \
+    d1_m = __lsx_vadd_w(d_tmp1_m, d_tmp2_m);                           \
+    LSX_BUTTERFLY_4_W(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \
+  } while (0)
+
+#define UNPCK_SH_SW(in, out0, out1)  \
+  do {                               \
+    out0 = __lsx_vsllwil_w_h(in, 0); \
+    out1 = __lsx_vexth_w_h(in);      \
+  } while (0)
+
+static void idct4x4_addconst_lsx(int16_t in_dc, uint8_t *pred,
+                                 int32_t pred_stride, uint8_t *dest,
+                                 int32_t dest_stride) {
+  __m128i vec, res0, res1, res2, res3, dst0, dst1;
+  __m128i pred0, pred1, pred2, pred3;
+  __m128i zero = __lsx_vldi(0);
+
+  int32_t pred_stride2 = pred_stride << 1;
+  int32_t pred_stride3 = pred_stride2 + pred_stride;
+
+  vec = __lsx_vreplgr2vr_h(in_dc);
+  vec = __lsx_vsrari_h(vec, 3);
+  pred0 = __lsx_vld(pred, 0);
+  DUP2_ARG2(__lsx_vldx, pred, pred_stride, pred, pred_stride2, pred1, pred2);
+  pred3 = __lsx_vldx(pred, pred_stride3);
+  DUP4_ARG2(__lsx_vilvl_b, zero, pred0, zero, pred1, zero, pred2, zero, pred3,
+            res0, res1, res2, res3);
+  DUP4_ARG2(__lsx_vadd_h, res0, vec, res1, vec, res2, vec, res3, vec, res0,
+            res1, res2, res3);
+  res0 = __lsx_vclip255_h(res0);
+  res1 = __lsx_vclip255_h(res1);
+  res2 = __lsx_vclip255_h(res2);
+  res3 = __lsx_vclip255_h(res3);
+
+  DUP2_ARG2(__lsx_vpickev_b, res1, res0, res3, res2, dst0, dst1);
+  dst0 = __lsx_vpickev_w(dst1, dst0);
+  __lsx_vstelm_w(dst0, dest, 0, 0);
+  dest += dest_stride;
+  __lsx_vstelm_w(dst0, dest, 0, 1);
+  dest += dest_stride;
+  __lsx_vstelm_w(dst0, dest, 0, 2);
+  dest += dest_stride;
+  __lsx_vstelm_w(dst0, dest, 0, 3);
+}
+
+void vp8_dc_only_idct_add_lsx(int16_t input_dc, uint8_t *pred_ptr,
+                              int32_t pred_stride, uint8_t *dst_ptr,
+                              int32_t dst_stride) {
+  idct4x4_addconst_lsx(input_dc, pred_ptr, pred_stride, dst_ptr, dst_stride);
+}
+
+static void dequant_idct4x4_addblk_2x_lsx(int16_t *input,
+                                          int16_t *dequant_input, uint8_t *dest,
+                                          int32_t dest_stride) {
+  __m128i dest0, dest1, dest2, dest3;
+  __m128i in0, in1, in2, in3, mul0, mul1, mul2, mul3, dequant_in0, dequant_in1;
+  __m128i hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3, res0, res1, res2, res3;
+  __m128i hz0l, hz1l, hz2l, hz3l, hz0r, hz1r, hz2r, hz3r;
+  __m128i vt0l, vt1l, vt2l, vt3l, vt0r, vt1r, vt2r, vt3r;
+  __m128i zero = __lsx_vldi(0);
+
+  int32_t dest_stride2 = dest_stride << 1;
+  int32_t dest_stride3 = dest_stride2 + dest_stride;
+
+  DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48, in0, in1, in2,
+            in3);
+  DUP2_ARG2(__lsx_vld, dequant_input, 0, dequant_input, 16, dequant_in0,
+            dequant_in1);
+
+  DUP4_ARG2(__lsx_vmul_h, in0, dequant_in0, in1, dequant_in1, in2, dequant_in0,
+            in3, dequant_in1, mul0, mul1, mul2, mul3);
+  DUP2_ARG2(__lsx_vpickev_d, mul2, mul0, mul3, mul1, in0, in2);
+  DUP2_ARG2(__lsx_vpickod_d, mul2, mul0, mul3, mul1, in1, in3);
+
+  VP8_IDCT_1D_H(in0, in1, in2, in3, hz0, hz1, hz2, hz3);
+  TRANSPOSE_TWO_4x4_H(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
+  UNPCK_SH_SW(hz0, hz0r, hz0l);
+  UNPCK_SH_SW(hz1, hz1r, hz1l);
+  UNPCK_SH_SW(hz2, hz2r, hz2l);
+  UNPCK_SH_SW(hz3, hz3r, hz3l);
+  VP8_IDCT_1D_W(hz0l, hz1l, hz2l, hz3l, vt0l, vt1l, vt2l, vt3l);
+  DUP4_ARG2(__lsx_vsrari_w, vt0l, 3, vt1l, 3, vt2l, 3, vt3l, 3, vt0l, vt1l,
+            vt2l, vt3l);
+  VP8_IDCT_1D_W(hz0r, hz1r, hz2r, hz3r, vt0r, vt1r, vt2r, vt3r);
+  DUP4_ARG2(__lsx_vsrari_w, vt0r, 3, vt1r, 3, vt2r, 3, vt3r, 3, vt0r, vt1r,
+            vt2r, vt3r);
+  DUP4_ARG2(__lsx_vpickev_h, vt0l, vt0r, vt1l, vt1r, vt2l, vt2r, vt3l, vt3r,
+            vt0, vt1, vt2, vt3);
+  TRANSPOSE_TWO_4x4_H(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
+  dest0 = __lsx_vld(dest, 0);
+  DUP2_ARG2(__lsx_vldx, dest, dest_stride, dest, dest_stride2, dest1, dest2);
+  dest3 = __lsx_vldx(dest, dest_stride3);
+  DUP4_ARG2(__lsx_vilvl_b, zero, dest0, zero, dest1, zero, dest2, zero, dest3,
+            res0, res1, res2, res3);
+  DUP4_ARG2(__lsx_vadd_h, res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0,
+            res1, res2, res3);
+
+  res0 = __lsx_vclip255_h(res0);
+  res1 = __lsx_vclip255_h(res1);
+  res2 = __lsx_vclip255_h(res2);
+  res3 = __lsx_vclip255_h(res3);
+  DUP2_ARG2(__lsx_vpickev_b, res1, res0, res3, res2, vt0l, vt1l);
+
+  __lsx_vstelm_d(vt0l, dest, 0, 0);
+  __lsx_vstelm_d(vt0l, dest + dest_stride, 0, 1);
+  __lsx_vstelm_d(vt1l, dest + dest_stride2, 0, 0);
+  __lsx_vstelm_d(vt1l, dest + dest_stride3, 0, 1);
+
+  __lsx_vst(zero, input, 0);
+  __lsx_vst(zero, input, 16);
+  __lsx_vst(zero, input, 32);
+  __lsx_vst(zero, input, 48);
+}
+
+static void dequant_idct_addconst_2x_lsx(int16_t *input, int16_t *dequant_input,
+                                         uint8_t *dest, int32_t dest_stride) {
+  __m128i input_dc0, input_dc1, vec, res0, res1, res2, res3;
+  __m128i dest0, dest1, dest2, dest3;
+  __m128i zero = __lsx_vldi(0);
+  int32_t dest_stride2 = dest_stride << 1;
+  int32_t dest_stride3 = dest_stride2 + dest_stride;
+
+  input_dc0 = __lsx_vreplgr2vr_h(input[0] * dequant_input[0]);
+  input_dc1 = __lsx_vreplgr2vr_h(input[16] * dequant_input[0]);
+  DUP2_ARG2(__lsx_vsrari_h, input_dc0, 3, input_dc1, 3, input_dc0, input_dc1);
+  vec = __lsx_vpickev_d(input_dc1, input_dc0);
+  input[0] = 0;
+  input[16] = 0;
+  dest0 = __lsx_vld(dest, 0);
+  DUP2_ARG2(__lsx_vldx, dest, dest_stride, dest, dest_stride2, dest1, dest2);
+  dest3 = __lsx_vldx(dest, dest_stride3);
+  DUP4_ARG2(__lsx_vilvl_b, zero, dest0, zero, dest1, zero, dest2, zero, dest3,
+            res0, res1, res2, res3);
+  DUP4_ARG2(__lsx_vadd_h, res0, vec, res1, vec, res2, vec, res3, vec, res0,
+            res1, res2, res3);
+  res0 = __lsx_vclip255_h(res0);
+  res1 = __lsx_vclip255_h(res1);
+  res2 = __lsx_vclip255_h(res2);
+  res3 = __lsx_vclip255_h(res3);
+
+  DUP2_ARG2(__lsx_vpickev_b, res1, res0, res3, res2, res0, res1);
+  __lsx_vstelm_d(res0, dest, 0, 0);
+  __lsx_vstelm_d(res0, dest + dest_stride, 0, 1);
+  __lsx_vstelm_d(res1, dest + dest_stride2, 0, 0);
+  __lsx_vstelm_d(res1, dest + dest_stride3, 0, 1);
+}
+
+void vp8_dequant_idct_add_y_block_lsx(int16_t *q, int16_t *dq, uint8_t *dst,
+                                      int32_t stride, char *eobs) {
+  int16_t *eobs_h = (int16_t *)eobs;
+  uint8_t i;
+
+  for (i = 4; i--;) {
+    if (eobs_h[0]) {
+      if (eobs_h[0] & 0xfefe) {
+        dequant_idct4x4_addblk_2x_lsx(q, dq, dst, stride);
+      } else {
+        dequant_idct_addconst_2x_lsx(q, dq, dst, stride);
+      }
+    }
+
+    q += 32;
+
+    if (eobs_h[1]) {
+      if (eobs_h[1] & 0xfefe) {
+        dequant_idct4x4_addblk_2x_lsx(q, dq, dst + 8, stride);
+      } else {
+        dequant_idct_addconst_2x_lsx(q, dq, dst + 8, stride);
+      }
+    }
+
+    q += 32;
+    dst += (4 * stride);
+    eobs_h += 2;
+  }
+}
+
+void vp8_dequant_idct_add_uv_block_lsx(int16_t *q, int16_t *dq, uint8_t *dst_u,
+                                       uint8_t *dst_v, int32_t stride,
+                                       char *eobs) {
+  int16_t *eobs_h = (int16_t *)eobs;
+  if (eobs_h[0]) {
+    if (eobs_h[0] & 0xfefe) {
+      dequant_idct4x4_addblk_2x_lsx(q, dq, dst_u, stride);
+    } else {
+      dequant_idct_addconst_2x_lsx(q, dq, dst_u, stride);
+    }
+  }
+
+  q += 32;
+  dst_u += (stride * 4);
+
+  if (eobs_h[1]) {
+    if (eobs_h[1] & 0xfefe) {
+      dequant_idct4x4_addblk_2x_lsx(q, dq, dst_u, stride);
+    } else {
+      dequant_idct_addconst_2x_lsx(q, dq, dst_u, stride);
+    }
+  }
+
+  q += 32;
+
+  if (eobs_h[2]) {
+    if (eobs_h[2] & 0xfefe) {
+      dequant_idct4x4_addblk_2x_lsx(q, dq, dst_v, stride);
+    } else {
+      dequant_idct_addconst_2x_lsx(q, dq, dst_v, stride);
+    }
+  }
+  q += 32;
+  dst_v += (stride * 4);
+
+  if (eobs_h[3]) {
+    if (eobs_h[3] & 0xfefe) {
+      dequant_idct4x4_addblk_2x_lsx(q, dq, dst_v, stride);
+    } else {
+      dequant_idct_addconst_2x_lsx(q, dq, dst_v, stride);
+    }
+  }
+}
diff --git a/vp8/common/loongarch/loopfilter_filters_lsx.c b/vp8/common/loongarch/loopfilter_filters_lsx.c
new file mode 100644
index 0000000000..79c3ea6dbb
--- /dev/null
+++ b/vp8/common/loongarch/loopfilter_filters_lsx.c
@@ -0,0 +1,743 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/loopfilter.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+#define VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev)        \
+  do {                                                       \
+    __m128i p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2; \
+    const __m128i cnst4b = __lsx_vldi(4);                    \
+    const __m128i cnst3b = __lsx_vldi(3);                    \
+                                                             \
+    p1_m = __lsx_vxori_b(p1, 0x80);                          \
+    p0_m = __lsx_vxori_b(p0, 0x80);                          \
+    q0_m = __lsx_vxori_b(q0, 0x80);                          \
+    q1_m = __lsx_vxori_b(q1, 0x80);                          \
+                                                             \
+    filt = __lsx_vssub_b(p1_m, q1_m);                        \
+    filt = __lsx_vand_v(filt, hev);                          \
+    q0_sub_p0 = __lsx_vssub_b(q0_m, p0_m);                   \
+    filt = __lsx_vsadd_b(filt, q0_sub_p0);                   \
+    filt = __lsx_vsadd_b(filt, q0_sub_p0);                   \
+    filt = __lsx_vsadd_b(filt, q0_sub_p0);                   \
+    filt = __lsx_vand_v(filt, mask);                         \
+    t1 = __lsx_vsadd_b(filt, cnst4b);                        \
+    t1 = __lsx_vsra_b(t1, cnst3b);                           \
+    t2 = __lsx_vsadd_b(filt, cnst3b);                        \
+    t2 = __lsx_vsra_b(t2, cnst3b);                           \
+    q0_m = __lsx_vssub_b(q0_m, t1);                          \
+    q0 = __lsx_vxori_b(q0_m, 0x80);                          \
+    p0_m = __lsx_vsadd_b(p0_m, t2);                          \
+    p0 = __lsx_vxori_b(p0_m, 0x80);                          \
+    filt = __lsx_vsrari_b(t1, 1);                            \
+    hev = __lsx_vxori_b(hev, 0xff);                          \
+    filt = __lsx_vand_v(filt, hev);                          \
+    q1_m = __lsx_vssub_b(q1_m, filt);                        \
+    q1 = __lsx_vxori_b(q1_m, 0x80);                          \
+    p1_m = __lsx_vsadd_b(p1_m, filt);                        \
+    p1 = __lsx_vxori_b(p1_m, 0x80);                          \
+  } while (0)
+
+#define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) \
+  do {                                                  \
+    __m128i p2_m, p1_m, p0_m, q2_m, q1_m, q0_m;         \
+    __m128i u, filt, t1, t2, filt_sign, q0_sub_p0;      \
+    __m128i filt_r, filt_l;                             \
+    __m128i temp0, temp1, temp2, temp3;                 \
+    const __m128i cnst4b = __lsx_vldi(4);               \
+    const __m128i cnst3b = __lsx_vldi(3);               \
+    const __m128i cnst9h = __lsx_vldi(1033);            \
+    const __m128i cnst63h = __lsx_vldi(1087);           \
+                                                        \
+    p2_m = __lsx_vxori_b(p2, 0x80);                     \
+    p1_m = __lsx_vxori_b(p1, 0x80);                     \
+    p0_m = __lsx_vxori_b(p0, 0x80);                     \
+    q0_m = __lsx_vxori_b(q0, 0x80);                     \
+    q1_m = __lsx_vxori_b(q1, 0x80);                     \
+    q2_m = __lsx_vxori_b(q2, 0x80);                     \
+                                                        \
+    filt = __lsx_vssub_b(p1_m, q1_m);                   \
+    q0_sub_p0 = __lsx_vssub_b(q0_m, p0_m);              \
+    filt = __lsx_vsadd_b(filt, q0_sub_p0);              \
+    filt = __lsx_vsadd_b(filt, q0_sub_p0);              \
+    filt = __lsx_vsadd_b(filt, q0_sub_p0);              \
+    filt = __lsx_vand_v(filt, mask);                    \
+                                                        \
+    t2 = __lsx_vand_v(filt, hev);                       \
+    hev = __lsx_vxori_b(hev, 0xff);                     \
+    filt = __lsx_vand_v(hev, filt);                     \
+    t1 = __lsx_vsadd_b(t2, cnst4b);                     \
+    t1 = __lsx_vsra_b(t1, cnst3b);                      \
+    t2 = __lsx_vsadd_b(t2, cnst3b);                     \
+    t2 = __lsx_vsra_b(t2, cnst3b);                      \
+    q0_m = __lsx_vssub_b(q0_m, t1);                     \
+    p0_m = __lsx_vsadd_b(p0_m, t2);                     \
+    filt_sign = __lsx_vslti_b(filt, 0);                 \
+    filt_r = __lsx_vilvl_b(filt_sign, filt);            \
+    filt_l = __lsx_vilvh_b(filt_sign, filt);            \
+    temp0 = __lsx_vmul_h(filt_r, cnst9h);               \
+    temp1 = __lsx_vadd_h(temp0, cnst63h);               \
+    temp2 = __lsx_vmul_h(filt_l, cnst9h);               \
+    temp3 = __lsx_vadd_h(temp2, cnst63h);               \
+                                                        \
+    u = __lsx_vssrani_b_h(temp3, temp1, 7);             \
+    q2_m = __lsx_vssub_b(q2_m, u);                      \
+    p2_m = __lsx_vsadd_b(p2_m, u);                      \
+    q2 = __lsx_vxori_b(q2_m, 0x80);                     \
+    p2 = __lsx_vxori_b(p2_m, 0x80);                     \
+                                                        \
+    temp1 = __lsx_vadd_h(temp1, temp0);                 \
+    temp3 = __lsx_vadd_h(temp3, temp2);                 \
+                                                        \
+    u = __lsx_vssrani_b_h(temp3, temp1, 7);             \
+    q1_m = __lsx_vssub_b(q1_m, u);                      \
+    p1_m = __lsx_vsadd_b(p1_m, u);                      \
+    q1 = __lsx_vxori_b(q1_m, 0x80);                     \
+    p1 = __lsx_vxori_b(p1_m, 0x80);                     \
+                                                        \
+    temp1 = __lsx_vadd_h(temp1, temp0);                 \
+    temp3 = __lsx_vadd_h(temp3, temp2);                 \
+                                                        \
+    u = __lsx_vssrani_b_h(temp3, temp1, 7);             \
+    q0_m = __lsx_vssub_b(q0_m, u);                      \
+    p0_m = __lsx_vsadd_b(p0_m, u);                      \
+    q0 = __lsx_vxori_b(q0_m, 0x80);                     \
+    p0 = __lsx_vxori_b(p0_m, 0x80);                     \
+  } while (0)
+
+#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
+                     limit_in, b_limit_in, thresh_in, hev_out, mask_out,     \
+                     flat_out)                                               \
+  do {                                                                       \
+    __m128i p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;          \
+    __m128i p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;          \
+                                                                             \
+    p3_asub_p2_m = __lsx_vabsd_bu(p3_in, p2_in);                             \
+    p2_asub_p1_m = __lsx_vabsd_bu(p2_in, p1_in);                             \
+    p1_asub_p0_m = __lsx_vabsd_bu(p1_in, p0_in);                             \
+    q1_asub_q0_m = __lsx_vabsd_bu(q1_in, q0_in);                             \
+    q2_asub_q1_m = __lsx_vabsd_bu(q2_in, q1_in);                             \
+    q3_asub_q2_m = __lsx_vabsd_bu(q3_in, q2_in);                             \
+    p0_asub_q0_m = __lsx_vabsd_bu(p0_in, q0_in);                             \
+    p1_asub_q1_m = __lsx_vabsd_bu(p1_in, q1_in);                             \
+    flat_out = __lsx_vmax_bu(p1_asub_p0_m, q1_asub_q0_m);                    \
+    hev_out = __lsx_vslt_bu(thresh_in, flat_out);                            \
+    p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p0_asub_q0_m);               \
+    p1_asub_q1_m = __lsx_vsrli_b(p1_asub_q1_m, 1);                           \
+    p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p1_asub_q1_m);               \
+    mask_out = __lsx_vslt_bu(b_limit_in, p0_asub_q0_m);                      \
+    mask_out = __lsx_vmax_bu(flat_out, mask_out);                            \
+    p3_asub_p2_m = __lsx_vmax_bu(p3_asub_p2_m, p2_asub_p1_m);                \
+    mask_out = __lsx_vmax_bu(p3_asub_p2_m, mask_out);                        \
+    q2_asub_q1_m = __lsx_vmax_bu(q2_asub_q1_m, q3_asub_q2_m);                \
+    mask_out = __lsx_vmax_bu(q2_asub_q1_m, mask_out);                        \
+    mask_out = __lsx_vslt_bu(limit_in, mask_out);                            \
+    mask_out = __lsx_vxori_b(mask_out, 0xff);                                \
+  } while (0)
+
+#define VP8_ST6x1_B(in0, in0_idx, in1, in1_idx, pdst, stride) \
+  do {                                                        \
+    __lsx_vstelm_w(in0, pdst, 0, in0_idx);                    \
+    __lsx_vstelm_h(in1, pdst + stride, 0, in1_idx);           \
+  } while (0)
+
+static void loop_filter_horizontal_4_dual_lsx(uint8_t *src, int32_t pitch,
+                                              const uint8_t *b_limit0_ptr,
+                                              const uint8_t *limit0_ptr,
+                                              const uint8_t *thresh0_ptr,
+                                              const uint8_t *b_limit1_ptr,
+                                              const uint8_t *limit1_ptr,
+                                              const uint8_t *thresh1_ptr) {
+  int32_t pitch_x2 = pitch << 1;
+  int32_t pitch_x3 = pitch_x2 + pitch;
+  int32_t pitch_x4 = pitch << 2;
+
+  __m128i mask, hev, flat;
+  __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+
+  DUP4_ARG2(__lsx_vldx, src, -pitch_x4, src, -pitch_x3, src, -pitch_x2, src,
+            -pitch, p3, p2, p1, p0);
+  q0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, pitch, src, pitch_x2, q1, q2);
+  q3 = __lsx_vldx(src, pitch_x3);
+
+  thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0);
+  thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0);
+  thresh0 = __lsx_vilvl_d(thresh1, thresh0);
+
+  b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0);
+  b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0);
+  b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0);
+
+  limit0 = __lsx_vldrepl_b(limit0_ptr, 0);
+  limit1 = __lsx_vldrepl_b(limit1_ptr, 0);
+  limit0 = __lsx_vilvl_d(limit1, limit0);
+
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
+               mask, flat);
+  VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+
+  __lsx_vstx(p1, src, -pitch_x2);
+  __lsx_vstx(p0, src, -pitch);
+  __lsx_vst(q0, src, 0);
+  __lsx_vstx(q1, src, pitch);
+}
+
+static void loop_filter_vertical_4_dual_lsx(uint8_t *src, int32_t pitch,
+                                            const uint8_t *b_limit0_ptr,
+                                            const uint8_t *limit0_ptr,
+                                            const uint8_t *thresh0_ptr,
+                                            const uint8_t *b_limit1_ptr,
+                                            const uint8_t *limit1_ptr,
+                                            const uint8_t *thresh1_ptr) {
+  uint8_t *src_tmp0 = src - 4;
+  int32_t pitch_x2 = pitch << 1;
+  int32_t pitch_x3 = pitch_x2 + pitch;
+  int32_t pitch_x4 = pitch << 2;
+  __m128i mask, hev, flat;
+  __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i row0, row1, row2, row3, row4, row5, row6, row7;
+  __m128i row8, row9, row10, row11, row12, row13, row14, row15;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+
+  row0 = __lsx_vld(src_tmp0, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row1, row2);
+  row3 = __lsx_vldx(src_tmp0, pitch_x3);
+  src_tmp0 += pitch_x4;
+  row4 = __lsx_vld(src_tmp0, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row5, row6);
+  row7 = __lsx_vldx(src_tmp0, pitch_x3);
+  src_tmp0 += pitch_x4;
+
+  row8 = __lsx_vld(src_tmp0, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row9, row10);
+  row11 = __lsx_vldx(src_tmp0, pitch_x3);
+  src_tmp0 += pitch_x4;
+  row12 = __lsx_vld(src_tmp0, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row13, row14);
+  row15 = __lsx_vldx(src_tmp0, pitch_x3);
+
+  LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+                      row9, row10, row11, row12, row13, row14, row15, p3, p2,
+                      p1, p0, q0, q1, q2, q3);
+
+  thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0);
+  thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0);
+  thresh0 = __lsx_vilvl_d(thresh1, thresh0);
+
+  b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0);
+  b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0);
+  b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0);
+
+  limit0 = __lsx_vldrepl_b(limit0_ptr, 0);
+  limit1 = __lsx_vldrepl_b(limit1_ptr, 0);
+  limit0 = __lsx_vilvl_d(limit1, limit0);
+
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
+               mask, flat);
+  VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+
+  DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, tmp0, tmp1);
+  tmp2 = __lsx_vilvl_h(tmp1, tmp0);
+  tmp3 = __lsx_vilvh_h(tmp1, tmp0);
+  DUP2_ARG2(__lsx_vilvh_b, p0, p1, q1, q0, tmp0, tmp1);
+  tmp4 = __lsx_vilvl_h(tmp1, tmp0);
+  tmp5 = __lsx_vilvh_h(tmp1, tmp0);
+
+  src -= 2;
+  __lsx_vstelm_w(tmp2, src, 0, 0);
+  src += pitch;
+  __lsx_vstelm_w(tmp2, src, 0, 1);
+  src += pitch;
+  __lsx_vstelm_w(tmp2, src, 0, 2);
+  src += pitch;
+  __lsx_vstelm_w(tmp2, src, 0, 3);
+  src += pitch;
+
+  __lsx_vstelm_w(tmp3, src, 0, 0);
+  src += pitch;
+  __lsx_vstelm_w(tmp3, src, 0, 1);
+  src += pitch;
+  __lsx_vstelm_w(tmp3, src, 0, 2);
+  src += pitch;
+  __lsx_vstelm_w(tmp3, src, 0, 3);
+  src += pitch;
+
+  __lsx_vstelm_w(tmp4, src, 0, 0);
+  src += pitch;
+  __lsx_vstelm_w(tmp4, src, 0, 1);
+  src += pitch;
+  __lsx_vstelm_w(tmp4, src, 0, 2);
+  src += pitch;
+  __lsx_vstelm_w(tmp4, src, 0, 3);
+  src += pitch;
+
+  __lsx_vstelm_w(tmp5, src, 0, 0);
+  src += pitch;
+  __lsx_vstelm_w(tmp5, src, 0, 1);
+  src += pitch;
+  __lsx_vstelm_w(tmp5, src, 0, 2);
+  src += pitch;
+  __lsx_vstelm_w(tmp5, src, 0, 3);
+}
+
+static void loop_filter_horizontal_edge_uv_lsx(uint8_t *src_u, uint8_t *src_v,
+                                               int32_t pitch,
+                                               const uint8_t b_limit_in,
+                                               const uint8_t limit_in,
+                                               const uint8_t thresh_in) {
+  int32_t pitch_x2 = pitch << 1;
+  int32_t pitch_x3 = pitch_x2 + pitch;
+  int32_t pitch_x4 = pitch << 2;
+
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i mask, hev, flat, thresh, limit, b_limit;
+  __m128i p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
+  __m128i p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
+
+  thresh = __lsx_vreplgr2vr_b(thresh_in);
+  limit = __lsx_vreplgr2vr_b(limit_in);
+  b_limit = __lsx_vreplgr2vr_b(b_limit_in);
+
+  DUP4_ARG2(__lsx_vldx, src_u, -pitch_x4, src_u, -pitch_x3, src_u, -pitch_x2,
+            src_u, -pitch, p3_u, p2_u, p1_u, p0_u);
+  q0_u = __lsx_vld(src_u, 0);
+  DUP2_ARG2(__lsx_vldx, src_u, pitch, src_u, pitch_x2, q1_u, q2_u);
+  q3_u = __lsx_vldx(src_u, pitch_x3);
+
+  DUP4_ARG2(__lsx_vldx, src_v, -pitch_x4, src_v, -pitch_x3, src_v, -pitch_x2,
+            src_v, -pitch, p3_v, p2_v, p1_v, p0_v);
+  q0_v = __lsx_vld(src_v, 0);
+  DUP2_ARG2(__lsx_vldx, src_v, pitch, src_v, pitch_x2, q1_v, q2_v);
+  q3_v = __lsx_vldx(src_v, pitch_x3);
+
+  /* right 8 element of p3 are u pixel and
+     left 8 element of p3 are v pixel */
+  DUP4_ARG2(__lsx_vilvl_d, p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3,
+            p2, p1, p0);
+  DUP4_ARG2(__lsx_vilvl_d, q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0,
+            q1, q2, q3);
+
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+
+  __lsx_vstelm_d(q1, src_u + pitch, 0, 0);
+  __lsx_vstelm_d(q0, src_u, 0, 0);
+  __lsx_vstelm_d(p0, src_u - pitch, 0, 0);
+  __lsx_vstelm_d(p1, src_u - pitch_x2, 0, 0);
+
+  __lsx_vstelm_d(q1, src_v + pitch, 0, 1);
+  __lsx_vstelm_d(q0, src_v, 0, 1);
+  __lsx_vstelm_d(p0, src_v - pitch, 0, 1);
+  __lsx_vstelm_d(p1, src_v - pitch_x2, 0, 1);
+}
+
+static void loop_filter_vertical_edge_uv_lsx(uint8_t *src_u, uint8_t *src_v,
+                                             int32_t pitch,
+                                             const uint8_t b_limit_in,
+                                             const uint8_t limit_in,
+                                             const uint8_t thresh_in) {
+  uint8_t *src_u_tmp, *src_v_tmp;
+  int32_t pitch_x2 = pitch << 1;
+  int32_t pitch_x3 = pitch_x2 + pitch;
+  int32_t pitch_x4 = pitch << 2;
+
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i mask, hev, flat, thresh, limit, b_limit;
+  __m128i row0, row1, row2, row3, row4, row5, row6, row7, row8;
+  __m128i row9, row10, row11, row12, row13, row14, row15;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+
+  thresh = __lsx_vreplgr2vr_b(thresh_in);
+  limit = __lsx_vreplgr2vr_b(limit_in);
+  b_limit = __lsx_vreplgr2vr_b(b_limit_in);
+
+  src_u_tmp = src_u - 4;
+  row0 = __lsx_vld(src_u_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_u_tmp, pitch, src_u_tmp, pitch_x2, row1, row2);
+  row3 = __lsx_vldx(src_u_tmp, pitch_x3);
+  src_u_tmp += pitch_x4;
+  row4 = __lsx_vld(src_u_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_u_tmp, pitch, src_u_tmp, pitch_x2, row5, row6);
+  row7 = __lsx_vldx(src_u_tmp, pitch_x3);
+
+  src_v_tmp = src_v - 4;
+  row8 = __lsx_vld(src_v_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_v_tmp, pitch, src_v_tmp, pitch_x2, row9, row10);
+  row11 = __lsx_vldx(src_v_tmp, pitch_x3);
+  src_v_tmp += pitch_x4;
+  row12 = __lsx_vld(src_v_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_v_tmp, pitch, src_v_tmp, pitch_x2, row13, row14);
+  row15 = __lsx_vldx(src_v_tmp, pitch_x3);
+
+  LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+                      row9, row10, row11, row12, row13, row14, row15, p3, p2,
+                      p1, p0, q0, q1, q2, q3);
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+
+  DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, tmp0, tmp1);
+  tmp2 = __lsx_vilvl_h(tmp1, tmp0);
+  tmp3 = __lsx_vilvh_h(tmp1, tmp0);
+
+  tmp0 = __lsx_vilvh_b(p0, p1);
+  tmp1 = __lsx_vilvh_b(q1, q0);
+  tmp4 = __lsx_vilvl_h(tmp1, tmp0);
+  tmp5 = __lsx_vilvh_h(tmp1, tmp0);
+
+  src_u_tmp += 2;
+  __lsx_vstelm_w(tmp2, src_u_tmp - pitch_x4, 0, 0);
+  __lsx_vstelm_w(tmp2, src_u_tmp - pitch_x3, 0, 1);
+  __lsx_vstelm_w(tmp2, src_u_tmp - pitch_x2, 0, 2);
+  __lsx_vstelm_w(tmp2, src_u_tmp - pitch, 0, 3);
+
+  __lsx_vstelm_w(tmp3, src_u_tmp, 0, 0);
+  __lsx_vstelm_w(tmp3, src_u_tmp + pitch, 0, 1);
+  __lsx_vstelm_w(tmp3, src_u_tmp + pitch_x2, 0, 2);
+  __lsx_vstelm_w(tmp3, src_u_tmp + pitch_x3, 0, 3);
+
+  src_v_tmp += 2;
+  __lsx_vstelm_w(tmp4, src_v_tmp - pitch_x4, 0, 0);
+  __lsx_vstelm_w(tmp4, src_v_tmp - pitch_x3, 0, 1);
+  __lsx_vstelm_w(tmp4, src_v_tmp - pitch_x2, 0, 2);
+  __lsx_vstelm_w(tmp4, src_v_tmp - pitch, 0, 3);
+
+  __lsx_vstelm_w(tmp5, src_v_tmp, 0, 0);
+  __lsx_vstelm_w(tmp5, src_v_tmp + pitch, 0, 1);
+  __lsx_vstelm_w(tmp5, src_v_tmp + pitch_x2, 0, 2);
+  __lsx_vstelm_w(tmp5, src_v_tmp + pitch_x3, 0, 3);
+}
+
+static inline void mbloop_filter_horizontal_edge_y_lsx(
+    uint8_t *src, int32_t pitch, const uint8_t b_limit_in,
+    const uint8_t limit_in, const uint8_t thresh_in) {
+  uint8_t *temp_src;
+  int32_t pitch_x2 = pitch << 1;
+  int32_t pitch_x3 = pitch_x2 + pitch;
+  int32_t pitch_x4 = pitch << 2;
+
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i mask, hev, flat, thresh, limit, b_limit;
+
+  DUP2_ARG2(__lsx_vldrepl_b, &b_limit_in, 0, &limit_in, 0, b_limit, limit);
+  thresh = __lsx_vldrepl_b(&thresh_in, 0);
+
+  temp_src = src - pitch_x4;
+  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+            temp_src, pitch_x3, p3, p2, p1, p0);
+  temp_src += pitch_x4;
+  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+            temp_src, pitch_x3, q0, q1, q2, q3);
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+
+  temp_src = src - pitch_x3;
+  __lsx_vstx(p2, temp_src, 0);
+  __lsx_vstx(p1, temp_src, pitch);
+  __lsx_vstx(p0, temp_src, pitch_x2);
+  __lsx_vstx(q0, temp_src, pitch_x3);
+  temp_src += pitch_x4;
+  __lsx_vstx(q1, temp_src, 0);
+  __lsx_vstx(q2, temp_src, pitch);
+}
+
+static inline void mbloop_filter_horizontal_edge_uv_lsx(
+    uint8_t *src_u, uint8_t *src_v, int32_t pitch, const uint8_t b_limit_in,
+    const uint8_t limit_in, const uint8_t thresh_in) {
+  uint8_t *temp_src;
+  int32_t pitch_x2 = pitch << 1;
+  int32_t pitch_x3 = pitch_x2 + pitch;
+  int32_t pitch_x4 = pitch << 2;
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i mask, hev, flat, thresh, limit, b_limit;
+  __m128i p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
+  __m128i p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
+
+  DUP2_ARG2(__lsx_vldrepl_b, &b_limit_in, 0, &limit_in, 0, b_limit, limit);
+  thresh = __lsx_vldrepl_b(&thresh_in, 0);
+
+  temp_src = src_u - pitch_x4;
+  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+            temp_src, pitch_x3, p3_u, p2_u, p1_u, p0_u);
+  temp_src += pitch_x4;
+  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+            temp_src, pitch_x3, q0_u, q1_u, q2_u, q3_u);
+  temp_src = src_v - pitch_x4;
+  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+            temp_src, pitch_x3, p3_v, p2_v, p1_v, p0_v);
+  temp_src += pitch_x4;
+  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+            temp_src, pitch_x3, q0_v, q1_v, q2_v, q3_v);
+
+  DUP4_ARG2(__lsx_vilvl_d, p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3,
+            p2, p1, p0);
+  DUP4_ARG2(__lsx_vilvl_d, q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0,
+            q1, q2, q3);
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+
+  src_u -= pitch_x3;
+  __lsx_vstelm_d(p2, src_u, 0, 0);
+  __lsx_vstelm_d(p1, src_u + pitch, 0, 0);
+  __lsx_vstelm_d(p0, src_u + pitch_x2, 0, 0);
+  __lsx_vstelm_d(q0, src_u + pitch_x3, 0, 0);
+  src_u += pitch_x4;
+  __lsx_vstelm_d(q1, src_u, 0, 0);
+  src_u += pitch;
+  __lsx_vstelm_d(q2, src_u, 0, 0);
+
+  src_v -= pitch_x3;
+  __lsx_vstelm_d(p2, src_v, 0, 1);
+  __lsx_vstelm_d(p1, src_v + pitch, 0, 1);
+  __lsx_vstelm_d(p0, src_v + pitch_x2, 0, 1);
+  __lsx_vstelm_d(q0, src_v + pitch_x3, 0, 1);
+  src_v += pitch_x4;
+  __lsx_vstelm_d(q1, src_v, 0, 1);
+  src_v += pitch;
+  __lsx_vstelm_d(q2, src_v, 0, 1);
+}
+
+static inline void mbloop_filter_vertical_edge_y_lsx(uint8_t *src,
+                                                     int32_t pitch,
+                                                     const uint8_t b_limit_in,
+                                                     const uint8_t limit_in,
+                                                     const uint8_t thresh_in) {
+  uint8_t *temp_src;
+  int32_t pitch_x2 = pitch << 1;
+  int32_t pitch_x3 = pitch_x2 + pitch;
+  int32_t pitch_x4 = pitch << 2;
+
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i mask, hev, flat, thresh, limit, b_limit;
+  __m128i row0, row1, row2, row3, row4, row5, row6, row7, row8;
+  __m128i row9, row10, row11, row12, row13, row14, row15;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+  DUP2_ARG2(__lsx_vldrepl_b, &b_limit_in, 0, &limit_in, 0, b_limit, limit);
+  thresh = __lsx_vldrepl_b(&thresh_in, 0);
+  temp_src = src - 4;
+  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+            temp_src, pitch_x3, row0, row1, row2, row3);
+  temp_src += pitch_x4;
+  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+            temp_src, pitch_x3, row4, row5, row6, row7);
+  temp_src += pitch_x4;
+  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+            temp_src, pitch_x3, row8, row9, row10, row11);
+  temp_src += pitch_x4;
+  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+            temp_src, pitch_x3, row12, row13, row14, row15);
+  temp_src -= pitch_x4;
+  LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+                      row9, row10, row11, row12, row13, row14, row15, p3, p2,
+                      p1, p0, q0, q1, q2, q3);
+
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+  DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, tmp0, tmp1);
+  tmp3 = __lsx_vilvl_h(tmp1, tmp0);
+  tmp4 = __lsx_vilvh_h(tmp1, tmp0);
+  DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, tmp0, tmp1);
+  tmp6 = __lsx_vilvl_h(tmp1, tmp0);
+  tmp7 = __lsx_vilvh_h(tmp1, tmp0);
+  tmp2 = __lsx_vilvl_b(q2, q1);
+  tmp5 = __lsx_vilvh_b(q2, q1);
+
+  temp_src = src - 3;
+  VP8_ST6x1_B(tmp3, 0, tmp2, 0, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp3, 1, tmp2, 1, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp3, 2, tmp2, 2, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp3, 3, tmp2, 3, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp4, 0, tmp2, 4, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp4, 1, tmp2, 5, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp4, 2, tmp2, 6, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp4, 3, tmp2, 7, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp6, 0, tmp5, 0, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp6, 1, tmp5, 1, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp6, 2, tmp5, 2, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp6, 3, tmp5, 3, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp7, 0, tmp5, 4, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp7, 1, tmp5, 5, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp7, 2, tmp5, 6, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp7, 3, tmp5, 7, temp_src, 4);
+}
+
+static inline void mbloop_filter_vertical_edge_uv_lsx(
+    uint8_t *src_u, uint8_t *src_v, int32_t pitch, const uint8_t b_limit_in,
+    const uint8_t limit_in, const uint8_t thresh_in) {
+  int32_t pitch_x2 = pitch << 1;
+  int32_t pitch_x3 = pitch_x2 + pitch;
+  int32_t pitch_x4 = pitch << 2;
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i mask, hev, flat, thresh, limit, b_limit;
+  __m128i row0, row1, row2, row3, row4, row5, row6, row7, row8;
+  __m128i row9, row10, row11, row12, row13, row14, row15;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+  DUP2_ARG2(__lsx_vldrepl_b, &b_limit_in, 0, &limit_in, 0, b_limit, limit);
+  thresh = __lsx_vldrepl_b(&thresh_in, 0);
+
+  src_u -= 4;
+  DUP4_ARG2(__lsx_vldx, src_u, 0, src_u, pitch, src_u, pitch_x2, src_u,
+            pitch_x3, row0, row1, row2, row3);
+  src_u += pitch_x4;
+  DUP4_ARG2(__lsx_vldx, src_u, 0, src_u, pitch, src_u, pitch_x2, src_u,
+            pitch_x3, row4, row5, row6, row7);
+  src_v -= 4;
+  DUP4_ARG2(__lsx_vldx, src_v, 0, src_v, pitch, src_v, pitch_x2, src_v,
+            pitch_x3, row8, row9, row10, row11);
+  src_v += pitch_x4;
+  DUP4_ARG2(__lsx_vldx, src_v, 0, src_v, pitch, src_v, pitch_x2, src_v,
+            pitch_x3, row12, row13, row14, row15);
+  LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+                      row9, row10, row11, row12, row13, row14, row15, p3, p2,
+                      p1, p0, q0, q1, q2, q3);
+
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+
+  DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, tmp0, tmp1);
+  tmp3 = __lsx_vilvl_h(tmp1, tmp0);
+  tmp4 = __lsx_vilvh_h(tmp1, tmp0);
+  DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, tmp0, tmp1);
+  tmp6 = __lsx_vilvl_h(tmp1, tmp0);
+  tmp7 = __lsx_vilvh_h(tmp1, tmp0);
+  tmp2 = __lsx_vilvl_b(q2, q1);
+  tmp5 = __lsx_vilvh_b(q2, q1);
+
+  src_u += 1 - pitch_x4;
+  VP8_ST6x1_B(tmp3, 0, tmp2, 0, src_u, 4);
+  src_u += pitch;
+  VP8_ST6x1_B(tmp3, 1, tmp2, 1, src_u, 4);
+  src_u += pitch;
+  VP8_ST6x1_B(tmp3, 2, tmp2, 2, src_u, 4);
+  src_u += pitch;
+  VP8_ST6x1_B(tmp3, 3, tmp2, 3, src_u, 4);
+  src_u += pitch;
+  VP8_ST6x1_B(tmp4, 0, tmp2, 4, src_u, 4);
+  src_u += pitch;
+  VP8_ST6x1_B(tmp4, 1, tmp2, 5, src_u, 4);
+  src_u += pitch;
+  VP8_ST6x1_B(tmp4, 2, tmp2, 6, src_u, 4);
+  src_u += pitch;
+  VP8_ST6x1_B(tmp4, 3, tmp2, 7, src_u, 4);
+
+  src_v += 1 - pitch_x4;
+  VP8_ST6x1_B(tmp6, 0, tmp5, 0, src_v, 4);
+  src_v += pitch;
+  VP8_ST6x1_B(tmp6, 1, tmp5, 1, src_v, 4);
+  src_v += pitch;
+  VP8_ST6x1_B(tmp6, 2, tmp5, 2, src_v, 4);
+  src_v += pitch;
+  VP8_ST6x1_B(tmp6, 3, tmp5, 3, src_v, 4);
+  src_v += pitch;
+  VP8_ST6x1_B(tmp7, 0, tmp5, 4, src_v, 4);
+  src_v += pitch;
+  VP8_ST6x1_B(tmp7, 1, tmp5, 5, src_v, 4);
+  src_v += pitch;
+  VP8_ST6x1_B(tmp7, 2, tmp5, 6, src_v, 4);
+  src_v += pitch;
+  VP8_ST6x1_B(tmp7, 3, tmp5, 7, src_v, 4);
+}
+
+void vp8_loop_filter_mbh_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v,
+                             int32_t pitch_y, int32_t pitch_u_v,
+                             loop_filter_info *lpf_info_ptr) {
+  mbloop_filter_horizontal_edge_y_lsx(src_y, pitch_y, *lpf_info_ptr->mblim,
+                                      *lpf_info_ptr->lim,
+                                      *lpf_info_ptr->hev_thr);
+  if (src_u) {
+    mbloop_filter_horizontal_edge_uv_lsx(
+        src_u, src_v, pitch_u_v, *lpf_info_ptr->mblim, *lpf_info_ptr->lim,
+        *lpf_info_ptr->hev_thr);
+  }
+}
+
+void vp8_loop_filter_mbv_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v,
+                             int32_t pitch_y, int32_t pitch_u_v,
+                             loop_filter_info *lpf_info_ptr) {
+  mbloop_filter_vertical_edge_y_lsx(src_y, pitch_y, *lpf_info_ptr->mblim,
+                                    *lpf_info_ptr->lim, *lpf_info_ptr->hev_thr);
+  if (src_u) {
+    mbloop_filter_vertical_edge_uv_lsx(src_u, src_v, pitch_u_v,
+                                       *lpf_info_ptr->mblim, *lpf_info_ptr->lim,
+                                       *lpf_info_ptr->hev_thr);
+  }
+}
+
+void vp8_loop_filter_bh_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v,
+                            int32_t pitch_y, int32_t pitch_u_v,
+                            loop_filter_info *lpf_info_ptr) {
+  loop_filter_horizontal_4_dual_lsx(src_y + 4 * pitch_y, pitch_y,
+                                    lpf_info_ptr->blim, lpf_info_ptr->lim,
+                                    lpf_info_ptr->hev_thr, lpf_info_ptr->blim,
+                                    lpf_info_ptr->lim, lpf_info_ptr->hev_thr);
+  loop_filter_horizontal_4_dual_lsx(src_y + 8 * pitch_y, pitch_y,
+                                    lpf_info_ptr->blim, lpf_info_ptr->lim,
+                                    lpf_info_ptr->hev_thr, lpf_info_ptr->blim,
+                                    lpf_info_ptr->lim, lpf_info_ptr->hev_thr);
+  loop_filter_horizontal_4_dual_lsx(src_y + 12 * pitch_y, pitch_y,
+                                    lpf_info_ptr->blim, lpf_info_ptr->lim,
+                                    lpf_info_ptr->hev_thr, lpf_info_ptr->blim,
+                                    lpf_info_ptr->lim, lpf_info_ptr->hev_thr);
+  if (src_u) {
+    loop_filter_horizontal_edge_uv_lsx(
+        src_u + (4 * pitch_u_v), src_v + (4 * pitch_u_v), pitch_u_v,
+        *lpf_info_ptr->blim, *lpf_info_ptr->lim, *lpf_info_ptr->hev_thr);
+  }
+}
+
+void vp8_loop_filter_bv_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v,
+                            int32_t pitch_y, int32_t pitch_u_v,
+                            loop_filter_info *lpf_info_ptr) {
+  loop_filter_vertical_4_dual_lsx(src_y + 4, pitch_y, lpf_info_ptr->blim,
+                                  lpf_info_ptr->lim, lpf_info_ptr->hev_thr,
+                                  lpf_info_ptr->blim, lpf_info_ptr->lim,
+                                  lpf_info_ptr->hev_thr);
+  loop_filter_vertical_4_dual_lsx(src_y + 8, pitch_y, lpf_info_ptr->blim,
+                                  lpf_info_ptr->lim, lpf_info_ptr->hev_thr,
+                                  lpf_info_ptr->blim, lpf_info_ptr->lim,
+                                  lpf_info_ptr->hev_thr);
+  loop_filter_vertical_4_dual_lsx(src_y + 12, pitch_y, lpf_info_ptr->blim,
+                                  lpf_info_ptr->lim, lpf_info_ptr->hev_thr,
+                                  lpf_info_ptr->blim, lpf_info_ptr->lim,
+                                  lpf_info_ptr->hev_thr);
+  if (src_u) {
+    loop_filter_vertical_edge_uv_lsx(src_u + 4, src_v + 4, pitch_u_v,
+                                     *lpf_info_ptr->blim, *lpf_info_ptr->lim,
+                                     *lpf_info_ptr->hev_thr);
+  }
+}
diff --git a/vp8/common/loongarch/sixtap_filter_lsx.c b/vp8/common/loongarch/sixtap_filter_lsx.c
new file mode 100644
index 0000000000..9867633415
--- /dev/null
+++ b/vp8/common/loongarch/sixtap_filter_lsx.c
@@ -0,0 +1,1904 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS.  All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/filter.h"
+#include "vpx_ports/mem.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+DECLARE_ALIGNED(16, static const int8_t, vp8_subpel_filters_lsx[7][8]) = {
+  { 0, -6, 123, 12, -1, 0, 0, 0 },
+  { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */
+  { 0, -9, 93, 50, -6, 0, 0, 0 },
+  { 3, -16, 77, 77, -16, 3, 0, 0 }, /* New 1/2 pel 6 tap filter */
+  { 0, -6, 50, 93, -9, 0, 0, 0 },
+  { 1, -8, 36, 108, -11, 2, 0, 0 }, /* New 1/4 pel 6 tap filter */
+  { 0, -1, 12, 123, -6, 0, 0, 0 },
+};
+
+static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = {
+  /* 8 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+  /* 4 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+  /* 4 width cases */
+  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
+static INLINE __m128i dpadd_h3(__m128i in0, __m128i in1, __m128i in2,
+                               __m128i coeff0, __m128i coeff1, __m128i coeff2) {
+  __m128i out0_m;
+
+  out0_m = __lsx_vdp2_h_b(in0, coeff0);
+  out0_m = __lsx_vdp2add_h_b(out0_m, in1, coeff1);
+  out0_m = __lsx_vdp2add_h_b(out0_m, in2, coeff2);
+
+  return out0_m;
+}
+
+static INLINE __m128i horiz_6tap_filt(__m128i src0, __m128i src1, __m128i mask0,
+                                      __m128i mask1, __m128i mask2,
+                                      __m128i filt_h0, __m128i filt_h1,
+                                      __m128i filt_h2) {
+  __m128i vec0_m, vec1_m, vec2_m;
+  __m128i hz_out_m;
+
+  DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, vec0_m,
+            vec1_m);
+  vec2_m = __lsx_vshuf_b(src1, src0, mask2);
+  hz_out_m = dpadd_h3(vec0_m, vec1_m, vec2_m, filt_h0, filt_h1, filt_h2);
+  hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT);
+  hz_out_m = __lsx_vsat_h(hz_out_m, 7);
+
+  return hz_out_m;
+}
+
+static INLINE __m128i filt_4tap_dpadd_h(__m128i vec0, __m128i vec1,
+                                        __m128i filt0, __m128i filt1) {
+  __m128i tmp_m;
+
+  tmp_m = __lsx_vdp2_h_b(vec0, filt0);
+  tmp_m = __lsx_vdp2add_h_b(tmp_m, vec1, filt1);
+
+  return tmp_m;
+}
+
+static INLINE __m128i horiz_4tap_filt(__m128i src0, __m128i src1, __m128i mask0,
+                                      __m128i mask1, __m128i filt_h0,
+                                      __m128i filt_h1) {
+  __m128i vec0_m, vec1_m, hz_out_m;
+
+  DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, vec0_m,
+            vec1_m);
+  hz_out_m = filt_4tap_dpadd_h(vec0_m, vec1_m, filt_h0, filt_h1);
+  hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT);
+  hz_out_m = __lsx_vsat_h(hz_out_m, 7);
+
+  return hz_out_m;
+}
+
+#define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,   \
+                                   mask2, filt0, filt1, filt2, out0, out1) \
+  do {                                                                     \
+    __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m;                \
+                                                                           \
+    DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src3, src2, mask0, vec0_m, \
+              vec1_m);                                                     \
+    DUP2_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, out0, out1);   \
+    DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask1, src3, src2, mask1, vec2_m, \
+              vec3_m);                                                     \
+    DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec2_m, filt1, out1, vec3_m, filt1, \
+              out0, out1);                                                 \
+    DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask2, src3, src2, mask2, vec4_m, \
+              vec5_m);                                                     \
+    DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec4_m, filt2, out1, vec5_m, filt2, \
+              out0, out1);                                                 \
+  } while (0)
+
+#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,    \
+                                   mask2, filt0, filt1, filt2, out0, out1,  \
+                                   out2, out3)                              \
+  do {                                                                      \
+    __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+                                                                            \
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, vec0_m,  \
+              vec1_m);                                                      \
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0, vec2_m,  \
+              vec3_m);                                                      \
+    DUP4_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, vec2_m, filt0,  \
+              vec3_m, filt0, out0, out1, out2, out3);                       \
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, vec0_m,  \
+              vec1_m);                                                      \
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1, vec2_m,  \
+              vec3_m);                                                      \
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, vec4_m,  \
+              vec5_m);                                                      \
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2, vec6_m,  \
+              vec7_m);                                                      \
+    DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec0_m, filt1, out1, vec1_m, filt1,  \
+              out2, vec2_m, filt1, out3, vec3_m, filt1, out0, out1, out2,   \
+              out3);                                                        \
+    DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec4_m, filt2, out1, vec5_m, filt2,  \
+              out2, vec6_m, filt2, out3, vec7_m, filt2, out0, out1, out2,   \
+              out3);                                                        \
+  } while (0)
+
+#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,   \
+                                   filt0, filt1, out0, out1)               \
+  do {                                                                     \
+    __m128i vec0_m, vec1_m, vec2_m, vec3_m;                                \
+                                                                           \
+    DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src3, src2, mask0, vec0_m, \
+              vec1_m);                                                     \
+    DUP2_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, out0, out1);   \
+    DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask1, src3, src2, mask1, vec2_m, \
+              vec3_m);                                                     \
+    DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec2_m, filt1, out1, vec3_m, filt1, \
+              out0, out1);                                                 \
+  } while (0)
+
+#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,   \
+                                   filt0, filt1, out0, out1, out2, out3)   \
+  do {                                                                     \
+    __m128i vec0_m, vec1_m, vec2_m, vec3_m;                                \
+                                                                           \
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, vec0_m, \
+              vec1_m);                                                     \
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0, vec2_m, \
+              vec3_m);                                                     \
+    DUP4_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, vec2_m, filt0, \
+              vec3_m, filt0, out0, out1, out2, out3);                      \
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, vec0_m, \
+              vec1_m);                                                     \
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1, vec2_m, \
+              vec3_m);                                                     \
+    DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec0_m, filt1, out1, vec1_m, filt1, \
+              out2, vec2_m, filt1, out3, vec3_m, filt1, out0, out1, out2,  \
+              out3);                                                       \
+  } while (0)
+
+static inline void common_hz_6t_4x4_lsx(uint8_t *RESTRICT src,
+                                        int32_t src_stride,
+                                        uint8_t *RESTRICT dst,
+                                        int32_t dst_stride,
+                                        const int8_t *filter) {
+  __m128i src0, src1, src2, src3, filt0, filt1, filt2;
+  __m128i mask0, mask1, mask2, out0, out1;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
+  src -= 2;
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+  filt2 = __lsx_vldrepl_h(filter, 4);
+
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride_x3);
+
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0,
+                             filt1, filt2, out0, out1);
+  out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT);
+  out0 = __lsx_vxori_b(out0, 128);
+
+  __lsx_vstelm_w(out0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 3);
+}
+
+static void common_hz_6t_4x8_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter) {
+  __m128i src0, src1, src2, src3, filt0, filt1, filt2;
+  __m128i mask0, mask1, mask2, out0, out1, out2, out3;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride_x2 << 1;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
+  src -= 2;
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+  filt2 = __lsx_vldrepl_h(filter, 4);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride_x3);
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  src += src_stride_x4;
+  HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0,
+                             filt1, filt2, out0, out1);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride_x3);
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0,
+                             filt1, filt2, out2, out3);
+
+  DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2,
+            VP8_FILTER_SHIFT, out0, out1);
+  DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+  __lsx_vstelm_w(out0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 3);
+  dst += dst_stride;
+
+  __lsx_vstelm_w(out1, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(out1, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(out1, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(out1, dst, 0, 3);
+}
+
+static void common_hz_6t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  if (height == 4) {
+    common_hz_6t_4x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else if (height == 8) {
+    common_hz_6t_4x8_lsx(src, src_stride, dst, dst_stride, filter);
+  }
+}
+
+static void common_hz_6t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i src0, src1, src2, src3, filt0, filt1, filt2;
+  __m128i mask0, mask1, mask2, tmp0, tmp1;
+  __m128i filt, out0, out1, out2, out3;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
+  src -= 2;
+
+  filt = __lsx_vld(filter, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
+  filt2 = __lsx_vreplvei_h(filt, 2);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+
+  DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+            src_stride_x3, src0, src1, src2, src3);
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  src += src_stride_x4;
+  HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0,
+                             filt1, filt2, out0, out1, out2, out3);
+  DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2,
+            VP8_FILTER_SHIFT, tmp0, tmp1);
+  DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+  __lsx_vstelm_d(tmp0, dst, 0, 0);
+  __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
+  __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0);
+  __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1);
+  dst += dst_stride_x4;
+
+  for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    src += src_stride_x4;
+    HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               filt0, filt1, filt2, out0, out1, out2, out3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2,
+              VP8_FILTER_SHIFT, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+    __lsx_vstelm_d(tmp0, dst, 0, 0);
+    __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0);
+    __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1);
+    dst += dst_stride_x4;
+  }
+}
+
+static void common_hz_6t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, filt0, filt1, filt2;
+  __m128i mask0, mask1, mask2, out;
+  __m128i filt, out0, out1, out2, out3, out4, out5, out6, out7;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
+  src -= 2;
+
+  filt = __lsx_vld(filter, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
+  filt2 = __lsx_vreplvei_h(filt, 2);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src0, src2, src4, src6);
+    src += 8;
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src1, src3, src5, src7);
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    DUP4_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src6, 128, src7, 128, src4,
+              src5, src6, src7);
+    src += src_stride_x4 - 8;
+
+    HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               filt0, filt1, filt2, out0, out1, out2, out3);
+    HORIZ_6TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
+                               filt0, filt1, filt2, out4, out5, out6, out7);
+    DUP4_ARG2(__lsx_vsrari_h, out0, VP8_FILTER_SHIFT, out1, VP8_FILTER_SHIFT,
+              out2, VP8_FILTER_SHIFT, out3, VP8_FILTER_SHIFT, out0, out1, out2,
+              out3);
+    DUP4_ARG2(__lsx_vsrari_h, out4, VP8_FILTER_SHIFT, out5, VP8_FILTER_SHIFT,
+              out6, VP8_FILTER_SHIFT, out7, VP8_FILTER_SHIFT, out4, out5, out6,
+              out7);
+    DUP4_ARG2(__lsx_vsat_h, out0, 7, out1, 7, out2, 7, out3, 7, out0, out1,
+              out2, out3);
+    DUP4_ARG2(__lsx_vsat_h, out4, 7, out5, 7, out6, 7, out7, 7, out4, out5,
+              out6, out7);
+    out = __lsx_vpickev_b(out1, out0);
+    out = __lsx_vxori_b(out, 128);
+    __lsx_vst(out, dst, 0);
+    out = __lsx_vpickev_b(out3, out2);
+    out = __lsx_vxori_b(out, 128);
+    __lsx_vstx(out, dst, dst_stride);
+    out = __lsx_vpickev_b(out5, out4);
+    out = __lsx_vxori_b(out, 128);
+    __lsx_vstx(out, dst, dst_stride_x2);
+    out = __lsx_vpickev_b(out7, out6);
+    out = __lsx_vxori_b(out, 128);
+    __lsx_vstx(out, dst, dst_stride_x3);
+    dst += dst_stride_x4;
+  }
+}
+
+static void common_vt_6t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+  __m128i src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+  __m128i src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
+  __m128i out0, out1;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+  filt2 = __lsx_vldrepl_h(filter, 4);
+
+  DUP2_ARG2(__lsx_vldx, src, -src_stride_x2, src, -src_stride, src0, src1);
+  src2 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src3, src4);
+  src += src_stride_x3;
+
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
+            src10_r, src21_r, src32_r, src43_r);
+  DUP2_ARG2(__lsx_vilvl_d, src21_r, src10_r, src43_r, src32_r, src2110,
+            src4332);
+  DUP2_ARG2(__lsx_vxori_b, src2110, 128, src4332, 128, src2110, src4332);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    src5 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src6, src7);
+    src8 = __lsx_vldx(src, src_stride_x3);
+    src += src_stride_x4;
+
+    DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
+              src54_r, src65_r, src76_r, src87_r);
+    DUP2_ARG2(__lsx_vilvl_d, src65_r, src54_r, src87_r, src76_r, src6554,
+              src8776);
+    DUP2_ARG2(__lsx_vxori_b, src6554, 128, src8776, 128, src6554, src8776);
+    out0 = dpadd_h3(src2110, src4332, src6554, filt0, filt1, filt2);
+    out1 = dpadd_h3(src4332, src6554, src8776, filt0, filt1, filt2);
+
+    out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT);
+    out0 = __lsx_vxori_b(out0, 128);
+
+    __lsx_vstelm_w(out0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 2);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 3);
+    dst += dst_stride;
+
+    src2110 = src6554;
+    src4332 = src8776;
+    src4 = src8;
+  }
+}
+
+static void common_vt_6t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i src0, src1, src2, src3, src4, src7, src8, src9, src10;
+  __m128i src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
+  __m128i src109_r, filt0, filt1, filt2;
+  __m128i tmp0, tmp1;
+  __m128i filt, out0_r, out1_r, out2_r, out3_r;
+
+  src -= src_stride_x2;
+  filt = __lsx_vld(filter, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
+  filt2 = __lsx_vreplvei_h(filt, 2);
+
+  DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+            src_stride_x3, src0, src1, src2, src3);
+  src += src_stride_x4;
+  src4 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  src4 = __lsx_vxori_b(src4, 128);
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src2, src1, src4, src3,
+            src10_r, src32_r, src21_r, src43_r);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src7, src8, src9, src10);
+    DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+              src8, src9, src10);
+    src += src_stride_x4;
+
+    DUP4_ARG2(__lsx_vilvl_b, src7, src4, src8, src7, src9, src8, src10, src9,
+              src76_r, src87_r, src98_r, src109_r);
+    out0_r = dpadd_h3(src10_r, src32_r, src76_r, filt0, filt1, filt2);
+    out1_r = dpadd_h3(src21_r, src43_r, src87_r, filt0, filt1, filt2);
+    out2_r = dpadd_h3(src32_r, src76_r, src98_r, filt0, filt1, filt2);
+    out3_r = dpadd_h3(src43_r, src87_r, src109_r, filt0, filt1, filt2);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1_r, out0_r, VP8_FILTER_SHIFT, out3_r,
+              out2_r, VP8_FILTER_SHIFT, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+    __lsx_vstelm_d(tmp0, dst, 0, 0);
+    __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0);
+    __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1);
+    dst += dst_stride_x4;
+
+    src10_r = src76_r;
+    src32_r = src98_r;
+    src21_r = src87_r;
+    src43_r = src109_r;
+    src4 = src10;
+  }
+}
+
+static void common_vt_6t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+  __m128i src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+  __m128i src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
+  __m128i src65_l, src87_l, filt0, filt1, filt2;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  __m128i filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+  src -= src_stride_x2;
+  filt = __lsx_vld(filter, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
+  filt2 = __lsx_vreplvei_h(filt, 2);
+
+  DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+            src_stride_x3, src0, src1, src2, src3);
+  src += src_stride_x4;
+  src4 = __lsx_vldx(src, 0);
+  src += src_stride;
+
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  src4 = __lsx_vxori_b(src4, 128);
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src4, src3, src2, src1,
+            src10_r, src32_r, src43_r, src21_r);
+  DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src4, src3, src2, src1,
+            src10_l, src32_l, src43_l, src21_l);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src5, src6, src7, src8);
+    src += src_stride_x4;
+
+    DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5,
+              src6, src7, src8);
+    DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
+              src54_r, src65_r, src76_r, src87_r);
+    DUP4_ARG2(__lsx_vilvh_b, src5, src4, src6, src5, src7, src6, src8, src7,
+              src54_l, src65_l, src76_l, src87_l);
+    out0_r = dpadd_h3(src10_r, src32_r, src54_r, filt0, filt1, filt2);
+    out1_r = dpadd_h3(src21_r, src43_r, src65_r, filt0, filt1, filt2);
+    out2_r = dpadd_h3(src32_r, src54_r, src76_r, filt0, filt1, filt2);
+    out3_r = dpadd_h3(src43_r, src65_r, src87_r, filt0, filt1, filt2);
+    out0_l = dpadd_h3(src10_l, src32_l, src54_l, filt0, filt1, filt2);
+    out1_l = dpadd_h3(src21_l, src43_l, src65_l, filt0, filt1, filt2);
+    out2_l = dpadd_h3(src32_l, src54_l, src76_l, filt0, filt1, filt2);
+    out3_l = dpadd_h3(src43_l, src65_l, src87_l, filt0, filt1, filt2);
+    DUP4_ARG3(__lsx_vssrarni_b_h, out0_l, out0_r, VP8_FILTER_SHIFT, out1_l,
+              out1_r, VP8_FILTER_SHIFT, out2_l, out2_r, VP8_FILTER_SHIFT,
+              out3_l, out3_r, VP8_FILTER_SHIFT, tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp2, 128, tmp3, 128, tmp0,
+              tmp1, tmp2, tmp3);
+    __lsx_vstx(tmp0, dst, 0);
+    __lsx_vstx(tmp1, dst, dst_stride);
+    __lsx_vstx(tmp2, dst, dst_stride_x2);
+    __lsx_vstx(tmp3, dst, dst_stride_x3);
+    dst += dst_stride_x4;
+
+    src10_r = src54_r;
+    src32_r = src76_r;
+    src21_r = src65_r;
+    src43_r = src87_r;
+    src10_l = src54_l;
+    src32_l = src76_l;
+    src21_l = src65_l;
+    src43_l = src87_l;
+    src4 = src8;
+  }
+}
+
+static void common_hv_6ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height) {
+  uint32_t loop_cnt;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, tmp0, tmp1;
+  __m128i filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  __m128i hz_out7, filt_vt0, filt_vt1, filt_vt2, out0, out1, out2, out3;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
+  src -= 2;
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0,
+            filt_hz1);
+  filt_hz2 = __lsx_vldrepl_h(filter_horiz, 4);
+  DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0,
+            filt_vt1);
+  filt_vt2 = __lsx_vldrepl_h(filter_vert, 4);
+
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+
+  DUP2_ARG2(__lsx_vldx, src, -src_stride_x2, src, -src_stride, src0, src1);
+  src2 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src3, src4);
+  src += src_stride_x3;
+
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  src4 = __lsx_vxori_b(src4, 128);
+
+  hz_out0 = horiz_6tap_filt(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  hz_out2 = horiz_6tap_filt(src2, src3, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff);
+  hz_out3 = horiz_6tap_filt(src3, src4, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    src5 = __lsx_vld(src, 0);
+    src6 = __lsx_vldx(src, src_stride);
+    src += src_stride_x2;
+
+    DUP2_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src5, src6);
+    hz_out5 = horiz_6tap_filt(src5, src6, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff);
+
+    src7 = __lsx_vld(src, 0);
+    src8 = __lsx_vldx(src, src_stride);
+    src += src_stride_x2;
+
+    DUP2_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src7, src8);
+    hz_out7 = horiz_6tap_filt(src7, src8, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out6 = __lsx_vshuf_b(hz_out7, hz_out5, shuff);
+
+    out2 = __lsx_vpackev_b(hz_out5, hz_out4);
+    tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+    out3 = __lsx_vpackev_b(hz_out7, hz_out6);
+    tmp1 = dpadd_h3(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
+
+    tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7);
+    tmp0 = __lsx_vxori_b(tmp0, 128);
+    __lsx_vstelm_w(tmp0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 2);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 3);
+    dst += dst_stride;
+
+    hz_out3 = hz_out7;
+    out0 = out2;
+    out1 = out3;
+  }
+}
+
+static void common_hv_6ht_6vt_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+  __m128i filt_hz0, filt_hz1, filt_hz2;
+  __m128i mask0, mask1, mask2, vec0, vec1;
+  __m128i filt, filt_vt0, filt_vt1, filt_vt2;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  __m128i hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
+  src -= (2 + src_stride_x2);
+
+  filt = __lsx_vld(filter_horiz, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_hz0, filt_hz1);
+  filt_hz2 = __lsx_vreplvei_h(filt, 2);
+
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+            src_stride_x3, src0, src1, src2, src3);
+  src += src_stride_x4;
+  src4 = __lsx_vldx(src, 0);
+  src += src_stride;
+
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  src4 = __lsx_vxori_b(src4, 128);
+
+  hz_out0 = horiz_6tap_filt(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  hz_out1 = horiz_6tap_filt(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  hz_out2 = horiz_6tap_filt(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  hz_out3 = horiz_6tap_filt(src3, src3, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  hz_out4 = horiz_6tap_filt(src4, src4, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  filt = __lsx_vld(filter_vert, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1);
+  filt_vt2 = __lsx_vreplvei_h(filt, 2);
+
+  DUP4_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, hz_out2,
+            hz_out1, hz_out4, hz_out3, out0, out1, out3, out4);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src5, src6, src7, src8);
+    src += src_stride_x4;
+
+    DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5,
+              src6, src7, src8);
+    hz_out5 = horiz_6tap_filt(src5, src5, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    out2 = __lsx_vpackev_b(hz_out5, hz_out4);
+    tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+    hz_out6 = horiz_6tap_filt(src6, src6, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    out5 = __lsx_vpackev_b(hz_out6, hz_out5);
+    tmp1 = dpadd_h3(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
+
+    hz_out7 = horiz_6tap_filt(src7, src7, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    out7 = __lsx_vpackev_b(hz_out7, hz_out6);
+    tmp2 = dpadd_h3(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2);
+
+    hz_out8 = horiz_6tap_filt(src8, src8, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    out6 = __lsx_vpackev_b(hz_out8, hz_out7);
+    tmp3 = dpadd_h3(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2);
+
+    DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, VP8_FILTER_SHIFT, tmp3, tmp2,
+              VP8_FILTER_SHIFT, vec0, vec1);
+    DUP2_ARG2(__lsx_vxori_b, vec0, 128, vec1, 128, vec0, vec1);
+
+    __lsx_vstelm_d(vec0, dst, 0, 0);
+    __lsx_vstelm_d(vec0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(vec1, dst + dst_stride_x2, 0, 0);
+    __lsx_vstelm_d(vec1, dst + dst_stride_x3, 0, 1);
+    dst += dst_stride_x4;
+
+    hz_out4 = hz_out8;
+    out0 = out2;
+    out1 = out7;
+    out3 = out5;
+    out4 = out6;
+  }
+}
+
+static void common_hv_6ht_6vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                      uint8_t *RESTRICT dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height) {
+  common_hv_6ht_6vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                           filter_vert, height);
+  common_hv_6ht_6vt_8w_lsx(src + 8, src_stride, dst + 8, dst_stride,
+                           filter_horiz, filter_vert, height);
+}
+
+static void common_hz_4t_4x4_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter) {
+  __m128i src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+  __m128i out0, out1;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
+  src -= 1;
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+  mask1 = __lsx_vaddi_bu(mask0, 2);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride_x3);
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1,
+                             out0, out1);
+
+  out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT);
+  out0 = __lsx_vxori_b(out0, 128);
+
+  __lsx_vstelm_w(out0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 3);
+}
+
+static void common_hz_4t_4x8_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter) {
+  __m128i src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+  __m128i out0, out1, out2, out3;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
+  src -= 1;
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+  mask1 = __lsx_vaddi_bu(mask0, 2);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride_x3);
+  src += src_stride_x4;
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1,
+                             out0, out1);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride_x3);
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1,
+                             out2, out3);
+  DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2,
+            VP8_FILTER_SHIFT, out0, out1);
+  DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+  __lsx_vstelm_w(out0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 3);
+  dst += dst_stride;
+
+  __lsx_vstelm_w(out1, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(out1, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(out1, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(out1, dst, 0, 3);
+}
+
+static void common_hz_4t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  if (height == 4) {
+    common_hz_4t_4x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else if (height == 8) {
+    common_hz_4t_4x8_lsx(src, src_stride, dst, dst_stride, filter);
+  }
+}
+
+static void common_hz_4t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+  __m128i tmp0, tmp1;
+  __m128i filt, out0, out1, out2, out3;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
+  src -= 1;
+
+  filt = __lsx_vld(filter, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
+  mask1 = __lsx_vaddi_bu(mask0, 2);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src0, src1, src2, src3);
+    src += src_stride_x4;
+
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
+                               filt1, out0, out1, out2, out3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2,
+              VP8_FILTER_SHIFT, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+    __lsx_vstelm_d(tmp0, dst, 0, 0);
+    __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0);
+    __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1);
+    dst += dst_stride_x4;
+  }
+}
+
+static void common_hz_4t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i filt0, filt1, mask0, mask1;
+  __m128i filt, out0, out1, out2, out3, out4, out5, out6, out7;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
+  src -= 1;
+
+  filt = __lsx_vld(filter, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
+  mask1 = __lsx_vaddi_bu(mask0, 2);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src0, src2, src4, src6);
+    src += 8;
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src1, src3, src5, src7);
+    src += src_stride_x4 - 8;
+
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    DUP4_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src6, 128, src7, 128, src4,
+              src5, src6, src7);
+    HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
+                               filt1, out0, out1, out2, out3);
+    HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0,
+                               filt1, out4, out5, out6, out7);
+    DUP4_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2,
+              VP8_FILTER_SHIFT, out5, out4, VP8_FILTER_SHIFT, out7, out6,
+              VP8_FILTER_SHIFT, out0, out1, out2, out3);
+    DUP4_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out2, 128, out3, 128, out0,
+              out1, out2, out3);
+    __lsx_vstx(out0, dst, 0);
+    __lsx_vstx(out1, dst, dst_stride);
+    __lsx_vstx(out2, dst, dst_stride_x2);
+    __lsx_vstx(out3, dst, dst_stride_x3);
+    dst += dst_stride_x4;
+  }
+}
+
+static void common_vt_4t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  __m128i src0, src1, src2, src3, src4, src5;
+  __m128i src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
+  __m128i src2110, src4332, filt0, filt1, out0, out1;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+  DUP2_ARG2(__lsx_vldx, src, -src_stride, src, src_stride, src0, src2);
+  src1 = __lsx_vld(src, 0);
+  src += src_stride_x2;
+
+  DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
+
+  src2110 = __lsx_vilvl_d(src21_r, src10_r);
+  src2110 = __lsx_vxori_b(src2110, 128);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    src3 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src4, src5);
+    src += src_stride_x3;
+    DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
+    src4332 = __lsx_vilvl_d(src43_r, src32_r);
+    src4332 = __lsx_vxori_b(src4332, 128);
+    out0 = filt_4tap_dpadd_h(src2110, src4332, filt0, filt1);
+
+    src2 = __lsx_vld(src, 0);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src54_r, src65_r);
+    src2110 = __lsx_vilvl_d(src65_r, src54_r);
+    src2110 = __lsx_vxori_b(src2110, 128);
+    out1 = filt_4tap_dpadd_h(src4332, src2110, filt0, filt1);
+    out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT);
+    out0 = __lsx_vxori_b(out0, 128);
+
+    __lsx_vstelm_w(out0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 2);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 3);
+    dst += dst_stride;
+  }
+}
+
+static void common_vt_4t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i src0, src1, src2, src7, src8, src9, src10;
+  __m128i src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
+  __m128i tmp0, tmp1;
+  __m128i filt, out0_r, out1_r, out2_r, out3_r;
+
+  src -= src_stride;
+  filt = __lsx_vld(filter, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
+
+  DUP2_ARG2(__lsx_vldx, src, 0, src, src_stride, src0, src1);
+  src2 = __lsx_vldx(src, src_stride_x2);
+  src += src_stride_x3;
+
+  DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
+  src2 = __lsx_vxori_b(src2, 128);
+  DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src7, src8, src9, src10);
+    src += src_stride_x4;
+
+    DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+              src8, src9, src10);
+    DUP4_ARG2(__lsx_vilvl_b, src7, src2, src8, src7, src9, src8, src10, src9,
+              src72_r, src87_r, src98_r, src109_r);
+    out0_r = filt_4tap_dpadd_h(src10_r, src72_r, filt0, filt1);
+    out1_r = filt_4tap_dpadd_h(src21_r, src87_r, filt0, filt1);
+    out2_r = filt_4tap_dpadd_h(src72_r, src98_r, filt0, filt1);
+    out3_r = filt_4tap_dpadd_h(src87_r, src109_r, filt0, filt1);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1_r, out0_r, VP8_FILTER_SHIFT, out3_r,
+              out2_r, VP8_FILTER_SHIFT, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+    __lsx_vstelm_d(tmp0, dst, 0, 0);
+    __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0);
+    __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1);
+    dst += dst_stride_x4;
+
+    src10_r = src98_r;
+    src21_r = src109_r;
+    src2 = src10;
+  }
+}
+
+static void common_vt_4t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i src0, src1, src2, src3, src4, src5, src6;
+  __m128i src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
+  __m128i src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  __m128i filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+  src -= src_stride;
+  filt = __lsx_vld(filter, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
+
+  DUP2_ARG2(__lsx_vldx, src, 0, src, src_stride, src0, src1);
+  src2 = __lsx_vldx(src, src_stride_x2);
+  src += src_stride_x3;
+
+  DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
+  src2 = __lsx_vxori_b(src2, 128);
+  DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
+  DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src3, src4, src5, src6);
+    src += src_stride_x4;
+
+    DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3,
+              src4, src5, src6);
+    DUP4_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src5, src4, src6, src5,
+              src32_r, src43_r, src54_r, src65_r);
+    DUP4_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src5, src4, src6, src5,
+              src32_l, src43_l, src54_l, src65_l);
+    out0_r = filt_4tap_dpadd_h(src10_r, src32_r, filt0, filt1);
+    out1_r = filt_4tap_dpadd_h(src21_r, src43_r, filt0, filt1);
+    out2_r = filt_4tap_dpadd_h(src32_r, src54_r, filt0, filt1);
+    out3_r = filt_4tap_dpadd_h(src43_r, src65_r, filt0, filt1);
+    out0_l = filt_4tap_dpadd_h(src10_l, src32_l, filt0, filt1);
+    out1_l = filt_4tap_dpadd_h(src21_l, src43_l, filt0, filt1);
+    out2_l = filt_4tap_dpadd_h(src32_l, src54_l, filt0, filt1);
+    out3_l = filt_4tap_dpadd_h(src43_l, src65_l, filt0, filt1);
+    DUP4_ARG3(__lsx_vssrarni_b_h, out0_l, out0_r, VP8_FILTER_SHIFT, out1_l,
+              out1_r, VP8_FILTER_SHIFT, out2_l, out2_r, VP8_FILTER_SHIFT,
+              out3_l, out3_r, VP8_FILTER_SHIFT, tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp2, 128, tmp3, 128, tmp0,
+              tmp1, tmp2, tmp3);
+    __lsx_vstx(tmp0, dst, 0);
+    __lsx_vstx(tmp1, dst, dst_stride);
+    __lsx_vstx(tmp2, dst, dst_stride_x2);
+    __lsx_vstx(tmp3, dst, dst_stride_x3);
+    dst += dst_stride_x4;
+
+    src10_r = src54_r;
+    src21_r = src65_r;
+    src10_l = src54_l;
+    src21_l = src65_l;
+    src2 = src6;
+  }
+}
+
+static void common_hv_4ht_4vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height) {
+  uint32_t loop_cnt;
+  __m128i src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
+  __m128i mask0, mask1, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
+  src -= 1;
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0,
+            filt_hz1);
+  mask1 = __lsx_vaddi_bu(mask0, 2);
+
+  src1 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, -src_stride, src, src_stride, src0, src2);
+  src += src_stride_x2;
+
+  DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
+  src2 = __lsx_vxori_b(src2, 128);
+  hz_out0 = horiz_4tap_filt(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out1 = horiz_4tap_filt(src1, src2, mask0, mask1, filt_hz0, filt_hz1);
+  vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0,
+            filt_vt1);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    src3 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src4, src5);
+    src6 = __lsx_vldx(src, src_stride_x3);
+    src += src_stride_x4;
+
+    DUP2_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src3, src4);
+    hz_out3 = horiz_4tap_filt(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out2 = __lsx_vshuf_b(hz_out3, hz_out1, shuff);
+    vec1 = __lsx_vpackev_b(hz_out3, hz_out2);
+    tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1);
+
+    DUP2_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src5, src6);
+    hz_out5 = horiz_4tap_filt(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff);
+    vec2 = __lsx_vpackev_b(hz_out5, hz_out4);
+    tmp1 = filt_4tap_dpadd_h(vec1, vec2, filt_vt0, filt_vt1);
+
+    tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7);
+    tmp0 = __lsx_vxori_b(tmp0, 128);
+    __lsx_vstelm_w(tmp0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 2);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 3);
+    dst += dst_stride;
+
+    hz_out1 = hz_out5;
+    vec0 = vec2;
+  }
+}
+
+static inline void common_hv_4ht_4vt_8w_lsx(
+    uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst,
+    int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert,
+    int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
+  __m128i mask0, mask1, out0, out1;
+  __m128i filt, filt_vt0, filt_vt1, tmp0, tmp1, tmp2, tmp3;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3;
+  __m128i vec0, vec1, vec2, vec3, vec4;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
+  src -= 1 + src_stride;
+
+  filt = __lsx_vld(filter_horiz, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_hz0, filt_hz1);
+  mask1 = __lsx_vaddi_bu(mask0, 2);
+
+  DUP2_ARG2(__lsx_vldx, src, 0, src, src_stride, src0, src1);
+  src2 = __lsx_vldx(src, src_stride_x2);
+  src += src_stride_x3;
+
+  DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
+  src2 = __lsx_vxori_b(src2, 128);
+  hz_out0 = horiz_4tap_filt(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out1 = horiz_4tap_filt(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out2 = horiz_4tap_filt(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
+  DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out2, hz_out1, vec0, vec2);
+
+  filt = __lsx_vld(filter_vert, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src3, src4, src5, src6);
+    src += src_stride_x4;
+
+    DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3,
+              src4, src5, src6);
+    hz_out3 = horiz_4tap_filt(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
+    vec1 = __lsx_vpackev_b(hz_out3, hz_out2);
+    tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1);
+
+    hz_out0 = horiz_4tap_filt(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
+    vec3 = __lsx_vpackev_b(hz_out0, hz_out3);
+    tmp1 = filt_4tap_dpadd_h(vec2, vec3, filt_vt0, filt_vt1);
+
+    hz_out1 = horiz_4tap_filt(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
+    vec4 = __lsx_vpackev_b(hz_out1, hz_out0);
+    tmp2 = filt_4tap_dpadd_h(vec1, vec4, filt_vt0, filt_vt1);
+
+    hz_out2 = horiz_4tap_filt(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out3, hz_out2, hz_out1, vec0, vec1);
+    tmp3 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1);
+
+    DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(out1, dst + dst_stride_x2, 0, 0);
+    __lsx_vstelm_d(out1, dst + dst_stride_x3, 0, 1);
+    dst += dst_stride_x4;
+
+    vec0 = vec4;
+    vec2 = vec1;
+  }
+}
+
+static void common_hv_4ht_4vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                      uint8_t *RESTRICT dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height) {
+  common_hv_4ht_4vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                           filter_vert, height);
+  common_hv_4ht_4vt_8w_lsx(src + 8, src_stride, dst + 8, dst_stride,
+                           filter_horiz, filter_vert, height);
+}
+
+static void common_hv_6ht_4vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height) {
+  uint32_t loop_cnt;
+  __m128i src0, src1, src2, src3, src4, src5, src6;
+  __m128i filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
+  __m128i filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
+  src -= 2;
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0,
+            filt_hz1);
+  filt_hz2 = __lsx_vldrepl_h(filter_horiz, 4);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+
+  src1 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, -src_stride, src, src_stride, src0, src2);
+  src += src_stride_x2;
+
+  DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
+  src2 = __lsx_vxori_b(src2, 128);
+
+  hz_out0 = horiz_6tap_filt(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  hz_out1 = horiz_6tap_filt(src1, src2, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0,
+            filt_vt1);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    src3 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src4, src5);
+    src6 = __lsx_vldx(src, src_stride_x3);
+    src += src_stride_x4;
+    DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3,
+              src4, src5, src6);
+
+    hz_out3 = horiz_6tap_filt(src3, src4, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out2 = __lsx_vshuf_b(hz_out3, hz_out1, shuff);
+    vec1 = __lsx_vpackev_b(hz_out3, hz_out2);
+    tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1);
+
+    hz_out5 = horiz_6tap_filt(src5, src6, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff);
+    vec2 = __lsx_vpackev_b(hz_out5, hz_out4);
+    tmp1 = filt_4tap_dpadd_h(vec1, vec2, filt_vt0, filt_vt1);
+
+    DUP2_ARG3(__lsx_vssrarni_b_h, tmp0, tmp0, 7, tmp1, tmp1, 7, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+
+    __lsx_vstelm_w(tmp0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp1, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp1, dst, 0, 1);
+    dst += dst_stride;
+
+    hz_out1 = hz_out5;
+    vec0 = vec2;
+  }
+}
+
+static inline void common_hv_6ht_4vt_8w_lsx(
+    uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst,
+    int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert,
+    int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+
+  __m128i src0, src1, src2, src3, src4, src5, src6;
+  __m128i filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
+  __m128i filt, filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3;
+  __m128i tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3;
+  __m128i out0, out1;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
+  src -= (2 + src_stride);
+
+  filt = __lsx_vld(filter_horiz, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_hz0, filt_hz1);
+  filt_hz2 = __lsx_vreplvei_h(filt, 2);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+
+  DUP2_ARG2(__lsx_vldx, src, 0, src, src_stride, src0, src1);
+  src2 = __lsx_vldx(src, src_stride_x2);
+  src += src_stride_x3;
+
+  DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
+  src2 = __lsx_vxori_b(src2, 128);
+  hz_out0 = horiz_6tap_filt(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  hz_out1 = horiz_6tap_filt(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  hz_out2 = horiz_6tap_filt(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out2, hz_out1, vec0, vec2);
+
+  filt = __lsx_vld(filter_vert, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src3, src4, src5, src6);
+    src += src_stride_x4;
+    DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3,
+              src4, src5, src6);
+
+    hz_out3 = horiz_6tap_filt(src3, src3, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    vec1 = __lsx_vpackev_b(hz_out3, hz_out2);
+    tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1);
+
+    hz_out0 = horiz_6tap_filt(src4, src4, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    vec3 = __lsx_vpackev_b(hz_out0, hz_out3);
+    tmp1 = filt_4tap_dpadd_h(vec2, vec3, filt_vt0, filt_vt1);
+
+    hz_out1 = horiz_6tap_filt(src5, src5, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+    tmp2 = filt_4tap_dpadd_h(vec1, vec0, filt_vt0, filt_vt1);
+
+    hz_out2 = horiz_6tap_filt(src6, src6, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out3, hz_out2, hz_out1, vec1, vec2);
+    tmp3 = filt_4tap_dpadd_h(vec1, vec2, filt_vt0, filt_vt1);
+
+    DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(out1, dst + dst_stride_x2, 0, 0);
+    __lsx_vstelm_d(out1, dst + dst_stride_x3, 0, 1);
+    dst += dst_stride_x4;
+  }
+}
+
+static void common_hv_6ht_4vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                      uint8_t *RESTRICT dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height) {
+  common_hv_6ht_4vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                           filter_vert, height);
+  common_hv_6ht_4vt_8w_lsx(src + 8, src_stride, dst + 8, dst_stride,
+                           filter_horiz, filter_vert, height);
+}
+
+static void common_hv_4ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height) {
+  uint32_t loop_cnt;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+  __m128i filt_hz0, filt_hz1, filt_vt0, filt_vt1, filt_vt2, mask0, mask1;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  __m128i hz_out7, tmp0, tmp1, out0, out1, out2, out3;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
+
+  src -= 1;
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0,
+            filt_hz1);
+  mask1 = __lsx_vaddi_bu(mask0, 2);
+
+  DUP4_ARG2(__lsx_vldx, src, -src_stride_x2, src, -src_stride, src, src_stride,
+            src, src_stride_x2, src0, src1, src3, src4);
+  src2 = __lsx_vld(src, 0);
+  src += src_stride_x3;
+
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  src4 = __lsx_vxori_b(src4, 128);
+  hz_out0 = horiz_4tap_filt(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out2 = horiz_4tap_filt(src2, src3, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out3 = horiz_4tap_filt(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff);
+  DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1);
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0,
+            filt_vt1);
+  filt_vt2 = __lsx_vldrepl_h(filter_vert, 4);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    src5 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src6, src7);
+    src8 = __lsx_vldx(src, src_stride_x3);
+    DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5,
+              src6, src7, src8);
+    src += src_stride_x4;
+
+    hz_out5 = horiz_4tap_filt(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff);
+    out2 = __lsx_vpackev_b(hz_out5, hz_out4);
+    tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+    hz_out7 = horiz_4tap_filt(src7, src8, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out6 = __lsx_vshuf_b(hz_out7, hz_out5, shuff);
+    out3 = __lsx_vpackev_b(hz_out7, hz_out6);
+    tmp1 = dpadd_h3(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
+
+    tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7);
+    tmp0 = __lsx_vxori_b(tmp0, 128);
+    __lsx_vstelm_w(tmp0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 2);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 3);
+    dst += dst_stride;
+
+    hz_out3 = hz_out7;
+    out0 = out2;
+    out1 = out3;
+  }
+}
+
+static inline void common_hv_4ht_6vt_8w_lsx(
+    uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst,
+    int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert,
+    int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+  __m128i filt_hz0, filt_hz1, mask0, mask1;
+  __m128i filt, filt_vt0, filt_vt1, filt_vt2, tmp0, tmp1, tmp2, tmp3;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  __m128i hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
+  __m128i vec0, vec1;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
+  src -= 1 + src_stride_x2;
+
+  filt = __lsx_vld(filter_horiz, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_hz0, filt_hz1);
+  mask1 = __lsx_vaddi_bu(mask0, 2);
+
+  DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+            src_stride_x3, src0, src1, src2, src3);
+  src += src_stride_x4;
+  src4 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  src4 = __lsx_vxori_b(src4, 128);
+  hz_out0 = horiz_4tap_filt(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out1 = horiz_4tap_filt(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out2 = horiz_4tap_filt(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out3 = horiz_4tap_filt(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out4 = horiz_4tap_filt(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
+  DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1);
+  DUP2_ARG2(__lsx_vpackev_b, hz_out2, hz_out1, hz_out4, hz_out3, out3, out4);
+
+  filt = __lsx_vld(filter_vert, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1);
+  filt_vt2 = __lsx_vreplvei_h(filt, 2);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src5, src6, src7, src8);
+    src += src_stride_x4;
+
+    DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5,
+              src6, src7, src8);
+    hz_out5 = horiz_4tap_filt(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
+    out2 = __lsx_vpackev_b(hz_out5, hz_out4);
+    tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+    hz_out6 = horiz_4tap_filt(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
+    out5 = __lsx_vpackev_b(hz_out6, hz_out5);
+    tmp1 = dpadd_h3(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
+
+    hz_out7 = horiz_4tap_filt(src7, src7, mask0, mask1, filt_hz0, filt_hz1);
+    out6 = __lsx_vpackev_b(hz_out7, hz_out6);
+    tmp2 = dpadd_h3(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2);
+
+    hz_out8 = horiz_4tap_filt(src8, src8, mask0, mask1, filt_hz0, filt_hz1);
+    out7 = __lsx_vpackev_b(hz_out8, hz_out7);
+    tmp3 = dpadd_h3(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2);
+    DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, vec0, vec1);
+    DUP2_ARG2(__lsx_vxori_b, vec0, 128, vec1, 128, vec0, vec1);
+    __lsx_vstelm_d(vec0, dst, 0, 0);
+    __lsx_vstelm_d(vec0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(vec1, dst + dst_stride_x2, 0, 0);
+    __lsx_vstelm_d(vec1, dst + dst_stride_x3, 0, 1);
+    dst += dst_stride_x4;
+    hz_out4 = hz_out8;
+    out0 = out2;
+    out1 = out6;
+    out3 = out5;
+    out4 = out7;
+  }
+}
+
+static void common_hv_4ht_6vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                      uint8_t *RESTRICT dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height) {
+  common_hv_4ht_6vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                           filter_vert, height);
+  common_hv_4ht_6vt_8w_lsx(src + 8, src_stride, dst + 8, dst_stride,
+                           filter_horiz, filter_vert, height);
+}
+
+typedef void (*PVp8SixtapPredictFunc1)(
+    uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst,
+    int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert,
+    int32_t height);
+
+typedef void (*PVp8SixtapPredictFunc2)(uint8_t *RESTRICT src,
+                                       int32_t src_stride,
+                                       uint8_t *RESTRICT dst,
+                                       int32_t dst_stride, const int8_t *filter,
+                                       int32_t height);
+
+void vp8_sixtap_predict4x4_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                               int32_t xoffset, int32_t yoffset,
+                               uint8_t *RESTRICT dst, int32_t dst_stride) {
+  const int8_t *h_filter = vp8_subpel_filters_lsx[xoffset - 1];
+  const int8_t *v_filter = vp8_subpel_filters_lsx[yoffset - 1];
+
+  static PVp8SixtapPredictFunc1 Predict4x4Funcs1[4] = {
+    common_hv_6ht_6vt_4w_lsx,
+    common_hv_6ht_4vt_4w_lsx,
+    common_hv_4ht_6vt_4w_lsx,
+    common_hv_4ht_4vt_4w_lsx,
+  };
+
+  static PVp8SixtapPredictFunc2 Predict4x4Funcs2[4] = { common_vt_6t_4w_lsx,
+                                                        common_vt_4t_4w_lsx,
+                                                        common_hz_6t_4w_lsx,
+                                                        common_hz_4t_4w_lsx };
+  if (yoffset < 8 && xoffset < 8) {
+    if (yoffset) {
+      if (xoffset) {
+        switch (xoffset & 1) {
+          case 0:
+            switch (yoffset & 1) {
+              case 0:
+                Predict4x4Funcs1[0](src, src_stride, dst, dst_stride, h_filter,
+                                    v_filter, 4);
+                break;
+              case 1:
+                Predict4x4Funcs1[1](src, src_stride, dst, dst_stride, h_filter,
+                                    v_filter + 1, 4);
+                break;
+            }
+            break;
+
+          case 1:
+            switch (yoffset & 1) {
+              case 0:
+                Predict4x4Funcs1[2](src, src_stride, dst, dst_stride,
+                                    h_filter + 1, v_filter, 4);
+                break;
+
+              case 1:
+                Predict4x4Funcs1[3](src, src_stride, dst, dst_stride,
+                                    h_filter + 1, v_filter + 1, 4);
+                break;
+            }
+            break;
+        }
+      } else {
+        switch (yoffset & 1) {
+          case 0:
+            Predict4x4Funcs2[0](src, src_stride, dst, dst_stride, v_filter, 4);
+            break;
+
+          case 1:
+            Predict4x4Funcs2[1](src, src_stride, dst, dst_stride, v_filter + 1,
+                                4);
+            break;
+        }
+      }
+    } else {
+      switch (xoffset) {
+        case 0: {
+          __m128i tp0;
+
+          tp0 = __lsx_vldrepl_w(src, 0);
+          src += src_stride;
+          __lsx_vstelm_w(tp0, dst, 0, 0);
+          dst += dst_stride;
+          tp0 = __lsx_vldrepl_w(src, 0);
+          src += src_stride;
+          __lsx_vstelm_w(tp0, dst, 0, 0);
+          dst += dst_stride;
+          tp0 = __lsx_vldrepl_w(src, 0);
+          src += src_stride;
+          __lsx_vstelm_w(tp0, dst, 0, 0);
+          dst += dst_stride;
+          tp0 = __lsx_vldrepl_w(src, 0);
+          __lsx_vstelm_w(tp0, dst, 0, 0);
+
+          break;
+        }
+        case 2:
+        case 4:
+        case 6:
+          Predict4x4Funcs2[2](src, src_stride, dst, dst_stride, h_filter, 4);
+          break;
+      }
+      switch (xoffset & 1) {
+        case 1:
+          Predict4x4Funcs2[3](src, src_stride, dst, dst_stride, h_filter + 1,
+                              4);
+          break;
+      }
+    }
+  }
+}
+
+void vp8_sixtap_predict8x8_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                               int32_t xoffset, int32_t yoffset,
+                               uint8_t *RESTRICT dst, int32_t dst_stride) {
+  const int8_t *h_filter = vp8_subpel_filters_lsx[xoffset - 1];
+  const int8_t *v_filter = vp8_subpel_filters_lsx[yoffset - 1];
+
+  static PVp8SixtapPredictFunc1 Predict8x8Funcs1[4] = {
+    common_hv_6ht_6vt_8w_lsx,
+    common_hv_6ht_4vt_8w_lsx,
+    common_hv_4ht_6vt_8w_lsx,
+    common_hv_4ht_4vt_8w_lsx,
+  };
+
+  static PVp8SixtapPredictFunc2 Predict8x8Funcs2[4] = { common_vt_6t_8w_lsx,
+                                                        common_vt_4t_8w_lsx,
+                                                        common_hz_6t_8w_lsx,
+                                                        common_hz_4t_8w_lsx };
+
+  if (yoffset < 8 && xoffset < 8) {
+    if (yoffset) {
+      if (xoffset) {
+        switch (xoffset & 1) {
+          case 0:
+            switch (yoffset & 1) {
+              case 0:
+                Predict8x8Funcs1[0](src, src_stride, dst, dst_stride, h_filter,
+                                    v_filter, 8);
+                break;
+
+              case 1:
+                Predict8x8Funcs1[1](src, src_stride, dst, dst_stride, h_filter,
+                                    v_filter + 1, 8);
+                break;
+            }
+            break;
+
+          case 1:
+            switch (yoffset & 1) {
+              case 0:
+                Predict8x8Funcs1[2](src, src_stride, dst, dst_stride,
+                                    h_filter + 1, v_filter, 8);
+                break;
+
+              case 1:
+                Predict8x8Funcs1[3](src, src_stride, dst, dst_stride,
+                                    h_filter + 1, v_filter + 1, 8);
+                break;
+            }
+            break;
+        }
+      } else {
+        switch (yoffset & 1) {
+          case 0:
+            Predict8x8Funcs2[0](src, src_stride, dst, dst_stride, v_filter, 8);
+            break;
+
+          case 1:
+            Predict8x8Funcs2[1](src, src_stride, dst, dst_stride, v_filter + 1,
+                                8);
+            break;
+        }
+      }
+    } else {
+      switch (xoffset & 1) {
+        case 1:
+          Predict8x8Funcs2[3](src, src_stride, dst, dst_stride, h_filter + 1,
+                              8);
+          break;
+      }
+      switch (xoffset) {
+        case 0: vp8_copy_mem8x8(src, src_stride, dst, dst_stride); break;
+        case 2:
+        case 4:
+        case 6:
+          Predict8x8Funcs2[2](src, src_stride, dst, dst_stride, h_filter, 8);
+          break;
+      }
+    }
+  }
+}
+
+void vp8_sixtap_predict16x16_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                 int32_t xoffset, int32_t yoffset,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride) {
+  const int8_t *h_filter = vp8_subpel_filters_lsx[xoffset - 1];
+  const int8_t *v_filter = vp8_subpel_filters_lsx[yoffset - 1];
+
+  static PVp8SixtapPredictFunc1 Predict16x16Funcs1[4] = {
+    common_hv_6ht_6vt_16w_lsx,
+    common_hv_6ht_4vt_16w_lsx,
+    common_hv_4ht_6vt_16w_lsx,
+    common_hv_4ht_4vt_16w_lsx,
+  };
+
+  static PVp8SixtapPredictFunc2 Predict16x16Funcs2[4] = {
+    common_vt_6t_16w_lsx, common_vt_4t_16w_lsx, common_hz_6t_16w_lsx,
+    common_hz_4t_16w_lsx
+  };
+
+  if (yoffset < 8 && xoffset < 8) {
+    if (yoffset) {
+      if (xoffset) {
+        switch (xoffset & 1) {
+          case 0:
+            switch (yoffset & 1) {
+              case 0:
+                Predict16x16Funcs1[0](src, src_stride, dst, dst_stride,
+                                      h_filter, v_filter, 16);
+                break;
+
+              case 1:
+                Predict16x16Funcs1[1](src, src_stride, dst, dst_stride,
+                                      h_filter, v_filter + 1, 16);
+                break;
+            }
+            break;
+
+          case 1:
+            switch (yoffset & 1) {
+              case 0:
+                Predict16x16Funcs1[2](src, src_stride, dst, dst_stride,
+                                      h_filter + 1, v_filter, 16);
+                break;
+
+              case 1:
+                Predict16x16Funcs1[3](src, src_stride, dst, dst_stride,
+                                      h_filter + 1, v_filter + 1, 16);
+                break;
+            }
+            break;
+        }
+      } else {
+        switch (yoffset & 1) {
+          case 0:
+            Predict16x16Funcs2[0](src, src_stride, dst, dst_stride, v_filter,
+                                  16);
+            break;
+
+          case 1:
+            Predict16x16Funcs2[1](src, src_stride, dst, dst_stride,
+                                  v_filter + 1, 16);
+            break;
+        }
+      }
+    } else {
+      switch (xoffset & 1) {
+        case 1:
+          Predict16x16Funcs2[3](src, src_stride, dst, dst_stride, h_filter + 1,
+                                16);
+          break;
+      }
+      switch (xoffset) {
+        case 0: vp8_copy_mem16x16(src, src_stride, dst, dst_stride); break;
+        case 2:
+        case 4:
+        case 6:
+          Predict16x16Funcs2[2](src, src_stride, dst, dst_stride, h_filter, 16);
+          break;
+      }
+    }
+  }
+}
diff --git a/vp8/common/mips/dspr2/filter_dspr2.c b/vp8/common/mips/dspr2/filter_dspr2.c
index e46827b0e4..b9da52084d 100644
--- a/vp8/common/mips/dspr2/filter_dspr2.c
+++ b/vp8/common/mips/dspr2/filter_dspr2.c
@@ -816,8 +816,8 @@ void vp8_filter_block2d_first_pass16_0(unsigned char *RESTRICT src_ptr,
 
         : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
           [Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr)
-        : [src_pixels_per_line] "r"(src_pixels_per_line),
-          [output_ptr] "r"(output_ptr));
+        : [src_pixels_per_line] "r"(src_pixels_per_line), [output_ptr] "r"(
+                                                              output_ptr));
 
     __asm__ __volatile__(
         "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
@@ -832,8 +832,8 @@ void vp8_filter_block2d_first_pass16_0(unsigned char *RESTRICT src_ptr,
 
         : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
           [Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr)
-        : [src_pixels_per_line] "r"(src_pixels_per_line),
-          [output_ptr] "r"(output_ptr));
+        : [src_pixels_per_line] "r"(src_pixels_per_line), [output_ptr] "r"(
+                                                              output_ptr));
 
     __asm__ __volatile__(
         "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
@@ -848,8 +848,8 @@ void vp8_filter_block2d_first_pass16_0(unsigned char *RESTRICT src_ptr,
 
         : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
           [Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr)
-        : [src_pixels_per_line] "r"(src_pixels_per_line),
-          [output_ptr] "r"(output_ptr));
+        : [src_pixels_per_line] "r"(src_pixels_per_line), [output_ptr] "r"(
+                                                              output_ptr));
 
     output_ptr += 48;
   }
diff --git a/vp8/common/mips/mmi/dequantize_mmi.c b/vp8/common/mips/mmi/dequantize_mmi.c
index b3f8084aee..b9330a6663 100644
--- a/vp8/common/mips/mmi/dequantize_mmi.c
+++ b/vp8/common/mips/mmi/dequantize_mmi.c
@@ -100,7 +100,7 @@ void vp8_dequant_idct_add_mmi(int16_t *input, int16_t *dq, unsigned char *dest,
   vp8_short_idct4x4llm_mmi(input, dest, stride, dest, stride);
 
   __asm__ volatile(
-      "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]        \n\t"
+      "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]        \n\t"
       "gssdlc1    %[ftmp0],   0x07(%[input])                  \n\t"
       "gssdrc1    %[ftmp0],   0x00(%[input])                  \n\t"
       "sdl        $0,         0x0f(%[input])                  \n\t"
diff --git a/vp8/common/mips/mmi/idctllm_mmi.c b/vp8/common/mips/mmi/idctllm_mmi.c
index 5e48f59166..a35689dd30 100644
--- a/vp8/common/mips/mmi/idctllm_mmi.c
+++ b/vp8/common/mips/mmi/idctllm_mmi.c
@@ -13,25 +13,25 @@
 #include "vpx_ports/asmdefs_mmi.h"
 
 #define TRANSPOSE_4H \
-  "xor           %[ftmp0],    %[ftmp0],    %[ftmp0]          \n\t" \
+  "pxor          %[ftmp0],    %[ftmp0],    %[ftmp0]          \n\t" \
   MMI_LI(%[tmp0], 0x93)                                            \
   "mtc1          %[tmp0],     %[ftmp10]                      \n\t" \
   "punpcklhw     %[ftmp5],    %[ftmp1],    %[ftmp0]          \n\t" \
   "punpcklhw     %[ftmp9],    %[ftmp2],    %[ftmp0]          \n\t" \
   "pshufh        %[ftmp9],    %[ftmp9],    %[ftmp10]         \n\t" \
-  "or            %[ftmp5],    %[ftmp5],    %[ftmp9]          \n\t" \
+  "por           %[ftmp5],    %[ftmp5],    %[ftmp9]          \n\t" \
   "punpckhhw     %[ftmp6],    %[ftmp1],    %[ftmp0]          \n\t" \
   "punpckhhw     %[ftmp9],    %[ftmp2],    %[ftmp0]          \n\t" \
   "pshufh        %[ftmp9],    %[ftmp9],    %[ftmp10]         \n\t" \
-  "or            %[ftmp6],    %[ftmp6],    %[ftmp9]          \n\t" \
+  "por           %[ftmp6],    %[ftmp6],    %[ftmp9]          \n\t" \
   "punpcklhw     %[ftmp7],    %[ftmp3],    %[ftmp0]          \n\t" \
   "punpcklhw     %[ftmp9],    %[ftmp4],    %[ftmp0]          \n\t" \
   "pshufh        %[ftmp9],    %[ftmp9],    %[ftmp10]         \n\t" \
-  "or            %[ftmp7],    %[ftmp7],    %[ftmp9]          \n\t" \
+  "por           %[ftmp7],    %[ftmp7],    %[ftmp9]          \n\t" \
   "punpckhhw     %[ftmp8],    %[ftmp3],    %[ftmp0]          \n\t" \
   "punpckhhw     %[ftmp9],    %[ftmp4],    %[ftmp0]          \n\t" \
   "pshufh        %[ftmp9],    %[ftmp9],    %[ftmp10]         \n\t" \
-  "or            %[ftmp8],    %[ftmp8],    %[ftmp9]          \n\t" \
+  "por           %[ftmp8],    %[ftmp8],    %[ftmp9]          \n\t" \
   "punpcklwd     %[ftmp1],    %[ftmp5],    %[ftmp7]          \n\t" \
   "punpckhwd     %[ftmp2],    %[ftmp5],    %[ftmp7]          \n\t" \
   "punpcklwd     %[ftmp3],    %[ftmp6],    %[ftmp8]          \n\t" \
@@ -41,15 +41,19 @@ void vp8_short_idct4x4llm_mmi(int16_t *input, unsigned char *pred_ptr,
                               int pred_stride, unsigned char *dst_ptr,
                               int dst_stride) {
   double ftmp[12];
-  uint32_t tmp[0];
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_04) = { 0x0004000400040004ULL };
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_4e7b) = { 0x4e7b4e7b4e7b4e7bULL };
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_22a3) = { 0x22a322a322a322a3ULL };
+  uint64_t tmp[1];
+  double ff_ph_04, ff_ph_4e7b, ff_ph_22a3;
 
   __asm__ volatile (
+    "dli        %[tmp0],    0x0004000400040004                  \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_04]                         \n\t"
+    "dli        %[tmp0],    0x4e7b4e7b4e7b4e7b                  \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_4e7b]                       \n\t"
+    "dli        %[tmp0],    0x22a322a322a322a3                  \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_22a3]                       \n\t"
     MMI_LI(%[tmp0], 0x02)
-    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
-    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    "dmtc1      %[tmp0],    %[ftmp11]                           \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
 
     "gsldlc1    %[ftmp1],   0x07(%[ip])                         \n\t"
     "gsldrc1    %[ftmp1],   0x00(%[ip])                         \n\t"
@@ -186,9 +190,10 @@ void vp8_short_idct4x4llm_mmi(int16_t *input, unsigned char *pred_ptr,
       [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]),
       [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]),
       [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]),
-      [pred_ptr]"+&r"(pred_ptr), [dst_ptr]"+&r"(dst_ptr)
-    : [ip]"r"(input), [ff_ph_22a3]"f"(ff_ph_22a3),
-      [ff_ph_4e7b]"f"(ff_ph_4e7b), [ff_ph_04]"f"(ff_ph_04),
+      [pred_ptr]"+&r"(pred_ptr), [dst_ptr]"+&r"(dst_ptr),
+      [ff_ph_4e7b]"=&f"(ff_ph_4e7b), [ff_ph_04]"=&f"(ff_ph_04),
+      [ff_ph_22a3]"=&f"(ff_ph_22a3)
+    : [ip]"r"(input),
       [pred_stride]"r"((mips_reg)pred_stride),
       [dst_stride]"r"((mips_reg)dst_stride)
     : "memory"
@@ -198,12 +203,13 @@ void vp8_short_idct4x4llm_mmi(int16_t *input, unsigned char *pred_ptr,
 void vp8_dc_only_idct_add_mmi(int16_t input_dc, unsigned char *pred_ptr,
                               int pred_stride, unsigned char *dst_ptr,
                               int dst_stride) {
-  int a1 = ((input_dc + 4) >> 3);
-  double ftmp[5];
+  int a0 = ((input_dc + 4) >> 3);
+  double a1, ftmp[5];
   int low32;
 
   __asm__ volatile (
-    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]        \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]        \n\t"
+    "dmtc1      %[a0],      %[a1]                           \n\t"
     "pshufh     %[a1],      %[a1],          %[ftmp0]        \n\t"
     "ulw        %[low32],   0x00(%[pred_ptr])               \n\t"
     "mtc1       %[low32],   %[ftmp1]                        \n\t"
@@ -244,9 +250,9 @@ void vp8_dc_only_idct_add_mmi(int16_t input_dc, unsigned char *pred_ptr,
     "gsswrc1    %[ftmp1],   0x00(%[dst_ptr])                \n\t"
     : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]),
       [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [low32]"=&r"(low32),
-      [dst_ptr]"+&r"(dst_ptr), [pred_ptr]"+&r"(pred_ptr)
+      [dst_ptr]"+&r"(dst_ptr), [pred_ptr]"+&r"(pred_ptr), [a1]"=&f"(a1)
     : [dst_stride]"r"((mips_reg)dst_stride),
-      [pred_stride]"r"((mips_reg)pred_stride), [a1]"f"(a1)
+      [pred_stride]"r"((mips_reg)pred_stride), [a0]"r"(a0)
     : "memory"
   );
 }
@@ -254,14 +260,15 @@ void vp8_dc_only_idct_add_mmi(int16_t input_dc, unsigned char *pred_ptr,
 void vp8_short_inv_walsh4x4_mmi(int16_t *input, int16_t *mb_dqcoeff) {
   int i;
   int16_t output[16];
-  double ftmp[12];
-  uint32_t tmp[1];
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_03) = { 0x0003000300030003ULL };
+  double ff_ph_03, ftmp[12];
+  uint64_t tmp[1];
 
   __asm__ volatile (
+    "dli        %[tmp0],    0x0003000300030003                  \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_03]                         \n\t"
     MMI_LI(%[tmp0], 0x03)
-    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
-    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    "dmtc1      %[tmp0],    %[ftmp11]                           \n\t"
     "gsldlc1    %[ftmp1],   0x07(%[ip])                         \n\t"
     "gsldrc1    %[ftmp1],   0x00(%[ip])                         \n\t"
     "gsldlc1    %[ftmp2],   0x0f(%[ip])                         \n\t"
@@ -317,8 +324,8 @@ void vp8_short_inv_walsh4x4_mmi(int16_t *input, int16_t *mb_dqcoeff) {
       [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
       [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]),
       [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]),
-      [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0])
-    : [ip]"r"(input), [op]"r"(output), [ff_ph_03]"f"(ff_ph_03)
+      [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), [ff_ph_03]"=&f"(ff_ph_03)
+    : [ip]"r"(input), [op]"r"(output)
     : "memory"
   );
 
diff --git a/vp8/common/mips/mmi/loopfilter_filters_mmi.c b/vp8/common/mips/mmi/loopfilter_filters_mmi.c
index f2182f95c9..a07a7e3b41 100644
--- a/vp8/common/mips/mmi/loopfilter_filters_mmi.c
+++ b/vp8/common/mips/mmi/loopfilter_filters_mmi.c
@@ -13,28 +13,25 @@
 #include "vp8/common/onyxc_int.h"
 #include "vpx_ports/asmdefs_mmi.h"
 
-DECLARE_ALIGNED(8, static const uint64_t, ff_ph_01) = { 0x0001000100010001ULL };
-DECLARE_ALIGNED(8, static const uint64_t,
-                ff_ph_003f) = { 0x003f003f003f003fULL };
-DECLARE_ALIGNED(8, static const uint64_t,
-                ff_ph_0900) = { 0x0900090009000900ULL };
-DECLARE_ALIGNED(8, static const uint64_t,
-                ff_ph_1200) = { 0x1200120012001200ULL };
-DECLARE_ALIGNED(8, static const uint64_t,
-                ff_ph_1b00) = { 0x1b001b001b001b00ULL };
-DECLARE_ALIGNED(8, static const uint64_t, ff_pb_fe) = { 0xfefefefefefefefeULL };
-DECLARE_ALIGNED(8, static const uint64_t, ff_pb_80) = { 0x8080808080808080ULL };
-DECLARE_ALIGNED(8, static const uint64_t, ff_pb_04) = { 0x0404040404040404ULL };
-DECLARE_ALIGNED(8, static const uint64_t, ff_pb_03) = { 0x0303030303030303ULL };
-DECLARE_ALIGNED(8, static const uint64_t, ff_pb_01) = { 0x0101010101010101ULL };
-
 void vp8_loop_filter_horizontal_edge_mmi(
     unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
     const unsigned char *limit, const unsigned char *thresh, int count) {
-  uint32_t tmp[1];
+  uint64_t tmp[1];
   mips_reg addr[2];
   double ftmp[12];
+  double ff_ph_01, ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03;
+  /* clang-format off */
   __asm__ volatile (
+    "dli        %[tmp0],    0x0001000100010001                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_01]                             \n\t"
+    "dli        %[tmp0],    0xfefefefefefefefe                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_fe]                             \n\t"
+    "dli        %[tmp0],    0x8080808080808080                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_80]                             \n\t"
+    "dli        %[tmp0],    0x0404040404040404                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_04]                             \n\t"
+    "dli        %[tmp0],    0x0303030303030303                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_03]                             \n\t"
     "1:                                                             \n\t"
     "gsldlc1    %[ftmp10],  0x07(%[limit])                          \n\t"
     "gsldrc1    %[ftmp10],  0x00(%[limit])                          \n\t"
@@ -56,14 +53,14 @@ void vp8_loop_filter_horizontal_edge_mmi(
     "gsldrc1    %[ftmp4],   0x00(%[addr1])                          \n\t"
     "pasubub    %[ftmp1],   %[ftmp3],           %[ftmp4]            \n\t"
     "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp10]           \n\t"
-    "or         %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
 
     MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
     "gsldlc1    %[ftmp5],   0x07(%[addr1])                          \n\t"
     "gsldrc1    %[ftmp5],   0x00(%[addr1])                          \n\t"
     "pasubub    %[ftmp9],   %[ftmp4],           %[ftmp5]            \n\t"
     "psubusb    %[ftmp1],   %[ftmp9],           %[ftmp10]           \n\t"
-    "or         %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
 
     "gsldlc1    %[ftmp6],   0x07(%[src_ptr])                        \n\t"
     "gsldrc1    %[ftmp6],   0x00(%[src_ptr])                        \n\t"
@@ -72,35 +69,35 @@ void vp8_loop_filter_horizontal_edge_mmi(
     "gsldrc1    %[ftmp7],   0x00(%[addr0])                          \n\t"
     "pasubub    %[ftmp11],  %[ftmp7],           %[ftmp6]            \n\t"
     "psubusb    %[ftmp1],   %[ftmp11],          %[ftmp10]           \n\t"
-    "or         %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
 
     MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
     "gsldlc1    %[ftmp8],   0x07(%[addr1])                          \n\t"
     "gsldrc1    %[ftmp8],   0x00(%[addr1])                          \n\t"
     "pasubub    %[ftmp1],   %[ftmp8],           %[ftmp7]            \n\t"
     "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp10]           \n\t"
-    "or         %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
 
     MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2])
     "gsldlc1    %[ftmp2],   0x07(%[addr1])                          \n\t"
     "gsldrc1    %[ftmp2],   0x00(%[addr1])                          \n\t"
     "pasubub    %[ftmp1],   %[ftmp2],           %[ftmp8]            \n\t"
     "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp10]           \n\t"
-    "or         %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
 
     "pasubub    %[ftmp1],   %[ftmp5],           %[ftmp6]            \n\t"
     "paddusb    %[ftmp1],   %[ftmp1],           %[ftmp1]            \n\t"
     "pasubub    %[ftmp2],   %[ftmp4],           %[ftmp7]            \n\t"
-    "and        %[ftmp2],   %[ftmp2],           %[ff_pb_fe]         \n\t"
-    "li         %[tmp0],    0x01                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp10]                               \n\t"
+    "pand       %[ftmp2],   %[ftmp2],           %[ff_pb_fe]         \n\t"
+    "dli        %[tmp0],    0x01                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp10]                               \n\t"
     "psrlh      %[ftmp2],   %[ftmp2],           %[ftmp10]           \n\t"
     "paddusb    %[ftmp1],   %[ftmp1],           %[ftmp2]            \n\t"
     "gsldlc1    %[ftmp10],  0x07(%[blimit])                         \n\t"
     "gsldrc1    %[ftmp10],  0x00(%[blimit])                         \n\t"
     "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp10]           \n\t"
-    "or         %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
-    "xor        %[ftmp10],  %[ftmp10],          %[ftmp10]           \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
+    "pxor       %[ftmp10],  %[ftmp10],          %[ftmp10]           \n\t"
     "pcmpeqb    %[ftmp0],   %[ftmp0],           %[ftmp10]           \n\t"
 
     "gsldlc1    %[ftmp10],  0x07(%[thresh])                         \n\t"
@@ -108,72 +105,72 @@ void vp8_loop_filter_horizontal_edge_mmi(
     "psubusb    %[ftmp1],   %[ftmp9],           %[ftmp10]           \n\t"
     "psubusb    %[ftmp2],   %[ftmp11],          %[ftmp10]           \n\t"
     "paddb      %[ftmp1],   %[ftmp1],           %[ftmp2]            \n\t"
-    "xor        %[ftmp2],   %[ftmp2],           %[ftmp2]            \n\t"
+    "pxor       %[ftmp2],   %[ftmp2],           %[ftmp2]            \n\t"
     "pcmpeqb    %[ftmp1],   %[ftmp1],           %[ftmp2]            \n\t"
     "pcmpeqb    %[ftmp2],   %[ftmp2],           %[ftmp2]            \n\t"
-    "xor        %[ftmp1],   %[ftmp1],           %[ftmp2]            \n\t"
+    "pxor       %[ftmp1],   %[ftmp1],           %[ftmp2]            \n\t"
 
-    "xor        %[ftmp4],   %[ftmp4],           %[ff_pb_80]         \n\t"
-    "xor        %[ftmp5],   %[ftmp5],           %[ff_pb_80]         \n\t"
-    "xor        %[ftmp6],   %[ftmp6],           %[ff_pb_80]         \n\t"
-    "xor        %[ftmp7],   %[ftmp7],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp4],   %[ftmp4],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp5],   %[ftmp5],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp6],   %[ftmp6],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp7],   %[ftmp7],           %[ff_pb_80]         \n\t"
 
     "psubsb     %[ftmp2],   %[ftmp4],           %[ftmp7]            \n\t"
-    "and        %[ftmp2],   %[ftmp2],           %[ftmp1]            \n\t"
+    "pand       %[ftmp2],   %[ftmp2],           %[ftmp1]            \n\t"
     "psubsb     %[ftmp3],   %[ftmp6],           %[ftmp5]            \n\t"
     "paddsb     %[ftmp2],   %[ftmp2],           %[ftmp3]            \n\t"
     "paddsb     %[ftmp2],   %[ftmp2],           %[ftmp3]            \n\t"
     "paddsb     %[ftmp2],   %[ftmp2],           %[ftmp3]            \n\t"
-    "and        %[ftmp2],   %[ftmp2],           %[ftmp0]            \n\t"
+    "pand       %[ftmp2],   %[ftmp2],           %[ftmp0]            \n\t"
 
     "paddsb     %[ftmp8],   %[ftmp2],           %[ff_pb_03]         \n\t"
     "paddsb     %[ftmp9],   %[ftmp2],           %[ff_pb_04]         \n\t"
 
-    "xor        %[ftmp0],   %[ftmp0],           %[ftmp0]            \n\t"
-    "xor        %[ftmp11],  %[ftmp11],          %[ftmp11]           \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],           %[ftmp0]            \n\t"
+    "pxor       %[ftmp11],  %[ftmp11],          %[ftmp11]           \n\t"
     "punpcklbh  %[ftmp0],   %[ftmp0],           %[ftmp8]            \n\t"
     "punpckhbh  %[ftmp11],  %[ftmp11],          %[ftmp8]            \n\t"
 
-    "li         %[tmp0],    0x0b                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp10]                               \n\t"
+    "dli        %[tmp0],    0x0b                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp10]                               \n\t"
     "psrah      %[ftmp0],   %[ftmp0],           %[ftmp10]           \n\t"
     "psrah      %[ftmp11],  %[ftmp11],          %[ftmp10]           \n\t"
     "packsshb   %[ftmp8],   %[ftmp0],           %[ftmp11]           \n\t"
-    "xor        %[ftmp0],   %[ftmp0],           %[ftmp0]            \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],           %[ftmp0]            \n\t"
     "punpcklbh  %[ftmp0],   %[ftmp0],           %[ftmp9]            \n\t"
     "psrah      %[ftmp0],   %[ftmp0],           %[ftmp10]           \n\t"
-    "xor        %[ftmp11],  %[ftmp11],          %[ftmp11]           \n\t"
+    "pxor       %[ftmp11],  %[ftmp11],          %[ftmp11]           \n\t"
     "punpckhbh  %[ftmp9],   %[ftmp11],          %[ftmp9]            \n\t"
     "psrah      %[ftmp9],   %[ftmp9],           %[ftmp10]           \n\t"
     "paddsh     %[ftmp11],  %[ftmp0],           %[ff_ph_01]         \n\t"
     "packsshb   %[ftmp0],   %[ftmp0],           %[ftmp9]            \n\t"
     "paddsh     %[ftmp9],   %[ftmp9],           %[ff_ph_01]         \n\t"
 
-    "li         %[tmp0],    0x01                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp10]                               \n\t"
+    "dli        %[tmp0],    0x01                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp10]                               \n\t"
     "psrah      %[ftmp11],  %[ftmp11],          %[ftmp10]           \n\t"
     "psrah      %[ftmp9],   %[ftmp9],           %[ftmp10]           \n\t"
     "packsshb   %[ftmp11],  %[ftmp11],          %[ftmp9]            \n\t"
     "pandn      %[ftmp1],   %[ftmp1],           %[ftmp11]           \n\t"
     "paddsb     %[ftmp5],   %[ftmp5],           %[ftmp8]            \n\t"
-    "xor        %[ftmp5],   %[ftmp5],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp5],   %[ftmp5],           %[ff_pb_80]         \n\t"
 
     MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
     "gssdlc1    %[ftmp5],   0x07(%[addr1])                          \n\t"
     "gssdrc1    %[ftmp5],   0x00(%[addr1])                          \n\t"
     MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
     "paddsb     %[ftmp4],   %[ftmp4],           %[ftmp1]            \n\t"
-    "xor        %[ftmp4],   %[ftmp4],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp4],   %[ftmp4],           %[ff_pb_80]         \n\t"
     "gssdlc1    %[ftmp4],   0x07(%[addr1])                          \n\t"
     "gssdrc1    %[ftmp4],   0x00(%[addr1])                          \n\t"
 
     "psubsb     %[ftmp6],   %[ftmp6],           %[ftmp0]            \n\t"
-    "xor        %[ftmp6],   %[ftmp6],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp6],   %[ftmp6],           %[ff_pb_80]         \n\t"
     "gssdlc1    %[ftmp6],   0x07(%[src_ptr])                        \n\t"
     "gssdrc1    %[ftmp6],   0x00(%[src_ptr])                        \n\t"
 
     "psubsb     %[ftmp7],   %[ftmp7],           %[ftmp1]            \n\t"
-    "xor        %[ftmp7],   %[ftmp7],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp7],   %[ftmp7],           %[ff_pb_80]         \n\t"
     "gssdlc1    %[ftmp7],   0x07(%[addr0])                          \n\t"
     "gssdrc1    %[ftmp7],   0x00(%[addr0])                          \n\t"
 
@@ -188,17 +185,18 @@ void vp8_loop_filter_horizontal_edge_mmi(
       [ftmp10]"=&f"(ftmp[10]),            [ftmp11]"=&f"(ftmp[11]),
       [tmp0]"=&r"(tmp[0]),
       [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
-      [src_ptr]"+&r"(src_ptr),          [count]"+&r"(count)
+      [src_ptr]"+&r"(src_ptr),          [count]"+&r"(count),
+      [ff_ph_01]"=&f"(ff_ph_01),        [ff_pb_fe]"=&f"(ff_pb_fe),
+      [ff_pb_80]"=&f"(ff_pb_80),        [ff_pb_04]"=&f"(ff_pb_04),
+      [ff_pb_03]"=&f"(ff_pb_03)
     : [limit]"r"(limit),                [blimit]"r"(blimit),
       [thresh]"r"(thresh),
       [src_pixel_step]"r"((mips_reg)src_pixel_step),
       [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)),
-      [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2)),
-      [ff_ph_01]"f"(ff_ph_01),          [ff_pb_fe]"f"(ff_pb_fe),
-      [ff_pb_80]"f"(ff_pb_80),          [ff_pb_04]"f"(ff_pb_04),
-      [ff_pb_03]"f"(ff_pb_03)
+      [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2))
     : "memory"
   );
+  /* clang-format on */
 }
 
 void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr,
@@ -206,11 +204,23 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr,
                                        const unsigned char *blimit,
                                        const unsigned char *limit,
                                        const unsigned char *thresh, int count) {
-  uint32_t tmp[1];
+  uint64_t tmp[1];
   mips_reg addr[2];
   double ftmp[13];
+  double ff_pb_fe, ff_ph_01, ff_pb_03, ff_pb_04, ff_pb_80;
 
+  /* clang-format off */
   __asm__ volatile (
+    "dli        %[tmp0],    0xfefefefefefefefe                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_fe]                             \n\t"
+    "dli        %[tmp0],    0x0001000100010001                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_01]                             \n\t"
+    "dli        %[tmp0],    0x0303030303030303                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_03]                             \n\t"
+    "dli        %[tmp0],    0x0404040404040404                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_04]                             \n\t"
+    "dli        %[tmp0],    0x8080808080808080                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_80]                             \n\t"
     MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
     MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0])
     MMI_SUBU(%[src_ptr], %[src_ptr], 0x04)
@@ -288,23 +298,23 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr,
     /* abs (q2-q1) */
     "pasubub    %[ftmp7],   %[ftmp11],          %[ftmp10]           \n\t"
     "psubusb    %[ftmp7],   %[ftmp7],           %[ftmp8]            \n\t"
-    "or         %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
     /* ftmp3: abs(q1-q0) */
     "pasubub    %[ftmp3],   %[ftmp10],          %[ftmp9]            \n\t"
     "psubusb    %[ftmp7],   %[ftmp3],           %[ftmp8]            \n\t"
-    "or         %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
     /* ftmp4: abs(p1-p0) */
     "pasubub    %[ftmp4],   %[ftmp5],           %[ftmp6]            \n\t"
     "psubusb    %[ftmp7],   %[ftmp4],           %[ftmp8]            \n\t"
-    "or         %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
     /* abs (p2-p1) */
     "pasubub    %[ftmp7],   %[ftmp2],           %[ftmp5]            \n\t"
     "psubusb    %[ftmp7],   %[ftmp7],           %[ftmp8]            \n\t"
-    "or         %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
     /* abs (p3-p2) */
     "pasubub    %[ftmp7],   %[ftmp1],           %[ftmp2]            \n\t"
     "psubusb    %[ftmp7],   %[ftmp7],           %[ftmp8]            \n\t"
-    "or         %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
 
     "gsldlc1    %[ftmp8],   0x07(%[blimit])                         \n\t"
     "gsldrc1    %[ftmp8],   0x00(%[blimit])                         \n\t"
@@ -314,14 +324,14 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr,
     "paddusb    %[ftmp11],  %[ftmp11],          %[ftmp11]           \n\t"
     /* abs (p1-q1) */
     "pasubub    %[ftmp12],  %[ftmp10],          %[ftmp5]            \n\t"
-    "and        %[ftmp12],  %[ftmp12],          %[ff_pb_fe]         \n\t"
-    "li         %[tmp0],    0x01                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp1]                                \n\t"
+    "pand       %[ftmp12],  %[ftmp12],          %[ff_pb_fe]         \n\t"
+    "dli        %[tmp0],    0x01                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp1]                                \n\t"
     "psrlh      %[ftmp12],  %[ftmp12],          %[ftmp1]            \n\t"
     "paddusb    %[ftmp1],   %[ftmp11],          %[ftmp12]           \n\t"
     "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp8]            \n\t"
-    "or         %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
-    "xor        %[ftmp1],   %[ftmp1],           %[ftmp1]            \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
+    "pxor       %[ftmp1],   %[ftmp1],           %[ftmp1]            \n\t"
     /* ftmp0:mask */
     "pcmpeqb    %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
 
@@ -331,41 +341,41 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr,
     /* ftmp3: abs(q1-q0)  ftmp4: abs(p1-p0) */
     "psubusb    %[ftmp4],   %[ftmp4],           %[ftmp8]            \n\t"
     "psubusb    %[ftmp3],   %[ftmp3],           %[ftmp8]            \n\t"
-    "or         %[ftmp2],   %[ftmp4],           %[ftmp3]            \n\t"
+    "por        %[ftmp2],   %[ftmp4],           %[ftmp3]            \n\t"
     "pcmpeqb    %[ftmp2],   %[ftmp2],           %[ftmp1]            \n\t"
     "pcmpeqb    %[ftmp1],   %[ftmp1],           %[ftmp1]            \n\t"
     /* ftmp1:hev */
-    "xor        %[ftmp1],   %[ftmp2],           %[ftmp1]            \n\t"
+    "pxor       %[ftmp1],   %[ftmp2],           %[ftmp1]            \n\t"
 
-    "xor        %[ftmp10],  %[ftmp10],          %[ff_pb_80]         \n\t"
-    "xor        %[ftmp9],   %[ftmp9],           %[ff_pb_80]         \n\t"
-    "xor        %[ftmp6],   %[ftmp6],           %[ff_pb_80]         \n\t"
-    "xor        %[ftmp5],   %[ftmp5],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp10],  %[ftmp10],          %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp9],   %[ftmp9],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp6],   %[ftmp6],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp5],   %[ftmp5],           %[ff_pb_80]         \n\t"
 
     "psubsb     %[ftmp2],   %[ftmp5],           %[ftmp10]           \n\t"
-    "and        %[ftmp2],   %[ftmp2],           %[ftmp1]            \n\t"
+    "pand       %[ftmp2],   %[ftmp2],           %[ftmp1]            \n\t"
     "psubsb     %[ftmp3],   %[ftmp9],           %[ftmp6]            \n\t"
     "paddsb     %[ftmp2],   %[ftmp2],           %[ftmp3]            \n\t"
     "paddsb     %[ftmp2],   %[ftmp2],           %[ftmp3]            \n\t"
     "paddsb     %[ftmp2],   %[ftmp2],           %[ftmp3]            \n\t"
     /* ftmp2:filter_value */
-    "and        %[ftmp2],   %[ftmp2],           %[ftmp0]            \n\t"
+    "pand       %[ftmp2],   %[ftmp2],           %[ftmp0]            \n\t"
 
     "paddsb     %[ftmp11],  %[ftmp2],           %[ff_pb_04]         \n\t"
     "paddsb     %[ftmp12],  %[ftmp2],           %[ff_pb_03]         \n\t"
 
-    "li         %[tmp0],    0x0b                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp7]                                \n\t"
-    "xor       %[ftmp0],    %[ftmp0],           %[ftmp0]            \n\t"
-    "xor       %[ftmp8],    %[ftmp8],           %[ftmp8]            \n\t"
+    "dli        %[tmp0],    0x0b                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp7]                                \n\t"
+    "pxor      %[ftmp0],    %[ftmp0],           %[ftmp0]            \n\t"
+    "pxor      %[ftmp8],    %[ftmp8],           %[ftmp8]            \n\t"
     "punpcklbh %[ftmp0],    %[ftmp0],           %[ftmp12]           \n\t"
     "punpckhbh %[ftmp8],    %[ftmp8],           %[ftmp12]           \n\t"
     "psrah     %[ftmp0],    %[ftmp0],           %[ftmp7]            \n\t"
     "psrah     %[ftmp8],    %[ftmp8],           %[ftmp7]            \n\t"
     "packsshb  %[ftmp12],   %[ftmp0],           %[ftmp8]            \n\t"
 
-    "xor       %[ftmp0],    %[ftmp0],           %[ftmp0]            \n\t"
-    "xor       %[ftmp8],    %[ftmp8],           %[ftmp8]            \n\t"
+    "pxor      %[ftmp0],    %[ftmp0],           %[ftmp0]            \n\t"
+    "pxor      %[ftmp8],    %[ftmp8],           %[ftmp8]            \n\t"
     "punpcklbh %[ftmp0],    %[ftmp0],           %[ftmp11]           \n\t"
     "punpckhbh %[ftmp8],    %[ftmp8],           %[ftmp11]           \n\t"
     "psrah     %[ftmp0],    %[ftmp0],           %[ftmp7]            \n\t"
@@ -373,22 +383,22 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr,
     "packsshb  %[ftmp11],   %[ftmp0],           %[ftmp8]            \n\t"
 
     "psubsb     %[ftmp9],   %[ftmp9],           %[ftmp11]           \n\t"
-    "xor        %[ftmp9],   %[ftmp9],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp9],   %[ftmp9],           %[ff_pb_80]         \n\t"
     "paddsb     %[ftmp6],   %[ftmp6],           %[ftmp12]           \n\t"
-    "xor        %[ftmp6],   %[ftmp6],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp6],   %[ftmp6],           %[ff_pb_80]         \n\t"
     "paddsh     %[ftmp0],   %[ftmp0],           %[ff_ph_01]         \n\t"
     "paddsh     %[ftmp8],   %[ftmp8],           %[ff_ph_01]         \n\t"
 
-    "li         %[tmp0],    0x01                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp7]                                \n\t"
+    "dli        %[tmp0],    0x01                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp7]                                \n\t"
     "psrah      %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
     "psrah      %[ftmp8],   %[ftmp8],           %[ftmp7]            \n\t"
     "packsshb   %[ftmp2],   %[ftmp0],           %[ftmp8]            \n\t"
     "pandn      %[ftmp2],   %[ftmp1],           %[ftmp2]            \n\t"
     "psubsb     %[ftmp10],  %[ftmp10],          %[ftmp2]            \n\t"
-    "xor        %[ftmp10],  %[ftmp10],          %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp10],  %[ftmp10],          %[ff_pb_80]         \n\t"
     "paddsb     %[ftmp5],   %[ftmp5],           %[ftmp2]            \n\t"
-    "xor        %[ftmp5],   %[ftmp5],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp5],   %[ftmp5],           %[ff_pb_80]         \n\t"
 
     /* ftmp5: *op1 ; ftmp6: *op0 */
     "punpcklbh  %[ftmp2],   %[ftmp5],           %[ftmp6]            \n\t"
@@ -408,7 +418,7 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr,
 
     "li         %[tmp0],    0x20                                    \n\t"
     "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
-    "dsrl       %[ftmp2],   %[ftmp2],           %[ftmp9]            \n\t"
+    "ssrld      %[ftmp2],   %[ftmp2],           %[ftmp9]            \n\t"
     MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
     MMI_SUBU(%[addr1], %[addr0], %[tmp0])
     "gsswlc1    %[ftmp2],   0x05(%[addr1])                          \n\t"
@@ -419,21 +429,21 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr,
     "gsswlc1    %[ftmp6],   0x05(%[addr1])                          \n\t"
     "gsswrc1    %[ftmp6],   0x02(%[addr1])                          \n\t"
 
-    "dsrl       %[ftmp6],   %[ftmp6],           %[ftmp9]            \n\t"
+    "ssrld      %[ftmp6],   %[ftmp6],           %[ftmp9]            \n\t"
     MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
     "gsswlc1    %[ftmp6],   0x05(%[addr1])                          \n\t"
     "gsswrc1    %[ftmp6],   0x02(%[addr1])                          \n\t"
     "gsswlc1    %[ftmp1],   0x05(%[src_ptr])                        \n\t"
     "gsswrc1    %[ftmp1],   0x02(%[src_ptr])                        \n\t"
 
-    "dsrl       %[ftmp1],   %[ftmp1],           %[ftmp9]            \n\t"
+    "ssrld      %[ftmp1],   %[ftmp1],           %[ftmp9]            \n\t"
     "gsswlc1    %[ftmp1],   0x05(%[addr0])                          \n\t"
     "gsswrc1    %[ftmp1],   0x02(%[addr0])                          \n\t"
     MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step])
     "gsswlc1    %[ftmp5],   0x05(%[addr1])                          \n\t"
     "gsswrc1    %[ftmp5],   0x02(%[addr1])                          \n\t"
 
-    "dsrl       %[ftmp5],   %[ftmp5],           %[ftmp9]            \n\t"
+    "ssrld      %[ftmp5],   %[ftmp5],           %[ftmp9]            \n\t"
     MMI_ADDU(%[addr1], %[addr0], %[tmp0])
     "gsswlc1    %[ftmp5],   0x05(%[addr1])                          \n\t"
     "gsswrc1    %[ftmp5],   0x02(%[addr1])                          \n\t"
@@ -450,15 +460,16 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr,
       [ftmp10]"=&f"(ftmp[10]),            [ftmp11]"=&f"(ftmp[11]),
       [ftmp12]"=&f"(ftmp[12]),            [tmp0]"=&r"(tmp[0]),
       [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
-      [src_ptr]"+&r"(src_ptr),          [count]"+&r"(count)
+      [src_ptr]"+&r"(src_ptr),          [count]"+&r"(count),
+      [ff_ph_01]"=&f"(ff_ph_01),        [ff_pb_03]"=&f"(ff_pb_03),
+      [ff_pb_04]"=&f"(ff_pb_04),        [ff_pb_80]"=&f"(ff_pb_80),
+      [ff_pb_fe]"=&f"(ff_pb_fe)
     : [limit]"r"(limit),                [blimit]"r"(blimit),
       [thresh]"r"(thresh),
-      [src_pixel_step]"r"((mips_reg)src_pixel_step),
-      [ff_ph_01]"f"(ff_ph_01),          [ff_pb_03]"f"(ff_pb_03),
-      [ff_pb_04]"f"(ff_pb_04),          [ff_pb_80]"f"(ff_pb_80),
-      [ff_pb_fe]"f"(ff_pb_fe)
+      [src_pixel_step]"r"((mips_reg)src_pixel_step)
     : "memory"
   );
+  /* clang-format on */
 }
 
 /* clang-format off */
@@ -484,10 +495,29 @@ void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr,
 void vp8_mbloop_filter_horizontal_edge_mmi(
     unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
     const unsigned char *limit, const unsigned char *thresh, int count) {
-  uint32_t tmp[1];
+  uint64_t tmp[1];
   double ftmp[13];
+  double ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03, ff_ph_003f, ff_ph_0900,
+      ff_ph_1200, ff_ph_1b00;
 
+  /* clang-format off */
   __asm__ volatile (
+    "dli        %[tmp0],    0xfefefefefefefefe                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_fe]                             \n\t"
+    "dli        %[tmp0],    0x8080808080808080                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_80]                             \n\t"
+    "dli        %[tmp0],    0x0404040404040404                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_04]                             \n\t"
+    "dli        %[tmp0],    0x0303030303030303                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_03]                             \n\t"
+    "dli        %[tmp0],    0x003f003f003f003f                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_003f]                           \n\t"
+    "dli        %[tmp0],    0x0900090009000900                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_0900]                           \n\t"
+    "dli        %[tmp0],    0x1200120012001200                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_1200]                           \n\t"
+    "dli        %[tmp0],    0x1b001b001b001b00                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_1b00]                           \n\t"
     MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
     MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
     "1:                                                             \n\t"
@@ -532,31 +562,31 @@ void vp8_mbloop_filter_horizontal_edge_mmi(
     "psubusb    %[ftmp0],   %[ftmp0],           %[ftmp9]            \n\t"
     "pasubub    %[ftmp1],   %[ftmp3],           %[ftmp4]            \n\t"
     "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp9]            \n\t"
-    "or         %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
     "pasubub    %[ftmp10],  %[ftmp4],           %[ftmp5]            \n\t"
     "psubusb    %[ftmp1],   %[ftmp10],          %[ftmp9]            \n\t"
-    "or         %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
     "pasubub    %[ftmp11],  %[ftmp7],           %[ftmp6]            \n\t"
     "psubusb    %[ftmp1],   %[ftmp11],          %[ftmp9]            \n\t"
-    "or         %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
     "pasubub    %[ftmp1],   %[ftmp8],           %[ftmp7]            \n\t"
     "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp9]            \n\t"
-    "or         %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
     "pasubub    %[ftmp1],   %[ftmp2],           %[ftmp8]            \n\t"
     "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp9]            \n\t"
-    "or         %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
 
     "pasubub    %[ftmp1],   %[ftmp5],           %[ftmp6]            \n\t"
     "paddusb    %[ftmp1],   %[ftmp1],           %[ftmp1]            \n\t"
     "pasubub    %[ftmp2],   %[ftmp4],           %[ftmp7]            \n\t"
-    "and        %[ftmp2],   %[ftmp2],           %[ff_pb_fe]         \n\t"
-    "li         %[tmp0],    0x01                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+    "pand       %[ftmp2],   %[ftmp2],           %[ff_pb_fe]         \n\t"
+    "dli        %[tmp0],    0x01                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                                \n\t"
     "psrlh      %[ftmp2],   %[ftmp2],           %[ftmp9]            \n\t"
     "paddusb    %[ftmp1],   %[ftmp1],           %[ftmp2]            \n\t"
     "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp12]           \n\t"
-    "or         %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
-    "xor        %[ftmp9],   %[ftmp9],           %[ftmp9]            \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
+    "pxor       %[ftmp9],   %[ftmp9],           %[ftmp9]            \n\t"
     /* ftmp0: mask */
     "pcmpeqb    %[ftmp0],   %[ftmp0],           %[ftmp9]            \n\t"
 
@@ -565,27 +595,27 @@ void vp8_mbloop_filter_horizontal_edge_mmi(
     "psubusb    %[ftmp1],   %[ftmp10],          %[ftmp9]            \n\t"
     "psubusb    %[ftmp2],   %[ftmp11],          %[ftmp9]            \n\t"
     "paddb      %[ftmp1],   %[ftmp1],           %[ftmp2]            \n\t"
-    "xor        %[ftmp2],   %[ftmp2],           %[ftmp2]            \n\t"
+    "pxor       %[ftmp2],   %[ftmp2],           %[ftmp2]            \n\t"
     "pcmpeqb    %[ftmp1],   %[ftmp1],           %[ftmp2]            \n\t"
     "pcmpeqb    %[ftmp2],   %[ftmp2],           %[ftmp2]            \n\t"
     /* ftmp1: hev */
-    "xor        %[ftmp1],   %[ftmp1],           %[ftmp2]            \n\t"
+    "pxor       %[ftmp1],   %[ftmp1],           %[ftmp2]            \n\t"
 
-    "xor        %[ftmp4],   %[ftmp4],           %[ff_pb_80]         \n\t"
-    "xor        %[ftmp5],   %[ftmp5],           %[ff_pb_80]         \n\t"
-    "xor        %[ftmp6],   %[ftmp6],           %[ff_pb_80]         \n\t"
-    "xor        %[ftmp7],   %[ftmp7],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp4],   %[ftmp4],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp5],   %[ftmp5],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp6],   %[ftmp6],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp7],   %[ftmp7],           %[ff_pb_80]         \n\t"
     "psubsb     %[ftmp2],   %[ftmp4],           %[ftmp7]            \n\t"
     "psubsb     %[ftmp9],   %[ftmp6],           %[ftmp5]            \n\t"
     "paddsb     %[ftmp2],   %[ftmp2],           %[ftmp9]            \n\t"
     "paddsb     %[ftmp2],   %[ftmp2],           %[ftmp9]            \n\t"
     "paddsb     %[ftmp2],   %[ftmp2],           %[ftmp9]            \n\t"
-    "and        %[ftmp2],   %[ftmp2],           %[ftmp0]            \n\t"
+    "pand       %[ftmp2],   %[ftmp2],           %[ftmp0]            \n\t"
     "pandn      %[ftmp12],  %[ftmp1],           %[ftmp2]            \n\t"
-    "and        %[ftmp2],   %[ftmp2],           %[ftmp1]            \n\t"
+    "pand       %[ftmp2],   %[ftmp2],           %[ftmp1]            \n\t"
 
-    "li         %[tmp0],    0x0b                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+    "dli        %[tmp0],    0x0b                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                                \n\t"
     "paddsb     %[ftmp0],   %[ftmp2],           %[ff_pb_03]         \n\t"
     VP8_MBLOOP_HPSRAB
     "paddsb     %[ftmp5],   %[ftmp5],           %[ftmp0]            \n\t"
@@ -593,15 +623,15 @@ void vp8_mbloop_filter_horizontal_edge_mmi(
     VP8_MBLOOP_HPSRAB
     "psubsb     %[ftmp6],   %[ftmp6],           %[ftmp0]            \n\t"
 
-    "li         %[tmp0],    0x07                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
-    "xor        %[ftmp0],   %[ftmp0],           %[ftmp0]            \n\t"
+    "dli        %[tmp0],    0x07                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                                \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],           %[ftmp0]            \n\t"
 
     VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1b00])
     "psubsb     %[ftmp6],   %[ftmp6],           %[ftmp1]            \n\t"
     "paddsb     %[ftmp5],   %[ftmp5],           %[ftmp1]            \n\t"
-    "xor        %[ftmp6],   %[ftmp6],           %[ff_pb_80]         \n\t"
-    "xor        %[ftmp5],   %[ftmp5],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp6],   %[ftmp6],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp5],   %[ftmp5],           %[ff_pb_80]         \n\t"
     MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
     MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
     "gssdlc1    %[ftmp5],   0x07(%[src_ptr])                        \n\t"
@@ -613,8 +643,8 @@ void vp8_mbloop_filter_horizontal_edge_mmi(
     VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1200])
     "paddsb     %[ftmp4],   %[ftmp4],           %[ftmp1]            \n\t"
     "psubsb     %[ftmp7],   %[ftmp7],           %[ftmp1]            \n\t"
-    "xor        %[ftmp4],   %[ftmp4],           %[ff_pb_80]         \n\t"
-    "xor        %[ftmp7],   %[ftmp7],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp4],   %[ftmp4],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp7],   %[ftmp7],           %[ff_pb_80]         \n\t"
     MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
     "gssdlc1    %[ftmp7],   0x07(%[src_ptr])                        \n\t"
     "gssdrc1    %[ftmp7],   0x00(%[src_ptr])                        \n\t"
@@ -624,12 +654,12 @@ void vp8_mbloop_filter_horizontal_edge_mmi(
     "gssdrc1    %[ftmp4],   0x00(%[src_ptr])                        \n\t"
 
     VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_0900])
-    "xor        %[ftmp3],   %[ftmp3],           %[ff_pb_80]         \n\t"
-    "xor        %[ftmp8],   %[ftmp8],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp3],   %[ftmp3],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp8],   %[ftmp8],           %[ff_pb_80]         \n\t"
     "paddsb     %[ftmp3],   %[ftmp3],           %[ftmp1]            \n\t"
     "psubsb     %[ftmp8],   %[ftmp8],           %[ftmp1]            \n\t"
-    "xor        %[ftmp3],   %[ftmp3],           %[ff_pb_80]         \n\t"
-    "xor        %[ftmp8],   %[ftmp8],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp3],   %[ftmp3],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp8],   %[ftmp8],           %[ff_pb_80]         \n\t"
     MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0])
     "gssdlc1    %[ftmp8],   0x07(%[src_ptr])                        \n\t"
     "gssdrc1    %[ftmp8],   0x00(%[src_ptr])                        \n\t"
@@ -649,21 +679,23 @@ void vp8_mbloop_filter_horizontal_edge_mmi(
       [ftmp8]"=&f"(ftmp[8]),              [ftmp9]"=&f"(ftmp[9]),
       [ftmp10]"=&f"(ftmp[10]),            [ftmp11]"=&f"(ftmp[11]),
       [ftmp12]"=&f"(ftmp[12]),            [tmp0]"=&r"(tmp[0]),
-      [src_ptr]"+&r"(src_ptr),            [count]"+&r"(count)
+      [src_ptr]"+&r"(src_ptr),            [count]"+&r"(count),
+      [ff_pb_fe]"=&f"(ff_pb_fe),          [ff_pb_80]"=&f"(ff_pb_80),
+      [ff_pb_04]"=&f"(ff_pb_04),          [ff_pb_03]"=&f"(ff_pb_03),
+      [ff_ph_0900]"=&f"(ff_ph_0900),      [ff_ph_1b00]"=&f"(ff_ph_1b00),
+      [ff_ph_1200]"=&f"(ff_ph_1200),      [ff_ph_003f]"=&f"(ff_ph_003f)
     : [limit]"r"(limit),                  [blimit]"r"(blimit),
       [thresh]"r"(thresh),
-      [src_pixel_step]"r"((mips_reg)src_pixel_step),
-      [ff_pb_fe]"f"(ff_pb_fe),            [ff_pb_80]"f"(ff_pb_80),
-      [ff_pb_04]"f"(ff_pb_04),            [ff_pb_03]"f"(ff_pb_03),
-      [ff_ph_0900]"f"(ff_ph_0900),        [ff_ph_1b00]"f"(ff_ph_1b00),
-      [ff_ph_1200]"f"(ff_ph_1200),        [ff_ph_003f]"f"(ff_ph_003f)
+      [src_pixel_step]"r"((mips_reg)src_pixel_step)
     : "memory"
   );
+  /* clang-format on */
 }
 
+/* clang-format off */
 #define VP8_MBLOOP_VPSRAB_ADDH                                          \
-  "xor        %[ftmp7],   %[ftmp7],           %[ftmp7]            \n\t" \
-  "xor        %[ftmp8],   %[ftmp8],           %[ftmp8]            \n\t" \
+  "pxor       %[ftmp7],   %[ftmp7],           %[ftmp7]            \n\t" \
+  "pxor       %[ftmp8],   %[ftmp8],           %[ftmp8]            \n\t" \
   "punpcklbh  %[ftmp7],   %[ftmp7],           %[ftmp0]            \n\t" \
   "punpckhbh  %[ftmp8],   %[ftmp8],           %[ftmp0]            \n\t"
 
@@ -673,15 +705,30 @@ void vp8_mbloop_filter_horizontal_edge_mmi(
   "psrah      %[ftmp7],   %[ftmp7],           %[ftmp12]           \n\t" \
   "psrah      %[ftmp8],   %[ftmp8],           %[ftmp12]           \n\t" \
   "packsshb   %[ftmp3],   %[ftmp7],           %[ftmp8]            \n\t"
+/* clang-format on */
 
 void vp8_mbloop_filter_vertical_edge_mmi(
     unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
     const unsigned char *limit, const unsigned char *thresh, int count) {
   mips_reg tmp[1];
-  DECLARE_ALIGNED(8, const uint64_t, srct[1]);
+  DECLARE_ALIGNED(8, const uint64_t, srct[2]);
   double ftmp[14];
+  double ff_ph_003f, ff_ph_0900, ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03;
 
+  /* clang-format off */
   __asm__ volatile (
+    "dli        %[tmp0],    0x003f003f003f003f                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_003f]                           \n\t"
+    "dli        %[tmp0],    0x0900090009000900                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_0900]                           \n\t"
+    "dli        %[tmp0],    0xfefefefefefefefe                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_fe]                             \n\t"
+    "dli        %[tmp0],    0x8080808080808080                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_80]                             \n\t"
+    "dli        %[tmp0],    0x0404040404040404                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_04]                             \n\t"
+    "dli        %[tmp0],    0x0303030303030303                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_03]                             \n\t"
     MMI_SUBU(%[src_ptr], %[src_ptr], 0x04)
 
     "1:                                                             \n\t"
@@ -755,23 +802,23 @@ void vp8_mbloop_filter_vertical_edge_mmi(
     /* abs (q2-q1) */
     "pasubub    %[ftmp7],   %[ftmp11],          %[ftmp10]           \n\t"
     "psubusb    %[ftmp7],   %[ftmp7],           %[ftmp13]           \n\t"
-    "or         %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
     /* ftmp3: abs(q1-q0) */
     "pasubub    %[ftmp3],   %[ftmp10],          %[ftmp9]            \n\t"
     "psubusb    %[ftmp7],   %[ftmp3],           %[ftmp13]           \n\t"
-    "or         %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
     /* ftmp4: abs(p1-p0) */
     "pasubub    %[ftmp4],   %[ftmp5],           %[ftmp6]            \n\t"
     "psubusb    %[ftmp7],   %[ftmp4],           %[ftmp13]           \n\t"
-    "or         %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
     /* abs (p2-p1) */
     "pasubub    %[ftmp7],   %[ftmp2],           %[ftmp5]            \n\t"
     "psubusb    %[ftmp7],   %[ftmp7],           %[ftmp13]           \n\t"
-    "or         %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
     /* abs (p3-p2) */
     "pasubub    %[ftmp7],   %[ftmp1],           %[ftmp2]            \n\t"
     "psubusb    %[ftmp7],   %[ftmp7],           %[ftmp13]           \n\t"
-    "or         %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
 
     "gsldlc1    %[ftmp13],  0x07(%[blimit])                         \n\t"
     "gsldrc1    %[ftmp13],  0x00(%[blimit])                         \n\t"
@@ -782,14 +829,14 @@ void vp8_mbloop_filter_vertical_edge_mmi(
     "paddusb    %[ftmp1],   %[ftmp1],           %[ftmp1]            \n\t"
     /* abs (p1-q1) / 2 */
     "pasubub    %[ftmp12],  %[ftmp10],          %[ftmp5]            \n\t"
-    "and        %[ftmp12],  %[ftmp12],          %[ff_pb_fe]         \n\t"
-    "li         %[tmp0],    0x01                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
+    "pand       %[ftmp12],  %[ftmp12],          %[ff_pb_fe]         \n\t"
+    "dli        %[tmp0],    0x01                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp8]                                \n\t"
     "psrlh      %[ftmp12],  %[ftmp12],          %[ftmp8]            \n\t"
     "paddusb    %[ftmp12],  %[ftmp1],           %[ftmp12]           \n\t"
     "psubusb    %[ftmp12],  %[ftmp12],          %[ftmp13]           \n\t"
-    "or         %[ftmp0],   %[ftmp0],           %[ftmp12]           \n\t"
-    "xor        %[ftmp12],  %[ftmp12],          %[ftmp12]           \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp12]           \n\t"
+    "pxor       %[ftmp12],  %[ftmp12],          %[ftmp12]           \n\t"
     /* ftmp0: mask */
     "pcmpeqb    %[ftmp0],   %[ftmp0],           %[ftmp12]           \n\t"
 
@@ -797,19 +844,19 @@ void vp8_mbloop_filter_vertical_edge_mmi(
     "psubusb    %[ftmp4],   %[ftmp4],           %[ftmp7]            \n\t"
     /* abs(q1-q0) - thresh */
     "psubusb    %[ftmp3],   %[ftmp3],           %[ftmp7]            \n\t"
-    "or         %[ftmp3],   %[ftmp4],           %[ftmp3]            \n\t"
+    "por        %[ftmp3],   %[ftmp4],           %[ftmp3]            \n\t"
     "pcmpeqb    %[ftmp3],   %[ftmp3],           %[ftmp12]           \n\t"
     "pcmpeqb    %[ftmp1],   %[ftmp1],           %[ftmp1]            \n\t"
     /* ftmp1: hev */
-    "xor        %[ftmp1],   %[ftmp3],           %[ftmp1]            \n\t"
+    "pxor       %[ftmp1],   %[ftmp3],           %[ftmp1]            \n\t"
 
     /* ftmp2:ps2, ftmp5:ps1, ftmp6:ps0, ftmp9:qs0, ftmp10:qs1, ftmp11:qs2 */
-    "xor        %[ftmp11],  %[ftmp11],          %[ff_pb_80]         \n\t"
-    "xor        %[ftmp10],  %[ftmp10],          %[ff_pb_80]         \n\t"
-    "xor        %[ftmp9],   %[ftmp9],           %[ff_pb_80]         \n\t"
-    "xor        %[ftmp6],   %[ftmp6],           %[ff_pb_80]         \n\t"
-    "xor        %[ftmp5],   %[ftmp5],           %[ff_pb_80]         \n\t"
-    "xor        %[ftmp2],   %[ftmp2],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp11],  %[ftmp11],          %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp10],  %[ftmp10],          %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp9],   %[ftmp9],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp6],   %[ftmp6],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp5],   %[ftmp5],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp2],   %[ftmp2],           %[ff_pb_80]         \n\t"
 
     "psubsb     %[ftmp3],   %[ftmp5],           %[ftmp10]           \n\t"
     "psubsb     %[ftmp4],   %[ftmp9],           %[ftmp6]            \n\t"
@@ -817,15 +864,15 @@ void vp8_mbloop_filter_vertical_edge_mmi(
     "paddsb     %[ftmp3],   %[ftmp3],           %[ftmp4]            \n\t"
     "paddsb     %[ftmp3],   %[ftmp3],           %[ftmp4]            \n\t"
     /* filter_value &= mask */
-    "and        %[ftmp0],   %[ftmp0],           %[ftmp3]            \n\t"
+    "pand       %[ftmp0],   %[ftmp0],           %[ftmp3]            \n\t"
     /* Filter2 = filter_value & hev */
-    "and        %[ftmp3],   %[ftmp1],           %[ftmp0]            \n\t"
+    "pand       %[ftmp3],   %[ftmp1],           %[ftmp0]            \n\t"
     /* filter_value &= ~hev */
     "pandn      %[ftmp0],   %[ftmp1],           %[ftmp0]            \n\t"
 
     "paddsb     %[ftmp4],   %[ftmp3],           %[ff_pb_04]         \n\t"
-    "li         %[tmp0],    0x0b                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp12]                               \n\t"
+    "dli        %[tmp0],    0x0b                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp12]                               \n\t"
     "punpcklbh  %[ftmp7],   %[ftmp7],           %[ftmp4]            \n\t"
     "punpckhbh  %[ftmp8],   %[ftmp8],           %[ftmp4]            \n\t"
     "psrah      %[ftmp7],   %[ftmp7],           %[ftmp12]           \n\t"
@@ -842,8 +889,8 @@ void vp8_mbloop_filter_vertical_edge_mmi(
     /* ftmp6: ps0 */
     "paddsb     %[ftmp6],   %[ftmp6],           %[ftmp3]            \n\t"
 
-    "li         %[tmp0],    0x07                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp12]                               \n\t"
+    "dli        %[tmp0],    0x07                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp12]                               \n\t"
     VP8_MBLOOP_VPSRAB_ADDH
     "paddh      %[ftmp1],   %[ff_ph_0900],      %[ff_ph_0900]       \n\t"
     "paddh      %[ftmp1],   %[ftmp1],           %[ff_ph_0900]       \n\t"
@@ -852,10 +899,10 @@ void vp8_mbloop_filter_vertical_edge_mmi(
     VP8_MBLOOP_VPSRAB_ADDT
     "psubsb     %[ftmp4],   %[ftmp9],           %[ftmp3]            \n\t"
     /* ftmp9: oq0 */
-    "xor        %[ftmp9],   %[ftmp4],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp9],   %[ftmp4],           %[ff_pb_80]         \n\t"
     "paddsb     %[ftmp4],   %[ftmp6],           %[ftmp3]            \n\t"
     /* ftmp6: op0 */
-    "xor        %[ftmp6],   %[ftmp4],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp6],   %[ftmp4],           %[ff_pb_80]         \n\t"
 
     VP8_MBLOOP_VPSRAB_ADDH
     "paddh      %[ftmp1],   %[ff_ph_0900],      %[ff_ph_0900]       \n\t"
@@ -864,10 +911,10 @@ void vp8_mbloop_filter_vertical_edge_mmi(
     VP8_MBLOOP_VPSRAB_ADDT
     "psubsb     %[ftmp4],   %[ftmp10],          %[ftmp3]            \n\t"
     /* ftmp10: oq1 */
-    "xor        %[ftmp10],   %[ftmp4],          %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp10],   %[ftmp4],          %[ff_pb_80]         \n\t"
     "paddsb     %[ftmp4],   %[ftmp5],           %[ftmp3]            \n\t"
     /* ftmp5: op1 */
-    "xor        %[ftmp5],   %[ftmp4],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp5],   %[ftmp4],           %[ff_pb_80]         \n\t"
 
     VP8_MBLOOP_VPSRAB_ADDH
     "pmulhh     %[ftmp7],   %[ftmp7],           %[ff_ph_0900]       \n\t"
@@ -875,10 +922,10 @@ void vp8_mbloop_filter_vertical_edge_mmi(
     VP8_MBLOOP_VPSRAB_ADDT
     "psubsb     %[ftmp4],   %[ftmp11],          %[ftmp3]            \n\t"
     /* ftmp11: oq2 */
-    "xor        %[ftmp11],  %[ftmp4],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp11],  %[ftmp4],           %[ff_pb_80]         \n\t"
     "paddsb     %[ftmp4],   %[ftmp2],           %[ftmp3]            \n\t"
     /* ftmp2: op2 */
-    "xor        %[ftmp2],   %[ftmp4],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp2],   %[ftmp4],           %[ff_pb_80]         \n\t"
 
     "ldc1       %[ftmp12],  0x00(%[srct])                           \n\t"
     "ldc1       %[ftmp8],   0x08(%[srct])                           \n\t"
@@ -948,41 +995,58 @@ void vp8_mbloop_filter_vertical_edge_mmi(
       [ftmp10]"=&f"(ftmp[10]),            [ftmp11]"=&f"(ftmp[11]),
       [ftmp12]"=&f"(ftmp[12]),            [ftmp13]"=&f"(ftmp[13]),
       [tmp0]"=&r"(tmp[0]),                [src_ptr]"+&r"(src_ptr),
-      [count]"+&r"(count)
+      [count]"+&r"(count),
+      [ff_ph_003f]"=&f"(ff_ph_003f),    [ff_ph_0900]"=&f"(ff_ph_0900),
+      [ff_pb_03]"=&f"(ff_pb_03),        [ff_pb_04]"=&f"(ff_pb_04),
+      [ff_pb_80]"=&f"(ff_pb_80),        [ff_pb_fe]"=&f"(ff_pb_fe)
     : [limit]"r"(limit),                [blimit]"r"(blimit),
       [srct]"r"(srct),                  [thresh]"r"(thresh),
-      [src_pixel_step]"r"((mips_reg)src_pixel_step),
-      [ff_ph_003f]"f"(ff_ph_003f),      [ff_ph_0900]"f"(ff_ph_0900),
-      [ff_pb_03]"f"(ff_pb_03),          [ff_pb_04]"f"(ff_pb_04),
-      [ff_pb_80]"f"(ff_pb_80),          [ff_pb_fe]"f"(ff_pb_fe)
+      [src_pixel_step]"r"((mips_reg)src_pixel_step)
     : "memory"
   );
+  /* clang-format on */
 }
 
+/* clang-format off */
 #define VP8_SIMPLE_HPSRAB                                               \
   "psllh      %[ftmp0],   %[ftmp5],           %[ftmp8]            \n\t" \
   "psrah      %[ftmp0],   %[ftmp0],           %[ftmp9]            \n\t" \
   "psrlh      %[ftmp0],   %[ftmp0],           %[ftmp8]            \n\t" \
   "psrah      %[ftmp1],   %[ftmp5],           %[ftmp10]           \n\t" \
   "psllh      %[ftmp1],   %[ftmp1],           %[ftmp8]            \n\t" \
-  "or         %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
+  "por        %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
+/* clang-format on */
 
 void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr,
                                                 int src_pixel_step,
                                                 const unsigned char *blimit) {
-  uint32_t tmp[1], count = 2;
+  uint64_t tmp[1], count = 2;
   mips_reg addr[2];
   double ftmp[12];
+  double ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_01;
 
+  /* clang-format off */
   __asm__ volatile (
-    "li         %[tmp0],    0x08                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
-    "li         %[tmp0],    0x03                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
-    "li         %[tmp0],    0x0b                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp10]                               \n\t"
-    "li         %[tmp0],    0x01                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp11]                               \n\t"
+    "dli        %[tmp0],    0x0b                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp10]                               \n\t"
+    "dli        %[tmp0],    0x01                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp11]                               \n\t"
+    "dli        %[tmp0],    0x08                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp8]                                \n\t"
+    "dli        %[tmp0],    0x03                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                                \n\t"
+    "dli        %[tmp0],    0x0b                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp10]                               \n\t"
+    "dli        %[tmp0],    0x01                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp11]                               \n\t"
+    "dli        %[tmp0],    0xfefefefefefefefe                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_fe]                             \n\t"
+    "dli        %[tmp0],    0x8080808080808080                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_80]                             \n\t"
+    "dli        %[tmp0],    0x0404040404040404                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_04]                             \n\t"
+    "dli        %[tmp0],    0x0101010101010101                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_01]                             \n\t"
 
     "1:                                                             \n\t"
     "gsldlc1    %[ftmp3],   0x07(%[blimit])                         \n\t"
@@ -996,7 +1060,7 @@ void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr,
     "gsldlc1    %[ftmp7],   0x07(%[addr0])                          \n\t"
     "gsldrc1    %[ftmp7],   0x00(%[addr0])                          \n\t"
     "pasubub    %[ftmp1],   %[ftmp7],           %[ftmp2]            \n\t"
-    "and        %[ftmp1],   %[ftmp1],           %[ff_pb_fe]         \n\t"
+    "pand       %[ftmp1],   %[ftmp1],           %[ff_pb_fe]         \n\t"
     "psrlh      %[ftmp1],   %[ftmp1],           %[ftmp11]           \n\t"
 
     MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
@@ -1008,31 +1072,31 @@ void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr,
     "paddusb    %[ftmp5],   %[ftmp5],           %[ftmp5]            \n\t"
     "paddusb    %[ftmp5],   %[ftmp5],           %[ftmp1]            \n\t"
     "psubusb    %[ftmp5],   %[ftmp5],           %[ftmp3]            \n\t"
-    "xor        %[ftmp3],   %[ftmp3],           %[ftmp3]            \n\t"
+    "pxor       %[ftmp3],   %[ftmp3],           %[ftmp3]            \n\t"
     "pcmpeqb    %[ftmp5],   %[ftmp5],           %[ftmp3]            \n\t"
 
-    "xor        %[ftmp2],   %[ftmp2],           %[ff_pb_80]         \n\t"
-    "xor        %[ftmp7],   %[ftmp7],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp2],   %[ftmp2],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp7],   %[ftmp7],           %[ff_pb_80]         \n\t"
     "psubsb     %[ftmp2],   %[ftmp2],           %[ftmp7]            \n\t"
-    "xor        %[ftmp6],   %[ftmp6],           %[ff_pb_80]         \n\t"
-    "xor        %[ftmp3],   %[ftmp0],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp6],   %[ftmp6],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp3],   %[ftmp0],           %[ff_pb_80]         \n\t"
     "psubsb     %[ftmp0],   %[ftmp3],           %[ftmp6]            \n\t"
     "paddsb     %[ftmp2],   %[ftmp2],           %[ftmp0]            \n\t"
     "paddsb     %[ftmp2],   %[ftmp2],           %[ftmp0]            \n\t"
     "paddsb     %[ftmp2],   %[ftmp2],           %[ftmp0]            \n\t"
-    "and        %[ftmp5],   %[ftmp5],           %[ftmp2]            \n\t"
+    "pand       %[ftmp5],   %[ftmp5],           %[ftmp2]            \n\t"
 
     "paddsb     %[ftmp5],   %[ftmp5],           %[ff_pb_04]         \n\t"
     VP8_SIMPLE_HPSRAB
     "psubsb     %[ftmp3],   %[ftmp3],           %[ftmp0]            \n\t"
-    "xor        %[ftmp3],   %[ftmp3],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp3],   %[ftmp3],           %[ff_pb_80]         \n\t"
     "gssdlc1    %[ftmp3],   0x07(%[src_ptr])                        \n\t"
     "gssdrc1    %[ftmp3],   0x00(%[src_ptr])                        \n\t"
 
     "psubsb     %[ftmp5],   %[ftmp5],           %[ff_pb_01]         \n\t"
     VP8_SIMPLE_HPSRAB
     "paddsb     %[ftmp6],   %[ftmp6],           %[ftmp0]            \n\t"
-    "xor        %[ftmp6],   %[ftmp6],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp6],   %[ftmp6],           %[ff_pb_80]         \n\t"
     MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
     "gssdlc1    %[ftmp6],   0x07(%[addr1])                          \n\t"
     "gssdrc1    %[ftmp6],   0x00(%[addr1])                          \n\t"
@@ -1048,30 +1112,43 @@ void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr,
       [ftmp10]"=&f"(ftmp[10]),            [ftmp11]"=&f"(ftmp[11]),
       [tmp0]"=&r"(tmp[0]),
       [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
-      [src_ptr]"+&r"(src_ptr),          [count]"+&r"(count)
+      [src_ptr]"+&r"(src_ptr),          [count]"+&r"(count),
+      [ff_pb_fe]"=&f"(ff_pb_fe),        [ff_pb_80]"=&f"(ff_pb_80),
+      [ff_pb_04]"=&f"(ff_pb_04),        [ff_pb_01]"=&f"(ff_pb_01)
     : [blimit]"r"(blimit),
       [src_pixel_step]"r"((mips_reg)src_pixel_step),
-      [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)),
-      [ff_pb_fe]"f"(ff_pb_fe),          [ff_pb_80]"f"(ff_pb_80),
-      [ff_pb_04]"f"(ff_pb_04),          [ff_pb_01]"f"(ff_pb_01)
+      [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1))
     : "memory"
   );
+  /* clang-format on */
 }
 
 void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr,
                                               int src_pixel_step,
                                               const unsigned char *blimit) {
-  uint32_t tmp[1], count = 2;
+  uint64_t tmp[1], count = 2;
   mips_reg addr[2];
-  DECLARE_ALIGNED(8, const uint64_t, srct[1]);
-  double ftmp[12];
+  DECLARE_ALIGNED(8, const uint64_t, srct[2]);
+  double ftmp[12], ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_01;
 
+  /* clang-format off */
   __asm__ volatile (
-    "li         %[tmp0],    0x08                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
-    "li         %[tmp0],    0x20                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp10]                               \n\t"
-
+    "dli        %[tmp0],    0x08                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp8]                                \n\t"
+    "dli        %[tmp0],    0x20                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp10]                               \n\t"
+    "dli        %[tmp0],    0x08                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp8]                                \n\t"
+    "dli        %[tmp0],    0x20                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp10]                               \n\t"
+    "dli        %[tmp0],    0xfefefefefefefefe                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_fe]                             \n\t"
+    "dli        %[tmp0],    0x8080808080808080                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_80]                             \n\t"
+    "dli        %[tmp0],    0x0404040404040404                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_04]                             \n\t"
+    "dli        %[tmp0],    0x0101010101010101                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_01]                             \n\t"
     MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step_x4])
     MMI_SUBU(%[src_ptr], %[src_ptr], 0x02)
 
@@ -1118,10 +1195,10 @@ void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr,
     "punpckhwd  %[ftmp3],   %[ftmp2],           %[ftmp5]            \n\t"
     "punpcklwd  %[ftmp2],   %[ftmp2],           %[ftmp5]            \n\t"
 
-    "li         %[tmp0],    0x01                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+    "dli        %[tmp0],    0x01                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                                \n\t"
     "pasubub    %[ftmp6],   %[ftmp3],           %[ftmp0]            \n\t"
-    "and        %[ftmp6],   %[ftmp6],           %[ff_pb_fe]         \n\t"
+    "pand       %[ftmp6],   %[ftmp6],           %[ff_pb_fe]         \n\t"
     "psrlh      %[ftmp6],   %[ftmp6],           %[ftmp9]            \n\t"
     "pasubub    %[ftmp5],   %[ftmp1],           %[ftmp2]            \n\t"
     "paddusb    %[ftmp5],   %[ftmp5],           %[ftmp5]            \n\t"
@@ -1130,53 +1207,53 @@ void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr,
     "gsldlc1    %[ftmp7],   0x07(%[blimit])                         \n\t"
     "gsldrc1    %[ftmp7],   0x00(%[blimit])                         \n\t"
     "psubusb    %[ftmp5],   %[ftmp5],           %[ftmp7]            \n\t"
-    "xor        %[ftmp7],   %[ftmp7],           %[ftmp7]            \n\t"
+    "pxor       %[ftmp7],   %[ftmp7],           %[ftmp7]            \n\t"
     "pcmpeqb    %[ftmp5],   %[ftmp5],           %[ftmp7]            \n\t"
 
     "sdc1       %[ftmp0],   0x00(%[srct])                           \n\t"
     "sdc1       %[ftmp3],   0x08(%[srct])                           \n\t"
 
-    "xor        %[ftmp0],   %[ftmp0],           %[ff_pb_80]         \n\t"
-    "xor        %[ftmp3],   %[ftmp3],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp3],   %[ftmp3],           %[ff_pb_80]         \n\t"
     "psubsb     %[ftmp0],   %[ftmp0],           %[ftmp3]            \n\t"
 
-    "xor        %[ftmp6],   %[ftmp1],           %[ff_pb_80]         \n\t"
-    "xor        %[ftmp3],   %[ftmp2],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp6],   %[ftmp1],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp3],   %[ftmp2],           %[ff_pb_80]         \n\t"
     "psubsb     %[ftmp7],   %[ftmp3],           %[ftmp6]            \n\t"
     "paddsb     %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
     "paddsb     %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
     "paddsb     %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
-    "and        %[ftmp5],   %[ftmp5],           %[ftmp0]            \n\t"
+    "pand       %[ftmp5],   %[ftmp5],           %[ftmp0]            \n\t"
     "paddsb     %[ftmp5],   %[ftmp5],           %[ff_pb_04]         \n\t"
 
-    "li         %[tmp0],    0x03                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+    "dli        %[tmp0],    0x03                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                                \n\t"
     "psllh      %[ftmp0],   %[ftmp5],           %[ftmp8]            \n\t"
     "psrah      %[ftmp0],   %[ftmp0],           %[ftmp9]            \n\t"
     "psrlh      %[ftmp0],   %[ftmp0],           %[ftmp8]            \n\t"
 
-    "li         %[tmp0],    0x0b                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+    "dli        %[tmp0],    0x0b                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                                \n\t"
     "psrah      %[ftmp7],   %[ftmp5],           %[ftmp9]            \n\t"
     "psllh      %[ftmp7],   %[ftmp7],           %[ftmp8]            \n\t"
-    "or         %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
     "psubsb     %[ftmp3],   %[ftmp3],           %[ftmp0]            \n\t"
-    "xor        %[ftmp3],   %[ftmp3],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp3],   %[ftmp3],           %[ff_pb_80]         \n\t"
     "psubsb     %[ftmp5],   %[ftmp5],           %[ff_pb_01]         \n\t"
 
-    "li         %[tmp0],    0x03                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+    "dli        %[tmp0],    0x03                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                                \n\t"
     "psllh      %[ftmp0],   %[ftmp5],           %[ftmp8]            \n\t"
     "psrah      %[ftmp0],   %[ftmp0],           %[ftmp9]            \n\t"
     "psrlh      %[ftmp0],   %[ftmp0],           %[ftmp8]            \n\t"
 
-    "li         %[tmp0],    0x0b                                    \n\t"
-    "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+    "dli        %[tmp0],    0x0b                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                                \n\t"
     "psrah      %[ftmp5],   %[ftmp5],           %[ftmp9]            \n\t"
     "psllh      %[ftmp5],   %[ftmp5],           %[ftmp8]            \n\t"
-    "or         %[ftmp0],   %[ftmp0],           %[ftmp5]            \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp5]            \n\t"
     "paddsb     %[ftmp6],   %[ftmp6],           %[ftmp0]            \n\t"
-    "xor        %[ftmp6],   %[ftmp6],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp6],   %[ftmp6],           %[ff_pb_80]         \n\t"
 
     "ldc1       %[ftmp0],   0x00(%[srct])                           \n\t"
     "ldc1       %[ftmp4],   0x08(%[srct])                           \n\t"
@@ -1195,7 +1272,7 @@ void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr,
     "punpckhhw  %[ftmp5],   %[ftmp1],           %[ftmp3]            \n\t"
     "punpcklhw  %[ftmp1],   %[ftmp1],           %[ftmp3]            \n\t"
 
-    "dsrl       %[ftmp0],   %[ftmp0],           %[ftmp10]           \n\t"
+    "ssrld      %[ftmp0],   %[ftmp0],           %[ftmp10]           \n\t"
     MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4])
     "gsswlc1    %[ftmp0],   0x03(%[addr1])                          \n\t"
     "gsswrc1    %[ftmp0],   0x00(%[addr1])                          \n\t"
@@ -1203,7 +1280,7 @@ void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr,
     "gsswlc1    %[ftmp6],   0x03(%[addr1])                          \n\t"
     "gsswrc1    %[ftmp6],   0x00(%[addr1])                          \n\t"
 
-    "dsrl       %[ftmp6],   %[ftmp6],           %[ftmp10]           \n\t"
+    "ssrld      %[ftmp6],   %[ftmp6],           %[ftmp10]           \n\t"
     "gsswlc1    %[ftmp1],   0x03(%[src_ptr])                        \n\t"
     "gsswrc1    %[ftmp1],   0x00(%[src_ptr])                        \n\t"
 
@@ -1215,11 +1292,11 @@ void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr,
     "gsswlc1    %[ftmp5],   0x03(%[addr1])                          \n\t"
     "gsswrc1    %[ftmp5],   0x00(%[addr1])                          \n\t"
 
-    "dsrl       %[ftmp1],   %[ftmp1],           %[ftmp10]           \n\t"
+    "ssrld      %[ftmp1],   %[ftmp1],           %[ftmp10]           \n\t"
     "gsswlc1    %[ftmp1],   0x03(%[addr0])                          \n\t"
     "gsswrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
 
-    "dsrl       %[ftmp5],   %[ftmp5],           %[ftmp10]           \n\t"
+    "ssrld      %[ftmp5],   %[ftmp5],           %[ftmp10]           \n\t"
     MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2])
     "gsswlc1    %[ftmp5],   0x03(%[addr1])                          \n\t"
     "gsswrc1    %[ftmp5],   0x00(%[addr1])                          \n\t"
@@ -1235,16 +1312,17 @@ void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr,
       [ftmp10]"=&f"(ftmp[10]),            [ftmp11]"=&f"(ftmp[11]),
       [tmp0]"=&r"(tmp[0]),
       [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
-      [src_ptr]"+&r"(src_ptr),          [count]"+&r"(count)
+      [src_ptr]"+&r"(src_ptr),          [count]"+&r"(count),
+      [ff_pb_fe]"=&f"(ff_pb_fe),        [ff_pb_80]"=&f"(ff_pb_80),
+      [ff_pb_04]"=&f"(ff_pb_04),        [ff_pb_01]"=&f"(ff_pb_01)
     : [blimit]"r"(blimit),              [srct]"r"(srct),
       [src_pixel_step]"r"((mips_reg)src_pixel_step),
       [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)),
       [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2)),
-      [src_pixel_step_x8]"r"((mips_reg)(src_pixel_step<<3)),
-      [ff_pb_fe]"f"(ff_pb_fe),          [ff_pb_80]"f"(ff_pb_80),
-      [ff_pb_04]"f"(ff_pb_04),          [ff_pb_01]"f"(ff_pb_01)
+      [src_pixel_step_x8]"r"((mips_reg)(src_pixel_step<<3))
     : "memory"
   );
+  /* clang-format on */
 }
 
 /* Horizontal MB filtering */
diff --git a/vp8/common/mips/mmi/sixtap_filter_mmi.c b/vp8/common/mips/mmi/sixtap_filter_mmi.c
index 77d665d456..b85f73fdff 100644
--- a/vp8/common/mips/mmi/sixtap_filter_mmi.c
+++ b/vp8/common/mips/mmi/sixtap_filter_mmi.c
@@ -70,9 +70,8 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr,
                                              unsigned int output_height,
                                              unsigned int output_width,
                                              const int16_t *vp8_filter) {
-  uint32_t tmp[1];
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
-
+  uint64_t tmp[1];
+  double ff_ph_40;
 #if _MIPS_SIM == _ABIO32
   register double fzero asm("$f0");
   register double ftmp0 asm("$f2");
@@ -103,18 +102,21 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr,
   register double ftmp11 asm("$f12");
 #endif  // _MIPS_SIM == _ABIO32
 
+  /* clang-format off */
   __asm__ volatile (
+    "dli        %[tmp0],        0x0040004000400040                    \n\t"
+    "dmtc1      %[tmp0],        %[ff_ph_40]                           \n\t"
     "ldc1       %[ftmp0],       0x00(%[vp8_filter])                   \n\t"
     "ldc1       %[ftmp1],       0x10(%[vp8_filter])                   \n\t"
     "ldc1       %[ftmp2],       0x20(%[vp8_filter])                   \n\t"
     "ldc1       %[ftmp3],       0x30(%[vp8_filter])                   \n\t"
     "ldc1       %[ftmp4],       0x40(%[vp8_filter])                   \n\t"
     "ldc1       %[ftmp5],       0x50(%[vp8_filter])                   \n\t"
-    "xor        %[fzero],       %[fzero],           %[fzero]          \n\t"
-    "li         %[tmp0],        0x07                                  \n\t"
-    "mtc1       %[tmp0],        %[ftmp7]                              \n\t"
-    "li         %[tmp0],        0x08                                  \n\t"
-    "mtc1       %[tmp0],        %[ftmp11]                             \n\t"
+    "pxor       %[fzero],       %[fzero],           %[fzero]          \n\t"
+    "dli        %[tmp0],        0x07                                  \n\t"
+    "dmtc1      %[tmp0],        %[ftmp7]                              \n\t"
+    "dli        %[tmp0],        0x08                                  \n\t"
+    "dmtc1      %[tmp0],        %[ftmp11]                             \n\t"
 
     "1:                                                               \n\t"
     "gsldlc1    %[ftmp9],       0x05(%[src_ptr])                      \n\t"
@@ -137,12 +139,12 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr,
     "pmullh     %[ftmp6],       %[ftmp6],          %[ftmp5]           \n\t"
     "paddsh     %[ftmp8],       %[ftmp8],          %[ftmp6]           \n\t"
 
-    "dsrl       %[ftmp10],      %[ftmp10],         %[ftmp11]          \n\t"
+    "ssrld      %[ftmp10],      %[ftmp10],         %[ftmp11]          \n\t"
     "punpcklbh  %[ftmp6],       %[ftmp10],         %[fzero]           \n\t"
     "pmullh     %[ftmp6],       %[ftmp6],          %[ftmp2]           \n\t"
     "paddsh     %[ftmp8],       %[ftmp8],          %[ftmp6]           \n\t"
 
-    "dsrl       %[ftmp10],      %[ftmp10],         %[ftmp11]          \n\t"
+    "ssrld      %[ftmp10],      %[ftmp10],         %[ftmp11]          \n\t"
     "punpcklbh  %[ftmp6],       %[ftmp10],         %[fzero]           \n\t"
     "pmullh     %[ftmp6],       %[ftmp6],          %[ftmp3]           \n\t"
     "paddsh     %[ftmp8],       %[ftmp8],          %[ftmp6]           \n\t"
@@ -166,21 +168,22 @@ static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr,
       [ftmp9]"=&f"(ftmp9),              [ftmp10]"=&f"(ftmp10),
       [ftmp11]"=&f"(ftmp11),            [tmp0]"=&r"(tmp[0]),
       [output_ptr]"+&r"(output_ptr),    [output_height]"+&r"(output_height),
-      [src_ptr]"+&r"(src_ptr)
+      [src_ptr]"+&r"(src_ptr),          [ff_ph_40]"=&f"(ff_ph_40)
     : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line),
-      [vp8_filter]"r"(vp8_filter),      [output_width]"r"(output_width),
-      [ff_ph_40]"f"(ff_ph_40)
+      [vp8_filter]"r"(vp8_filter),      [output_width]"r"(output_width)
     : "memory"
     );
+  /* clang-format on */
 }
 
 /* Horizontal filter:  pixel_step is always W */
 static INLINE void vp8_filter_block1dc_v6_mmi(
     uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height,
     int output_pitch, unsigned int pixels_per_line, const int16_t *vp8_filter) {
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
-  uint32_t tmp[1];
+  double ff_ph_40;
+  uint64_t tmp[1];
   mips_reg addr[1];
+
 #if _MIPS_SIM == _ABIO32
   register double fzero asm("$f0");
   register double ftmp0 asm("$f2");
@@ -215,16 +218,19 @@ static INLINE void vp8_filter_block1dc_v6_mmi(
   register double ftmp13 asm("$f14");
 #endif  // _MIPS_SIM == _ABIO32
 
+  /* clang-format off */
   __asm__ volatile (
+    "dli        %[tmp0],      0x0040004000400040                      \n\t"
+    "dmtc1      %[tmp0],      %[ff_ph_40]                             \n\t"
     "ldc1       %[ftmp0],     0x00(%[vp8_filter])                     \n\t"
     "ldc1       %[ftmp1],     0x10(%[vp8_filter])                     \n\t"
     "ldc1       %[ftmp2],     0x20(%[vp8_filter])                     \n\t"
     "ldc1       %[ftmp3],     0x30(%[vp8_filter])                     \n\t"
     "ldc1       %[ftmp4],     0x40(%[vp8_filter])                     \n\t"
     "ldc1       %[ftmp5],     0x50(%[vp8_filter])                     \n\t"
-    "xor        %[fzero],     %[fzero],        %[fzero]               \n\t"
-    "li         %[tmp0],      0x07                                    \n\t"
-    "mtc1       %[tmp0],      %[ftmp13]                               \n\t"
+    "pxor       %[fzero],     %[fzero],        %[fzero]               \n\t"
+    "dli        %[tmp0],      0x07                                    \n\t"
+    "dmtc1      %[tmp0],      %[ftmp13]                               \n\t"
 
     /* In order to make full use of memory load delay slot,
      * Operation of memory loading and calculating has been rearranged.
@@ -285,15 +291,16 @@ static INLINE void vp8_filter_block1dc_v6_mmi(
       [ftmp11]"=&f"(ftmp11),            [ftmp12]"=&f"(ftmp12),
       [ftmp13]"=&f"(ftmp13),            [tmp0]"=&r"(tmp[0]),
       [addr0]"=&r"(addr[0]),            [src_ptr]"+&r"(src_ptr),
-      [output_ptr]"+&r"(output_ptr),    [output_height]"+&r"(output_height)
+      [output_ptr]"+&r"(output_ptr),    [output_height]"+&r"(output_height),
+      [ff_ph_40]"=&f"(ff_ph_40)
     : [pixels_per_line]"r"((mips_reg)pixels_per_line),
       [pixels_per_line_x2]"r"((mips_reg)(pixels_per_line<<1)),
       [pixels_per_line_x4]"r"((mips_reg)(pixels_per_line<<2)),
       [vp8_filter]"r"(vp8_filter),
-      [output_pitch]"r"((mips_reg)output_pitch),
-      [ff_ph_40]"f"(ff_ph_40)
+      [output_pitch]"r"((mips_reg)output_pitch)
     : "memory"
     );
+  /* clang-format on */
 }
 
 /* When xoffset == 0, vp8_filter= {0,0,128,0,0,0},
@@ -313,8 +320,9 @@ static INLINE void vp8_filter_block1d_h6_filter0_mmi(
   register double ftmp1 asm("$f2");
 #endif  // _MIPS_SIM == _ABIO32
 
+  /* clang-format off */
   __asm__ volatile (
-    "xor        %[fzero],       %[fzero],           %[fzero]          \n\t"
+    "pxor       %[fzero],       %[fzero],           %[fzero]          \n\t"
 
     "1:                                                               \n\t"
     "gsldlc1    %[ftmp0],       0x07(%[src_ptr])                      \n\t"
@@ -335,6 +343,7 @@ static INLINE void vp8_filter_block1d_h6_filter0_mmi(
       [output_width]"r"(output_width)
     : "memory"
     );
+  /* clang-format on */
 }
 
 static INLINE void vp8_filter_block1dc_v6_filter0_mmi(
@@ -350,8 +359,9 @@ static INLINE void vp8_filter_block1dc_v6_filter0_mmi(
   register double ftmp1 asm("$f2");
 #endif  // _MIPS_SIM == _ABIO32
 
+  /* clang-format on */
   __asm__ volatile (
-    "xor        %[fzero],     %[fzero],        %[fzero]               \n\t"
+    "pxor       %[fzero],     %[fzero],        %[fzero]               \n\t"
 
     "1:                                                               \n\t"
     "gsldlc1    %[ftmp0],     0x07(%[src_ptr])                        \n\t"
@@ -371,6 +381,7 @@ static INLINE void vp8_filter_block1dc_v6_filter0_mmi(
       [output_pitch]"r"((mips_reg)output_pitch)
     : "memory"
     );
+  /* clang-format on */
 }
 
 #define sixtapNxM(n, m)                                                        \
diff --git a/vp8/common/mips/msa/sixtap_filter_msa.c b/vp8/common/mips/msa/sixtap_filter_msa.c
index b0affcff01..3a1bb7cd57 100644
--- a/vp8/common/mips/msa/sixtap_filter_msa.c
+++ b/vp8/common/mips/msa/sixtap_filter_msa.c
@@ -35,101 +35,134 @@ static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = {
 #define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_h0, filt_h1, \
                         filt_h2)                                           \
   ({                                                                       \
-    v16i8 vec0_m, vec1_m, vec2_m;                                          \
-    v8i16 hz_out_m;                                                        \
+    v16i8 _6tap_vec0_m, _6tap_vec1_m, _6tap_vec2_m;                        \
+    v8i16 _6tap_out_m;                                                     \
                                                                            \
     VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2,    \
-               vec0_m, vec1_m, vec2_m);                                    \
-    hz_out_m =                                                             \
-        DPADD_SH3_SH(vec0_m, vec1_m, vec2_m, filt_h0, filt_h1, filt_h2);   \
+               _6tap_vec0_m, _6tap_vec1_m, _6tap_vec2_m);                  \
+    _6tap_out_m = DPADD_SH3_SH(_6tap_vec0_m, _6tap_vec1_m, _6tap_vec2_m,   \
+                               filt_h0, filt_h1, filt_h2);                 \
                                                                            \
-    hz_out_m = __msa_srari_h(hz_out_m, VP8_FILTER_SHIFT);                  \
-    hz_out_m = __msa_sat_s_h(hz_out_m, 7);                                 \
+    _6tap_out_m = __msa_srari_h(_6tap_out_m, VP8_FILTER_SHIFT);            \
+    _6tap_out_m = __msa_sat_s_h(_6tap_out_m, 7);                           \
                                                                            \
-    hz_out_m;                                                              \
+    _6tap_out_m;                                                           \
   })
 
 #define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,   \
                                    mask2, filt0, filt1, filt2, out0, out1) \
   {                                                                        \
-    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m;                  \
+    v16i8 _6tap_4wid_vec0_m, _6tap_4wid_vec1_m, _6tap_4wid_vec2_m,         \
+        _6tap_4wid_vec3_m, _6tap_4wid_vec4_m, _6tap_4wid_vec5_m;           \
                                                                            \
-    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);      \
-    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);                 \
-    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);      \
-    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);                \
-    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);      \
-    DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1);                \
+    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, _6tap_4wid_vec0_m,    \
+               _6tap_4wid_vec1_m);                                         \
+    DOTP_SB2_SH(_6tap_4wid_vec0_m, _6tap_4wid_vec1_m, filt0, filt0, out0,  \
+                out1);                                                     \
+    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, _6tap_4wid_vec2_m,    \
+               _6tap_4wid_vec3_m);                                         \
+    DPADD_SB2_SH(_6tap_4wid_vec2_m, _6tap_4wid_vec3_m, filt1, filt1, out0, \
+                 out1);                                                    \
+    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, _6tap_4wid_vec4_m,    \
+               _6tap_4wid_vec5_m);                                         \
+    DPADD_SB2_SH(_6tap_4wid_vec4_m, _6tap_4wid_vec5_m, filt2, filt2, out0, \
+                 out1);                                                    \
   }
 
-#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,     \
-                                   mask2, filt0, filt1, filt2, out0, out1,   \
-                                   out2, out3)                               \
-  {                                                                          \
-    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;    \
-                                                                             \
-    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);        \
-    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);        \
-    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,  \
-                out0, out1, out2, out3);                                     \
-    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);        \
-    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);        \
-    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec4_m, vec5_m);        \
-    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec6_m, vec7_m);        \
-    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \
-                 out0, out1, out2, out3);                                    \
-    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt2, filt2, filt2, filt2, \
-                 out0, out1, out2, out3);                                    \
+#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,    \
+                                   mask2, filt0, filt1, filt2, out0, out1,  \
+                                   out2, out3)                              \
+  {                                                                         \
+    v16i8 _6tap_8wid_vec0_m, _6tap_8wid_vec1_m, _6tap_8wid_vec2_m,          \
+        _6tap_8wid_vec3_m, _6tap_8wid_vec4_m, _6tap_8wid_vec5_m,            \
+        _6tap_8wid_vec6_m, _6tap_8wid_vec7_m;                               \
+                                                                            \
+    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, _6tap_8wid_vec0_m,     \
+               _6tap_8wid_vec1_m);                                          \
+    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, _6tap_8wid_vec2_m,     \
+               _6tap_8wid_vec3_m);                                          \
+    DOTP_SB4_SH(_6tap_8wid_vec0_m, _6tap_8wid_vec1_m, _6tap_8wid_vec2_m,    \
+                _6tap_8wid_vec3_m, filt0, filt0, filt0, filt0, out0, out1,  \
+                out2, out3);                                                \
+    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, _6tap_8wid_vec0_m,     \
+               _6tap_8wid_vec1_m);                                          \
+    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, _6tap_8wid_vec2_m,     \
+               _6tap_8wid_vec3_m);                                          \
+    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, _6tap_8wid_vec4_m,     \
+               _6tap_8wid_vec5_m);                                          \
+    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, _6tap_8wid_vec6_m,     \
+               _6tap_8wid_vec7_m);                                          \
+    DPADD_SB4_SH(_6tap_8wid_vec0_m, _6tap_8wid_vec1_m, _6tap_8wid_vec2_m,   \
+                 _6tap_8wid_vec3_m, filt1, filt1, filt1, filt1, out0, out1, \
+                 out2, out3);                                               \
+    DPADD_SB4_SH(_6tap_8wid_vec4_m, _6tap_8wid_vec5_m, _6tap_8wid_vec6_m,   \
+                 _6tap_8wid_vec7_m, filt2, filt2, filt2, filt2, out0, out1, \
+                 out2, out3);                                               \
   }
 
-#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)        \
-  ({                                                         \
-    v8i16 tmp0;                                              \
-                                                             \
-    tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0);        \
-    tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)vec1, (v16i8)filt1); \
-                                                             \
-    tmp0;                                                    \
-  })
-
-#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1)   \
+#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)                 \
   ({                                                                  \
-    v16i8 vec0_m, vec1_m;                                             \
-    v8i16 hz_out_m;                                                   \
-                                                                      \
-    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0_m, vec1_m); \
-    hz_out_m = FILT_4TAP_DPADD_S_H(vec0_m, vec1_m, filt_h0, filt_h1); \
+    v8i16 _4tap_dpadd_tmp0;                                           \
                                                                       \
-    hz_out_m = __msa_srari_h(hz_out_m, VP8_FILTER_SHIFT);             \
-    hz_out_m = __msa_sat_s_h(hz_out_m, 7);                            \
+    _4tap_dpadd_tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0);     \
+    _4tap_dpadd_tmp0 =                                                \
+        __msa_dpadd_s_h(_4tap_dpadd_tmp0, (v16i8)vec1, (v16i8)filt1); \
                                                                       \
-    hz_out_m;                                                         \
+    _4tap_dpadd_tmp0;                                                 \
   })
 
-#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \
-                                   filt0, filt1, out0, out1)             \
-  {                                                                      \
-    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                \
-                                                                         \
-    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);    \
-    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);               \
-    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);    \
-    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);              \
+#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1)        \
+  ({                                                                       \
+    v16i8 _4tap_vec0_m, _4tap_vec1_m;                                      \
+    v8i16 _4tap_out_m;                                                     \
+                                                                           \
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, _4tap_vec0_m,         \
+               _4tap_vec1_m);                                              \
+    _4tap_out_m =                                                          \
+        FILT_4TAP_DPADD_S_H(_4tap_vec0_m, _4tap_vec1_m, filt_h0, filt_h1); \
+                                                                           \
+    _4tap_out_m = __msa_srari_h(_4tap_out_m, VP8_FILTER_SHIFT);            \
+    _4tap_out_m = __msa_sat_s_h(_4tap_out_m, 7);                           \
+                                                                           \
+    _4tap_out_m;                                                           \
+  })
+
+#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,   \
+                                   filt0, filt1, out0, out1)               \
+  {                                                                        \
+    v16i8 _4tap_4wid_vec0_m, _4tap_4wid_vec1_m, _4tap_4wid_vec2_m,         \
+        _4tap_4wid_vec3_m;                                                 \
+                                                                           \
+    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, _4tap_4wid_vec0_m,    \
+               _4tap_4wid_vec1_m);                                         \
+    DOTP_SB2_SH(_4tap_4wid_vec0_m, _4tap_4wid_vec1_m, filt0, filt0, out0,  \
+                out1);                                                     \
+    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, _4tap_4wid_vec2_m,    \
+               _4tap_4wid_vec3_m);                                         \
+    DPADD_SB2_SH(_4tap_4wid_vec2_m, _4tap_4wid_vec3_m, filt1, filt1, out0, \
+                 out1);                                                    \
   }
 
-#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,     \
-                                   filt0, filt1, out0, out1, out2, out3)     \
-  {                                                                          \
-    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                    \
-                                                                             \
-    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);        \
-    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);        \
-    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,  \
-                out0, out1, out2, out3);                                     \
-    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);        \
-    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);        \
-    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \
-                 out0, out1, out2, out3);                                    \
+#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,    \
+                                   filt0, filt1, out0, out1, out2, out3)    \
+  {                                                                         \
+    v16i8 _4tap_8wid_vec0_m, _4tap_8wid_vec1_m, _4tap_8wid_vec2_m,          \
+        _4tap_8wid_vec3_m;                                                  \
+                                                                            \
+    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, _4tap_8wid_vec0_m,     \
+               _4tap_8wid_vec1_m);                                          \
+    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, _4tap_8wid_vec2_m,     \
+               _4tap_8wid_vec3_m);                                          \
+    DOTP_SB4_SH(_4tap_8wid_vec0_m, _4tap_8wid_vec1_m, _4tap_8wid_vec2_m,    \
+                _4tap_8wid_vec3_m, filt0, filt0, filt0, filt0, out0, out1,  \
+                out2, out3);                                                \
+    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, _4tap_8wid_vec0_m,     \
+               _4tap_8wid_vec1_m);                                          \
+    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, _4tap_8wid_vec2_m,     \
+               _4tap_8wid_vec3_m);                                          \
+    DPADD_SB4_SH(_4tap_8wid_vec0_m, _4tap_8wid_vec1_m, _4tap_8wid_vec2_m,   \
+                 _4tap_8wid_vec3_m, filt1, filt1, filt1, filt1, out0, out1, \
+                 out2, out3);                                               \
   }
 
 static void common_hz_6t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
diff --git a/vp8/common/mips/msa/vp8_macros_msa.h b/vp8/common/mips/msa/vp8_macros_msa.h
index 14f83799ff..cc85b9a1f7 100644
--- a/vp8/common/mips/msa/vp8_macros_msa.h
+++ b/vp8/common/mips/msa/vp8_macros_msa.h
@@ -40,158 +40,160 @@
 #define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
 
 #if (__mips_isa_rev >= 6)
-#define LW(psrc)                                     \
-  ({                                                 \
-    const uint8_t *psrc_m = (const uint8_t *)(psrc); \
-    uint32_t val_m;                                  \
-                                                     \
-    asm volatile("lw  %[val_m],  %[psrc_m]  \n\t"    \
-                                                     \
-                 : [val_m] "=r"(val_m)               \
-                 : [psrc_m] "m"(*psrc_m));           \
-                                                     \
-    val_m;                                           \
+#define LW(psrc)                                        \
+  ({                                                    \
+    const uint8_t *lw_psrc_m = (const uint8_t *)(psrc); \
+    uint32_t lw_val_m;                                  \
+                                                        \
+    asm volatile("lw  %[lw_val_m],  %[lw_psrc_m]  \n\t" \
+                                                        \
+                 : [lw_val_m] "=r"(lw_val_m)            \
+                 : [lw_psrc_m] "m"(*lw_psrc_m));        \
+                                                        \
+    lw_val_m;                                           \
   })
 
 #if (__mips == 64)
-#define LD(psrc)                                     \
-  ({                                                 \
-    const uint8_t *psrc_m = (const uint8_t *)(psrc); \
-    uint64_t val_m = 0;                              \
-                                                     \
-    asm volatile("ld  %[val_m],  %[psrc_m]  \n\t"    \
-                                                     \
-                 : [val_m] "=r"(val_m)               \
-                 : [psrc_m] "m"(*psrc_m));           \
-                                                     \
-    val_m;                                           \
+#define LD(psrc)                                        \
+  ({                                                    \
+    const uint8_t *ld_psrc_m = (const uint8_t *)(psrc); \
+    uint64_t ld_val_m = 0;                              \
+                                                        \
+    asm volatile("ld  %[ld_val_m],  %[ld_psrc_m]  \n\t" \
+                                                        \
+                 : [ld_val_m] "=r"(ld_val_m)            \
+                 : [ld_psrc_m] "m"(*ld_psrc_m));        \
+                                                        \
+    ld_val_m;                                           \
   })
 #else  // !(__mips == 64)
-#define LD(psrc)                                            \
-  ({                                                        \
-    const uint8_t *psrc_m = (const uint8_t *)(psrc);        \
-    uint32_t val0_m, val1_m;                                \
-    uint64_t val_m = 0;                                     \
-                                                            \
-    val0_m = LW(psrc_m);                                    \
-    val1_m = LW(psrc_m + 4);                                \
-                                                            \
-    val_m = (uint64_t)(val1_m);                             \
-    val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
-    val_m = (uint64_t)(val_m | (uint64_t)val0_m);           \
-                                                            \
-    val_m;                                                  \
+#define LD(psrc)                                                  \
+  ({                                                              \
+    const uint8_t *ld_psrc_m = (const uint8_t *)(psrc);           \
+    uint32_t ld_val0_m, ld_val1_m;                                \
+    uint64_t ld_val_m = 0;                                        \
+                                                                  \
+    ld_val0_m = LW(ld_psrc_m);                                    \
+    ld_val1_m = LW(ld_psrc_m + 4);                                \
+                                                                  \
+    ld_val_m = (uint64_t)(ld_val1_m);                             \
+    ld_val_m = (uint64_t)((ld_val_m << 32) & 0xFFFFFFFF00000000); \
+    ld_val_m = (uint64_t)(ld_val_m | (uint64_t)ld_val0_m);        \
+                                                                  \
+    ld_val_m;                                                     \
   })
 #endif  // (__mips == 64)
 
-#define SH(val, pdst)                             \
-  {                                               \
-    uint8_t *pdst_m = (uint8_t *)(pdst);          \
-    const uint16_t val_m = (val);                 \
-                                                  \
-    asm volatile("sh  %[val_m],  %[pdst_m]  \n\t" \
-                                                  \
-                 : [pdst_m] "=m"(*pdst_m)         \
-                 : [val_m] "r"(val_m));           \
+#define SH(val, pdst)                                   \
+  {                                                     \
+    uint8_t *sh_pdst_m = (uint8_t *)(pdst);             \
+    const uint16_t sh_val_m = (val);                    \
+                                                        \
+    asm volatile("sh  %[sh_val_m],  %[sh_pdst_m]  \n\t" \
+                                                        \
+                 : [sh_pdst_m] "=m"(*sh_pdst_m)         \
+                 : [sh_val_m] "r"(sh_val_m));           \
   }
 
-#define SW(val, pdst)                             \
-  {                                               \
-    uint8_t *pdst_m = (uint8_t *)(pdst);          \
-    const uint32_t val_m = (val);                 \
-                                                  \
-    asm volatile("sw  %[val_m],  %[pdst_m]  \n\t" \
-                                                  \
-                 : [pdst_m] "=m"(*pdst_m)         \
-                 : [val_m] "r"(val_m));           \
+#define SW(val, pdst)                                   \
+  {                                                     \
+    uint8_t *sw_pdst_m = (uint8_t *)(pdst);             \
+    const uint32_t sw_val_m = (val);                    \
+                                                        \
+    asm volatile("sw  %[sw_val_m],  %[sw_pdst_m]  \n\t" \
+                                                        \
+                 : [sw_pdst_m] "=m"(*sw_pdst_m)         \
+                 : [sw_val_m] "r"(sw_val_m));           \
   }
 
-#define SD(val, pdst)                             \
-  {                                               \
-    uint8_t *pdst_m = (uint8_t *)(pdst);          \
-    const uint64_t val_m = (val);                 \
-                                                  \
-    asm volatile("sd  %[val_m],  %[pdst_m]  \n\t" \
-                                                  \
-                 : [pdst_m] "=m"(*pdst_m)         \
-                 : [val_m] "r"(val_m));           \
+#define SD(val, pdst)                                   \
+  {                                                     \
+    uint8_t *sd_pdst_m = (uint8_t *)(pdst);             \
+    const uint64_t sd_val_m = (val);                    \
+                                                        \
+    asm volatile("sd  %[sd_val_m],  %[sd_pdst_m]  \n\t" \
+                                                        \
+                 : [sd_pdst_m] "=m"(*sd_pdst_m)         \
+                 : [sd_val_m] "r"(sd_val_m));           \
   }
 #else  // !(__mips_isa_rev >= 6)
-#define LW(psrc)                                     \
-  ({                                                 \
-    const uint8_t *psrc_m = (const uint8_t *)(psrc); \
-    uint32_t val_m;                                  \
-                                                     \
-    asm volatile("ulw  %[val_m],  %[psrc_m]  \n\t"   \
-                                                     \
-                 : [val_m] "=r"(val_m)               \
-                 : [psrc_m] "m"(*psrc_m));           \
-                                                     \
-    val_m;                                           \
+#define LW(psrc)                                        \
+  ({                                                    \
+    const uint8_t *lw_psrc_m = (const uint8_t *)(psrc); \
+    uint32_t lw_val_m;                                  \
+                                                        \
+    asm volatile(                                       \
+        "lwr %[lw_val_m], 0(%[lw_psrc_m]) \n\t"         \
+        "lwl %[lw_val_m], 3(%[lw_psrc_m]) \n\t"         \
+        : [lw_val_m] "=&r"(lw_val_m)                    \
+        : [lw_psrc_m] "r"(lw_psrc_m));                  \
+                                                        \
+    lw_val_m;                                           \
   })
 
 #if (__mips == 64)
-#define LD(psrc)                                     \
-  ({                                                 \
-    const uint8_t *psrc_m = (const uint8_t *)(psrc); \
-    uint64_t val_m = 0;                              \
-                                                     \
-    asm volatile("uld  %[val_m],  %[psrc_m]  \n\t"   \
-                                                     \
-                 : [val_m] "=r"(val_m)               \
-                 : [psrc_m] "m"(*psrc_m));           \
-                                                     \
-    val_m;                                           \
+#define LD(psrc)                                        \
+  ({                                                    \
+    const uint8_t *ld_psrc_m = (const uint8_t *)(psrc); \
+    uint64_t ld_val_m = 0;                              \
+                                                        \
+    asm volatile(                                       \
+        "ldr %[ld_val_m], 0(%[ld_psrc_m]) \n\t"         \
+        "ldl %[ld_val_m], 7(%[ld_psrc_m]) \n\t"         \
+        : [ld_val_m] "=&r"(ld_val_m)                    \
+        : [ld_psrc_m] "r"(ld_psrc_m));                  \
+                                                        \
+    ld_val_m;                                           \
   })
 #else  // !(__mips == 64)
-#define LD(psrc)                                            \
-  ({                                                        \
-    const uint8_t *psrc_m1 = (const uint8_t *)(psrc);       \
-    uint32_t val0_m, val1_m;                                \
-    uint64_t val_m = 0;                                     \
-                                                            \
-    val0_m = LW(psrc_m1);                                   \
-    val1_m = LW(psrc_m1 + 4);                               \
-                                                            \
-    val_m = (uint64_t)(val1_m);                             \
-    val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
-    val_m = (uint64_t)(val_m | (uint64_t)val0_m);           \
-                                                            \
-    val_m;                                                  \
+#define LD(psrc)                                                  \
+  ({                                                              \
+    const uint8_t *ld_psrc_m1 = (const uint8_t *)(psrc);          \
+    uint32_t ld_val0_m, ld_val1_m;                                \
+    uint64_t ld_val_m = 0;                                        \
+                                                                  \
+    ld_val0_m = LW(ld_psrc_m1);                                   \
+    ld_val1_m = LW(ld_psrc_m1 + 4);                               \
+                                                                  \
+    ld_val_m = (uint64_t)(ld_val1_m);                             \
+    ld_val_m = (uint64_t)((ld_val_m << 32) & 0xFFFFFFFF00000000); \
+    ld_val_m = (uint64_t)(ld_val_m | (uint64_t)ld_val0_m);        \
+                                                                  \
+    ld_val_m;                                                     \
   })
 #endif  // (__mips == 64)
-#define SH(val, pdst)                              \
-  {                                                \
-    uint8_t *pdst_m = (uint8_t *)(pdst);           \
-    const uint16_t val_m = (val);                  \
-                                                   \
-    asm volatile("ush  %[val_m],  %[pdst_m]  \n\t" \
-                                                   \
-                 : [pdst_m] "=m"(*pdst_m)          \
-                 : [val_m] "r"(val_m));            \
+#define SH(val, pdst)                                    \
+  {                                                      \
+    uint8_t *sh_pdst_m = (uint8_t *)(pdst);              \
+    const uint16_t sh_val_m = (val);                     \
+                                                         \
+    asm volatile("ush  %[sh_val_m],  %[sh_pdst_m]  \n\t" \
+                                                         \
+                 : [sh_pdst_m] "=m"(*sh_pdst_m)          \
+                 : [sh_val_m] "r"(sh_val_m));            \
   }
 
-#define SW(val, pdst)                              \
-  {                                                \
-    uint8_t *pdst_m = (uint8_t *)(pdst);           \
-    const uint32_t val_m = (val);                  \
-                                                   \
-    asm volatile("usw  %[val_m],  %[pdst_m]  \n\t" \
-                                                   \
-                 : [pdst_m] "=m"(*pdst_m)          \
-                 : [val_m] "r"(val_m));            \
+#define SW(val, pdst)                                    \
+  {                                                      \
+    uint8_t *sw_pdst_m = (uint8_t *)(pdst);              \
+    const uint32_t sw_val_m = (val);                     \
+                                                         \
+    asm volatile("usw  %[sw_val_m],  %[sw_pdst_m]  \n\t" \
+                                                         \
+                 : [sw_pdst_m] "=m"(*sw_pdst_m)          \
+                 : [sw_val_m] "r"(sw_val_m));            \
   }
 
-#define SD(val, pdst)                                        \
-  {                                                          \
-    uint8_t *pdst_m1 = (uint8_t *)(pdst);                    \
-    uint32_t val0_m, val1_m;                                 \
-                                                             \
-    val0_m = (uint32_t)((val)&0x00000000FFFFFFFF);           \
-    val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
-                                                             \
-    SW(val0_m, pdst_m1);                                     \
-    SW(val1_m, pdst_m1 + 4);                                 \
+#define SD(val, pdst)                                           \
+  {                                                             \
+    uint8_t *sd_pdst_m1 = (uint8_t *)(pdst);                    \
+    uint32_t sd_val0_m, sd_val1_m;                              \
+                                                                \
+    sd_val0_m = (uint32_t)((val)&0x00000000FFFFFFFF);           \
+    sd_val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
+                                                                \
+    SW(sd_val0_m, sd_pdst_m1);                                  \
+    SW(sd_val1_m, sd_pdst_m1 + 4);                              \
   }
 #endif  // (__mips_isa_rev >= 6)
 
diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h
index 05c72df3fa..1b70ea5dba 100644
--- a/vp8/common/onyx.h
+++ b/vp8/common/onyx.h
@@ -26,13 +26,6 @@ struct VP8_COMP;
 
 /* Create/destroy static data structures. */
 
-typedef enum {
-  NORMAL = 0,
-  FOURFIVE = 1,
-  THREEFIVE = 2,
-  ONETWO = 3
-} VPX_SCALING;
-
 typedef enum {
   USAGE_LOCAL_FILE_PLAYBACK = 0x0,
   USAGE_STREAM_FROM_SERVER = 0x1,
@@ -58,19 +51,19 @@ typedef enum {
 #include <assert.h>
 static INLINE void Scale2Ratio(int mode, int *hr, int *hs) {
   switch (mode) {
-    case NORMAL:
+    case VP8E_NORMAL:
       *hr = 1;
       *hs = 1;
       break;
-    case FOURFIVE:
+    case VP8E_FOURFIVE:
       *hr = 4;
       *hs = 5;
       break;
-    case THREEFIVE:
+    case VP8E_THREEFIVE:
       *hr = 3;
       *hs = 5;
       break;
-    case ONETWO:
+    case VP8E_ONETWO:
       *hr = 1;
       *hs = 2;
       break;
@@ -90,7 +83,14 @@ typedef struct {
   int Width;
   int Height;
   struct vpx_rational timebase;
-  unsigned int target_bandwidth; /* kilobits per second */
+  /* In either kilobits per second or bits per second, depending on which
+   * copy of oxcf this is in.
+   * - ctx->oxcf.target_bandwidth is in kilobits per second. See
+   *   set_vp8e_config().
+   * - ctx->cpi->oxcf.target_bandwidth in is bits per second. See
+   *   vp8_change_config().
+   */
+  unsigned int target_bandwidth;
 
   /* Parameter used for applying denoiser.
    * For temporal denoiser: noise_sensitivity = 0 means off,
@@ -221,6 +221,7 @@ typedef struct {
 
   /* Temporal scaling parameters */
   unsigned int number_of_layers;
+  /* kilobits per second */
   unsigned int target_bitrate[VPX_TS_MAX_PERIODICITY];
   unsigned int rate_decimator[VPX_TS_MAX_PERIODICITY];
   unsigned int periodicity;
@@ -243,11 +244,11 @@ typedef struct {
 
 void vp8_initialize();
 
-struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf);
+struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf);
 void vp8_remove_compressor(struct VP8_COMP **comp);
 
 void vp8_init_config(struct VP8_COMP *onyx, VP8_CONFIG *oxcf);
-void vp8_change_config(struct VP8_COMP *cpi, VP8_CONFIG *oxcf);
+void vp8_change_config(struct VP8_COMP *cpi, const VP8_CONFIG *oxcf);
 
 int vp8_receive_raw_frame(struct VP8_COMP *cpi, unsigned int frame_flags,
                           YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
@@ -273,8 +274,8 @@ int vp8_set_roimap(struct VP8_COMP *cpi, unsigned char *map, unsigned int rows,
                    unsigned int threshold[4]);
 int vp8_set_active_map(struct VP8_COMP *cpi, unsigned char *map,
                        unsigned int rows, unsigned int cols);
-int vp8_set_internal_size(struct VP8_COMP *cpi, VPX_SCALING horiz_mode,
-                          VPX_SCALING vert_mode);
+int vp8_set_internal_size(struct VP8_COMP *cpi, VPX_SCALING_MODE horiz_mode,
+                          VPX_SCALING_MODE vert_mode);
 int vp8_get_quantizer(struct VP8_COMP *cpi);
 
 #ifdef __cplusplus
diff --git a/vp8/common/onyxd.h b/vp8/common/onyxd.h
index e4e81aaac5..217a598de7 100644
--- a/vp8/common/onyxd.h
+++ b/vp8/common/onyxd.h
@@ -41,9 +41,8 @@ void vp8dx_set_setting(struct VP8D_COMP *comp, VP8D_SETTING oxst, int x);
 
 int vp8dx_get_setting(struct VP8D_COMP *comp, VP8D_SETTING oxst);
 
-int vp8dx_receive_compressed_data(struct VP8D_COMP *pbi, int64_t time_stamp);
+int vp8dx_receive_compressed_data(struct VP8D_COMP *pbi);
 int vp8dx_get_raw_frame(struct VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd,
-                        int64_t *time_stamp, int64_t *time_end_stamp,
                         vp8_ppflags_t *flags);
 int vp8dx_references_buffer(struct VP8Common *oci, int ref_frame);
 
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index 8452b5e854..12b474d939 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -38,25 +38,25 @@ ()
 specialize qw/vp8_dequant_idct_add mmx neon dspr2 msa mmi/;
 
 add_proto qw/void vp8_dequant_idct_add_y_block/, "short *q, short *dq, unsigned char *dst, int stride, char *eobs";
-specialize qw/vp8_dequant_idct_add_y_block sse2 neon dspr2 msa mmi/;
+specialize qw/vp8_dequant_idct_add_y_block sse2 neon dspr2 msa mmi lsx/;
 
 add_proto qw/void vp8_dequant_idct_add_uv_block/, "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs";
-specialize qw/vp8_dequant_idct_add_uv_block sse2 neon dspr2 msa mmi/;
+specialize qw/vp8_dequant_idct_add_uv_block sse2 neon dspr2 msa mmi lsx/;
 
 #
 # Loopfilter
 #
 add_proto qw/void vp8_loop_filter_mbv/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_mbv sse2 neon dspr2 msa mmi/;
+specialize qw/vp8_loop_filter_mbv sse2 neon dspr2 msa mmi lsx/;
 
 add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_bv sse2 neon dspr2 msa mmi/;
+specialize qw/vp8_loop_filter_bv sse2 neon dspr2 msa mmi lsx/;
 
 add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_mbh sse2 neon dspr2 msa mmi/;
+specialize qw/vp8_loop_filter_mbh sse2 neon dspr2 msa mmi lsx/;
 
 add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_bh sse2 neon dspr2 msa mmi/;
+specialize qw/vp8_loop_filter_bh sse2 neon dspr2 msa mmi lsx/;
 
 
 add_proto qw/void vp8_loop_filter_simple_mbv/, "unsigned char *y_ptr, int y_stride, const unsigned char *blimit";
@@ -108,7 +108,7 @@ ()
 
 #idct1_scalar_add
 add_proto qw/void vp8_dc_only_idct_add/, "short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride";
-specialize qw/vp8_dc_only_idct_add mmx neon dspr2 msa mmi/;
+specialize qw/vp8_dc_only_idct_add mmx neon dspr2 msa mmi lsx/;
 
 #
 # RECON
@@ -127,12 +127,6 @@ ()
 #
 if (vpx_config("CONFIG_POSTPROC") eq "yes") {
 
-    add_proto qw/void vp8_blend_mb_inner/, "unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride";
-
-    add_proto qw/void vp8_blend_mb_outer/, "unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride";
-
-    add_proto qw/void vp8_blend_b/, "unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride";
-
     add_proto qw/void vp8_filter_by_weight16x16/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight";
     specialize qw/vp8_filter_by_weight16x16 sse2 msa/;
 
@@ -146,16 +140,16 @@ ()
 # Subpixel
 #
 add_proto qw/void vp8_sixtap_predict16x16/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch";
-specialize qw/vp8_sixtap_predict16x16 sse2 ssse3 neon dspr2 msa mmi/;
+specialize qw/vp8_sixtap_predict16x16 sse2 ssse3 neon dspr2 msa mmi lsx/;
 
 add_proto qw/void vp8_sixtap_predict8x8/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch";
-specialize qw/vp8_sixtap_predict8x8 sse2 ssse3 neon dspr2 msa mmi/;
+specialize qw/vp8_sixtap_predict8x8 sse2 ssse3 neon dspr2 msa mmi lsx/;
 
 add_proto qw/void vp8_sixtap_predict8x4/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch";
 specialize qw/vp8_sixtap_predict8x4 sse2 ssse3 neon dspr2 msa mmi/;
 
 add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch";
-specialize qw/vp8_sixtap_predict4x4 mmx ssse3 neon dspr2 msa mmi/;
+specialize qw/vp8_sixtap_predict4x4 mmx ssse3 neon dspr2 msa mmi lsx/;
 
 add_proto qw/void vp8_bilinear_predict16x16/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch";
 specialize qw/vp8_bilinear_predict16x16 sse2 ssse3 neon msa/;
@@ -184,10 +178,10 @@ ()
 # Forward DCT
 #
 add_proto qw/void vp8_short_fdct4x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_fdct4x4 sse2 neon msa mmi/;
+specialize qw/vp8_short_fdct4x4 sse2 neon msa mmi lsx/;
 
 add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_fdct8x4 sse2 neon msa mmi/;
+specialize qw/vp8_short_fdct8x4 sse2 neon msa mmi lsx/;
 
 add_proto qw/void vp8_short_walsh4x4/, "short *input, short *output, int pitch";
 specialize qw/vp8_short_walsh4x4 sse2 neon msa mmi/;
@@ -196,7 +190,7 @@ ()
 # Quantizer
 #
 add_proto qw/void vp8_regular_quantize_b/, "struct block *, struct blockd *";
-specialize qw/vp8_regular_quantize_b sse2 sse4_1 msa mmi/;
+specialize qw/vp8_regular_quantize_b sse2 sse4_1 msa mmi lsx/;
 
 add_proto qw/void vp8_fast_quantize_b/, "struct block *, struct blockd *";
 specialize qw/vp8_fast_quantize_b sse2 ssse3 neon msa mmi/;
@@ -205,10 +199,10 @@ ()
 # Block subtraction
 #
 add_proto qw/int vp8_block_error/, "short *coeff, short *dqcoeff";
-specialize qw/vp8_block_error sse2 msa/;
+specialize qw/vp8_block_error sse2 msa lsx/;
 
 add_proto qw/int vp8_mbblock_error/, "struct macroblock *mb, int dc";
-specialize qw/vp8_mbblock_error sse2 msa/;
+specialize qw/vp8_mbblock_error sse2 msa lsx/;
 
 add_proto qw/int vp8_mbuverror/, "struct macroblock *mb";
 specialize qw/vp8_mbuverror sse2 msa/;
@@ -216,20 +210,16 @@ ()
 #
 # Motion search
 #
-add_proto qw/int vp8_full_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv";
-specialize qw/vp8_full_search_sad sse3 sse4_1/;
-$vp8_full_search_sad_sse3=vp8_full_search_sadx3;
-$vp8_full_search_sad_sse4_1=vp8_full_search_sadx8;
-
 add_proto qw/int vp8_refining_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv";
 specialize qw/vp8_refining_search_sad sse2 msa/;
 $vp8_refining_search_sad_sse2=vp8_refining_search_sadx4;
 $vp8_refining_search_sad_msa=vp8_refining_search_sadx4;
 
 add_proto qw/int vp8_diamond_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv";
-specialize qw/vp8_diamond_search_sad sse2 msa/;
+specialize qw/vp8_diamond_search_sad sse2 msa lsx/;
 $vp8_diamond_search_sad_sse2=vp8_diamond_search_sadx4;
 $vp8_diamond_search_sad_msa=vp8_diamond_search_sadx4;
+$vp8_diamond_search_sad_lsx=vp8_diamond_search_sadx4;
 
 #
 # Alt-ref Noise Reduction (ARNR)
diff --git a/vp8/common/threading.h b/vp8/common/threading.h
index f921369386..1cfb9fec51 100644
--- a/vp8/common/threading.h
+++ b/vp8/common/threading.h
@@ -171,17 +171,20 @@ static inline int sem_destroy(sem_t *sem) {
 #define sem_wait(sem) (semaphore_wait(*sem))
 #define sem_post(sem) semaphore_signal(*sem)
 #define sem_destroy(sem) semaphore_destroy(mach_task_self(), *sem)
-#define thread_sleep(nms)
-/* { struct timespec ts;ts.tv_sec=0; ts.tv_nsec =
-   1000*nms;nanosleep(&ts, NULL);} */
 #else
 #include <unistd.h>
 #include <sched.h>
-#define thread_sleep(nms) sched_yield();
+#endif /* __APPLE__ */
+/* Not Windows. Assume pthreads */
+
+/* thread_sleep implementation: yield unless Linux/Unix. */
+#if defined(__unix__) || defined(__APPLE__)
+#define thread_sleep(nms)
 /* {struct timespec ts;ts.tv_sec=0;
     ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);} */
-#endif
-/* Not Windows. Assume pthreads */
+#else
+#define thread_sleep(nms) sched_yield();
+#endif /* __unix__ || __APPLE__ */
 
 #endif
 
diff --git a/vp8/common/vp8_loopfilter.c b/vp8/common/vp8_loopfilter.c
index 9c9e5f351b..4576c18537 100644
--- a/vp8/common/vp8_loopfilter.c
+++ b/vp8/common/vp8_loopfilter.c
@@ -111,7 +111,7 @@ void vp8_loop_filter_frame_init(VP8_COMMON *cm, MACROBLOCKD *mbd,
 
     /* Note the baseline filter values for each segment */
     if (mbd->segmentation_enabled) {
-      if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA) {
+      if (mbd->mb_segment_abs_delta == SEGMENT_ABSDATA) {
         lvl_seg = mbd->segment_feature_data[MB_LVL_ALT_LF][seg];
       } else { /* Delta Value */
         lvl_seg += mbd->segment_feature_data[MB_LVL_ALT_LF][seg];
diff --git a/vp8/common/x86/bilinear_filter_sse2.c b/vp8/common/x86/bilinear_filter_sse2.c
index 9bf65d8045..ff6cbbd68c 100644
--- a/vp8/common/x86/bilinear_filter_sse2.c
+++ b/vp8/common/x86/bilinear_filter_sse2.c
@@ -313,10 +313,10 @@ static INLINE void vertical_4x4(uint16_t *src, uint8_t *dst, const int stride,
       const __m128i compensated = _mm_add_epi16(sum, round_factor);
       const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT);
       __m128i packed = _mm_packus_epi16(shifted, shifted);
-      storeu_uint32(dst, _mm_cvtsi128_si32(packed));
+      storeu_int32(dst, _mm_cvtsi128_si32(packed));
       packed = _mm_srli_si128(packed, 4);
       dst += stride;
-      storeu_uint32(dst, _mm_cvtsi128_si32(packed));
+      storeu_int32(dst, _mm_cvtsi128_si32(packed));
       dst += stride;
       src += 8;
     }
diff --git a/vp8/decoder/dboolhuff.h b/vp8/decoder/dboolhuff.h
index f2a18f0d90..673b2fbd5d 100644
--- a/vp8/decoder/dboolhuff.h
+++ b/vp8/decoder/dboolhuff.h
@@ -15,6 +15,7 @@
 #include <limits.h>
 
 #include "./vpx_config.h"
+#include "vpx_ports/compiler_attributes.h"
 #include "vpx_ports/mem.h"
 #include "vpx/vp8dx.h"
 #include "vpx/vpx_integer.h"
@@ -50,7 +51,8 @@ int vp8dx_start_decode(BOOL_DECODER *br, const unsigned char *source,
 
 void vp8dx_bool_decoder_fill(BOOL_DECODER *br);
 
-static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) {
+static VPX_NO_UNSIGNED_SHIFT_CHECK int vp8dx_decode_bool(BOOL_DECODER *br,
+                                                         int probability) {
   unsigned int bit = 0;
   VP8_BD_VALUE value;
   unsigned int split;
diff --git a/vp8/decoder/decodeframe.c b/vp8/decoder/decodeframe.c
index 67c254fa14..af9a98c1de 100644
--- a/vp8/decoder/decodeframe.c
+++ b/vp8/decoder/decodeframe.c
@@ -63,7 +63,7 @@ void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd) {
   /* Decide whether to use the default or alternate baseline Q value. */
   if (xd->segmentation_enabled) {
     /* Abs Value */
-    if (xd->mb_segement_abs_delta == SEGMENT_ABSDATA) {
+    if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) {
       QIndex = xd->segment_feature_data[MB_LVL_ALT_Q][mbmi->segment_id];
 
       /* Delta Value */
@@ -829,7 +829,7 @@ static void init_frame(VP8D_COMP *pbi) {
 
     /* reset the segment feature data to 0 with delta coding (Default state). */
     memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
-    xd->mb_segement_abs_delta = SEGMENT_DELTADATA;
+    xd->mb_segment_abs_delta = SEGMENT_DELTADATA;
 
     /* reset the mode ref deltasa for loop filter */
     memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
@@ -872,8 +872,8 @@ static void init_frame(VP8D_COMP *pbi) {
   xd->mode_info_stride = pc->mode_info_stride;
   xd->corrupted = 0; /* init without corruption */
 
-  xd->fullpixel_mask = 0xffffffff;
-  if (pc->full_pixel) xd->fullpixel_mask = 0xfffffff8;
+  xd->fullpixel_mask = ~0;
+  if (pc->full_pixel) xd->fullpixel_mask = ~7;
 }
 
 int vp8_decode_frame(VP8D_COMP *pbi) {
@@ -995,7 +995,7 @@ int vp8_decode_frame(VP8D_COMP *pbi) {
     xd->update_mb_segmentation_data = (unsigned char)vp8_read_bit(bc);
 
     if (xd->update_mb_segmentation_data) {
-      xd->mb_segement_abs_delta = (unsigned char)vp8_read_bit(bc);
+      xd->mb_segment_abs_delta = (unsigned char)vp8_read_bit(bc);
 
       memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
 
@@ -1167,14 +1167,6 @@ int vp8_decode_frame(VP8D_COMP *pbi) {
   if (pbi->ec_active && xd->corrupted) pc->refresh_last_frame = 1;
 #endif
 
-  if (0) {
-    FILE *z = fopen("decodestats.stt", "a");
-    fprintf(z, "%6d F:%d,G:%d,A:%d,L:%d,Q:%d\n", pc->current_video_frame,
-            pc->frame_type, pc->refresh_golden_frame, pc->refresh_alt_ref_frame,
-            pc->refresh_last_frame, pc->base_qindex);
-    fclose(z);
-  }
-
   {
     pbi->independent_partitions = 1;
 
diff --git a/vp8/decoder/decodemv.c b/vp8/decoder/decodemv.c
index 94373852dc..3f459d623f 100644
--- a/vp8/decoder/decodemv.c
+++ b/vp8/decoder/decodemv.c
@@ -173,7 +173,8 @@ const vp8_prob vp8_sub_mv_ref_prob3[8][VP8_SUBMVREFS - 1] = {
   { 208, 1, 1 }     /* SUBMVREF_LEFT_ABOVE_ZED  */
 };
 
-static const vp8_prob *get_sub_mv_ref_prob(const int left, const int above) {
+static const vp8_prob *get_sub_mv_ref_prob(const uint32_t left,
+                                           const uint32_t above) {
   int lez = (left == 0);
   int aez = (above == 0);
   int lea = (left == above);
@@ -371,9 +372,9 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi,
         tmp = cnt[CNT_NEAREST];
         cnt[CNT_NEAREST] = cnt[CNT_NEAR];
         cnt[CNT_NEAR] = tmp;
-        tmp = near_mvs[CNT_NEAREST].as_int;
+        tmp = (int)near_mvs[CNT_NEAREST].as_int;
         near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int;
-        near_mvs[CNT_NEAR].as_int = tmp;
+        near_mvs[CNT_NEAR].as_int = (uint32_t)tmp;
       }
 
       if (vp8_read(bc, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c
index 765d2ec83e..2248345ba2 100644
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c
@@ -302,7 +302,7 @@ static int check_fragments_for_errors(VP8D_COMP *pbi) {
   return 1;
 }
 
-int vp8dx_receive_compressed_data(VP8D_COMP *pbi, int64_t time_stamp) {
+int vp8dx_receive_compressed_data(VP8D_COMP *pbi) {
   VP8_COMMON *cm = &pbi->common;
   int retcode = -1;
 
@@ -368,14 +368,12 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, int64_t time_stamp) {
 #endif
 
   pbi->ready_for_new_data = 0;
-  pbi->last_time_stamp = time_stamp;
 
 decode_exit:
   vpx_clear_system_state();
   return retcode;
 }
 int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd,
-                        int64_t *time_stamp, int64_t *time_end_stamp,
                         vp8_ppflags_t *flags) {
   int ret = -1;
 
@@ -385,8 +383,6 @@ int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd,
   if (pbi->common.show_frame == 0) return ret;
 
   pbi->ready_for_new_data = 1;
-  *time_stamp = pbi->last_time_stamp;
-  *time_end_stamp = 0;
 
 #if CONFIG_POSTPROC
   ret = vp8_post_proc_frame(&pbi->common, sd, flags);
diff --git a/vp8/decoder/onyxd_int.h b/vp8/decoder/onyxd_int.h
index cf2c066d9b..1070849620 100644
--- a/vp8/decoder/onyxd_int.h
+++ b/vp8/decoder/onyxd_int.h
@@ -11,6 +11,8 @@
 #ifndef VPX_VP8_DECODER_ONYXD_INT_H_
 #define VPX_VP8_DECODER_ONYXD_INT_H_
 
+#include <assert.h>
+
 #include "vpx_config.h"
 #include "vp8/common/onyxd.h"
 #include "treereader.h"
@@ -97,7 +99,6 @@ typedef struct VP8D_COMP {
 /* end of threading data */
 #endif
 
-  int64_t last_time_stamp;
   int ready_for_new_data;
 
   vp8_prob prob_intra;
@@ -133,25 +134,6 @@ int vp8_decode_frame(VP8D_COMP *pbi);
 int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf);
 int vp8_remove_decoder_instances(struct frame_buffers *fb);
 
-#if CONFIG_DEBUG
-#define CHECK_MEM_ERROR(lval, expr)                                         \
-  do {                                                                      \
-    (lval) = (expr);                                                        \
-    if (!(lval))                                                            \
-      vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,           \
-                         "Failed to allocate " #lval " at %s:%d", __FILE__, \
-                         __LINE__);                                         \
-  } while (0)
-#else
-#define CHECK_MEM_ERROR(lval, expr)                               \
-  do {                                                            \
-    (lval) = (expr);                                              \
-    if (!(lval))                                                  \
-      vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, \
-                         "Failed to allocate " #lval);            \
-  } while (0)
-#endif
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c
index 561922de32..6ccb080cf9 100644
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -10,7 +10,7 @@
 
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
-#if !defined(WIN32) && CONFIG_OS_SUPPORT == 1
+#if !defined(_WIN32) && CONFIG_OS_SUPPORT == 1
 #include <unistd.h>
 #endif
 #include "onyxd_int.h"
@@ -30,11 +30,13 @@
 #include "error_concealment.h"
 #endif
 
-#define CALLOC_ARRAY(p, n) CHECK_MEM_ERROR((p), vpx_calloc(sizeof(*(p)), (n)))
-#define CALLOC_ARRAY_ALIGNED(p, n, algn)                            \
-  do {                                                              \
-    CHECK_MEM_ERROR((p), vpx_memalign((algn), sizeof(*(p)) * (n))); \
-    memset((p), 0, (n) * sizeof(*(p)));                             \
+#define CALLOC_ARRAY(p, n) \
+  CHECK_MEM_ERROR(&pbi->common.error, (p), vpx_calloc(sizeof(*(p)), (n)))
+#define CALLOC_ARRAY_ALIGNED(p, n, algn)                       \
+  do {                                                         \
+    CHECK_MEM_ERROR(&pbi->common.error, (p),                   \
+                    vpx_memalign((algn), sizeof(*(p)) * (n))); \
+    memset((p), 0, (n) * sizeof(*(p)));                        \
   } while (0)
 
 static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd,
@@ -54,7 +56,7 @@ static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd,
     mbd->dst = xd->dst;
 
     mbd->segmentation_enabled = xd->segmentation_enabled;
-    mbd->mb_segement_abs_delta = xd->mb_segement_abs_delta;
+    mbd->mb_segment_abs_delta = xd->mb_segment_abs_delta;
     memcpy(mbd->segment_feature_data, xd->segment_feature_data,
            sizeof(xd->segment_feature_data));
 
@@ -74,9 +76,9 @@ static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd,
     memcpy(mbd->dequant_y2, xd->dequant_y2, sizeof(xd->dequant_y2));
     memcpy(mbd->dequant_uv, xd->dequant_uv, sizeof(xd->dequant_uv));
 
-    mbd->fullpixel_mask = 0xffffffff;
+    mbd->fullpixel_mask = ~0;
 
-    if (pc->full_pixel) mbd->fullpixel_mask = 0xfffffff8;
+    if (pc->full_pixel) mbd->fullpixel_mask = ~7;
   }
 
   for (i = 0; i < pc->mb_rows; ++i)
@@ -754,7 +756,7 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) {
     uv_width = width >> 1;
 
     /* Allocate a vpx_atomic_int for each mb row. */
-    CHECK_MEM_ERROR(pbi->mt_current_mb_col,
+    CHECK_MEM_ERROR(&pc->error, pbi->mt_current_mb_col,
                     vpx_malloc(sizeof(*pbi->mt_current_mb_col) * pc->mb_rows));
     for (i = 0; i < pc->mb_rows; ++i)
       vpx_atomic_init(&pbi->mt_current_mb_col[i], 0);
@@ -762,7 +764,7 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) {
     /* Allocate memory for above_row buffers. */
     CALLOC_ARRAY(pbi->mt_yabove_row, pc->mb_rows);
     for (i = 0; i < pc->mb_rows; ++i) {
-      CHECK_MEM_ERROR(pbi->mt_yabove_row[i],
+      CHECK_MEM_ERROR(&pc->error, pbi->mt_yabove_row[i],
                       vpx_memalign(16, sizeof(unsigned char) *
                                            (width + (VP8BORDERINPIXELS << 1))));
       vp8_zero_array(pbi->mt_yabove_row[i], width + (VP8BORDERINPIXELS << 1));
@@ -770,7 +772,7 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) {
 
     CALLOC_ARRAY(pbi->mt_uabove_row, pc->mb_rows);
     for (i = 0; i < pc->mb_rows; ++i) {
-      CHECK_MEM_ERROR(pbi->mt_uabove_row[i],
+      CHECK_MEM_ERROR(&pc->error, pbi->mt_uabove_row[i],
                       vpx_memalign(16, sizeof(unsigned char) *
                                            (uv_width + VP8BORDERINPIXELS)));
       vp8_zero_array(pbi->mt_uabove_row[i], uv_width + VP8BORDERINPIXELS);
@@ -778,7 +780,7 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) {
 
     CALLOC_ARRAY(pbi->mt_vabove_row, pc->mb_rows);
     for (i = 0; i < pc->mb_rows; ++i) {
-      CHECK_MEM_ERROR(pbi->mt_vabove_row[i],
+      CHECK_MEM_ERROR(&pc->error, pbi->mt_vabove_row[i],
                       vpx_memalign(16, sizeof(unsigned char) *
                                            (uv_width + VP8BORDERINPIXELS)));
       vp8_zero_array(pbi->mt_vabove_row[i], uv_width + VP8BORDERINPIXELS);
@@ -787,17 +789,17 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) {
     /* Allocate memory for left_col buffers. */
     CALLOC_ARRAY(pbi->mt_yleft_col, pc->mb_rows);
     for (i = 0; i < pc->mb_rows; ++i)
-      CHECK_MEM_ERROR(pbi->mt_yleft_col[i],
+      CHECK_MEM_ERROR(&pc->error, pbi->mt_yleft_col[i],
                       vpx_calloc(sizeof(unsigned char) * 16, 1));
 
     CALLOC_ARRAY(pbi->mt_uleft_col, pc->mb_rows);
     for (i = 0; i < pc->mb_rows; ++i)
-      CHECK_MEM_ERROR(pbi->mt_uleft_col[i],
+      CHECK_MEM_ERROR(&pc->error, pbi->mt_uleft_col[i],
                       vpx_calloc(sizeof(unsigned char) * 8, 1));
 
     CALLOC_ARRAY(pbi->mt_vleft_col, pc->mb_rows);
     for (i = 0; i < pc->mb_rows; ++i)
-      CHECK_MEM_ERROR(pbi->mt_vleft_col[i],
+      CHECK_MEM_ERROR(&pc->error, pbi->mt_vleft_col[i],
                       vpx_calloc(sizeof(unsigned char) * 8, 1));
   }
 }
diff --git a/vp8/encoder/arm/neon/fastquantizeb_neon.c b/vp8/encoder/arm/neon/fastquantizeb_neon.c
index 6fc60805f6..950c943343 100644
--- a/vp8/encoder/arm/neon/fastquantizeb_neon.c
+++ b/vp8/encoder/arm/neon/fastquantizeb_neon.c
@@ -28,11 +28,11 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
                    zig_zag1 = vld1q_u16(inv_zig_zag + 8);
   int16x8_t x0, x1, sz0, sz1, y0, y1;
   uint16x8_t eob0, eob1;
-#ifndef __aarch64__
+#if !VPX_ARCH_AARCH64
   uint16x4_t eob_d16;
   uint32x2_t eob_d32;
   uint32x4_t eob_q32;
-#endif  // __arch64__
+#endif  // !VPX_ARCH_AARCH64
 
   /* sign of z: z >> 15 */
   sz0 = vshrq_n_s16(z0, 15);
@@ -70,7 +70,7 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
 
   /* select the largest value */
   eob0 = vmaxq_u16(eob0, eob1);
-#ifdef __aarch64__
+#if VPX_ARCH_AARCH64
   *d->eob = (int8_t)vmaxvq_u16(eob0);
 #else
   eob_d16 = vmax_u16(vget_low_u16(eob0), vget_high_u16(eob0));
@@ -79,7 +79,7 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
   eob_d32 = vpmax_u32(eob_d32, eob_d32);
 
   vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0);
-#endif  // __aarch64__
+#endif  // VPX_ARCH_AARCH64
 
   /* qcoeff = x */
   vst1q_s16(d->qcoeff, x0);
diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c
index 80cbb882fd..03691fc9d1 100644
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -19,6 +19,7 @@
 #include <limits.h>
 #include "vpx/vpx_encoder.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/compiler_attributes.h"
 #include "vpx_ports/system_state.h"
 #include "bitstream.h"
 
@@ -117,7 +118,9 @@ static void write_split(vp8_writer *bc, int x) {
                   vp8_mbsplit_encodings + x);
 }
 
-void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount) {
+void VPX_NO_UNSIGNED_SHIFT_CHECK vp8_pack_tokens(vp8_writer *w,
+                                                 const TOKENEXTRA *p,
+                                                 int xcount) {
   const TOKENEXTRA *stop = p + xcount;
   unsigned int split;
   int shift;
@@ -172,9 +175,8 @@ void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount) {
         validate_buffer(w->buffer + w->pos, 1, w->buffer_end, w->error);
 
         w->buffer[w->pos++] = (lowvalue >> (24 - offset)) & 0xff;
-        lowvalue <<= offset;
         shift = count;
-        lowvalue &= 0xffffff;
+        lowvalue = (int)(((uint64_t)lowvalue << offset) & 0xffffff);
         count -= 8;
       }
 
@@ -223,9 +225,8 @@ void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount) {
             validate_buffer(w->buffer + w->pos, 1, w->buffer_end, w->error);
 
             w->buffer[w->pos++] = (lowvalue >> (24 - offset)) & 0xff;
-            lowvalue <<= offset;
             shift = count;
-            lowvalue &= 0xffffff;
+            lowvalue = (int)(((uint64_t)lowvalue << offset) & 0xffffff);
             count -= 8;
           }
 
@@ -866,7 +867,6 @@ void vp8_update_coef_probs(VP8_COMP *cpi) {
 #if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
   vp8_writer *const w = cpi->bc;
 #endif
-  int savings = 0;
 
   vpx_clear_system_state();
 
@@ -940,8 +940,6 @@ void vp8_update_coef_probs(VP8_COMP *cpi) {
 #if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
             vp8_write_literal(w, newp, 8);
 #endif
-
-            savings += s;
           }
 
         } while (++t < ENTROPY_NODES);
@@ -1082,7 +1080,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest,
     if (xd->update_mb_segmentation_data) {
       signed char Data;
 
-      vp8_write_bit(bc, xd->mb_segement_abs_delta);
+      vp8_write_bit(bc, xd->mb_segment_abs_delta);
 
       /* For each segmentation feature (Quant and loop filter level) */
       for (i = 0; i < MB_LVL_MAX; ++i) {
diff --git a/vp8/encoder/boolhuff.h b/vp8/encoder/boolhuff.h
index 8cc61bdd44..a8c536b99c 100644
--- a/vp8/encoder/boolhuff.h
+++ b/vp8/encoder/boolhuff.h
@@ -94,9 +94,8 @@ static void vp8_encode_bool(BOOL_CODER *bc, int bit, int probability) {
     validate_buffer(bc->buffer + bc->pos, 1, bc->buffer_end, bc->error);
     bc->buffer[bc->pos++] = (lowvalue >> (24 - offset) & 0xff);
 
-    lowvalue <<= offset;
     shift = count;
-    lowvalue &= 0xffffff;
+    lowvalue = (int)(((uint64_t)lowvalue << offset) & 0xffffff);
     count -= 8;
   }
 
diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c
index e54d1e9f4b..a666bca4d2 100644
--- a/vp8/encoder/denoising.c
+++ b/vp8/encoder/denoising.c
@@ -135,7 +135,7 @@ int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride,
     // When adopting aggressive denoiser, the adj_val for each pixel
     // could be at most 8 (this is current max adjustment of the map).
     // In SSE code, we calculate the sum of adj_val for
-    // the columns, so the sum could be upto 128(16 rows). However,
+    // the columns, so the sum could be up to 128(16 rows). However,
     // the range of the value is -128 ~ 127 in SSE code, that's why
     // we do this change in C code.
     // We don't do this for UV denoiser, since there are only 8 rows,
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index 2b3d9564ce..82c48b13a7 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -7,6 +7,8 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include <stdio.h>
+#include <limits.h>
 
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
@@ -29,9 +31,9 @@
 #include "rdopt.h"
 #include "pickinter.h"
 #include "vp8/common/findnearmv.h"
-#include <stdio.h>
-#include <limits.h>
 #include "vp8/common/invtrans.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/vpx_timer.h"
 #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
 #include "bitstream.h"
@@ -69,10 +71,9 @@ static const unsigned char VP8_VAR_OFFS[16] = { 128, 128, 128, 128, 128, 128,
                                                 128, 128, 128, 128 };
 
 /* Original activity measure from Tim T's code. */
-static unsigned int tt_activity_measure(VP8_COMP *cpi, MACROBLOCK *x) {
+static unsigned int tt_activity_measure(MACROBLOCK *x) {
   unsigned int act;
   unsigned int sse;
-  (void)cpi;
   /* TODO: This could also be done over smaller areas (8x8), but that would
    *  require extensive changes elsewhere, as lambda is assumed to be fixed
    *  over an entire MB in most of the code.
@@ -90,28 +91,21 @@ static unsigned int tt_activity_measure(VP8_COMP *cpi, MACROBLOCK *x) {
   return act;
 }
 
-/* Stub for alternative experimental activity measures. */
-static unsigned int alt_activity_measure(VP8_COMP *cpi, MACROBLOCK *x,
-                                         int use_dc_pred) {
-  return vp8_encode_intra(cpi, x, use_dc_pred);
-}
-
 /* Measure the activity of the current macroblock
  * What we measure here is TBD so abstracted to this function
  */
 #define ALT_ACT_MEASURE 1
-static unsigned int mb_activity_measure(VP8_COMP *cpi, MACROBLOCK *x,
-                                        int mb_row, int mb_col) {
+static unsigned int mb_activity_measure(MACROBLOCK *x, int mb_row, int mb_col) {
   unsigned int mb_activity;
 
   if (ALT_ACT_MEASURE) {
     int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
 
-    /* Or use and alternative. */
-    mb_activity = alt_activity_measure(cpi, x, use_dc_pred);
+    /* Or use an alternative. */
+    mb_activity = vp8_encode_intra(x, use_dc_pred);
   } else {
     /* Original activity measure from Tim T's code. */
-    mb_activity = tt_activity_measure(cpi, x);
+    mb_activity = tt_activity_measure(x);
   }
 
   if (mb_activity < VP8_ACTIVITY_AVG_MIN) mb_activity = VP8_ACTIVITY_AVG_MIN;
@@ -131,7 +125,7 @@ static void calc_av_activity(VP8_COMP *cpi, int64_t activity_sum) {
     unsigned int tmp;
 
     /* Create a list to sort to */
-    CHECK_MEM_ERROR(sortlist,
+    CHECK_MEM_ERROR(&cpi->common.error, sortlist,
                     vpx_calloc(sizeof(unsigned int), cpi->common.MBs));
 
     /* Copy map to sort list */
@@ -264,7 +258,7 @@ static void build_activity_map(VP8_COMP *cpi) {
       vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
 
       /* measure activity */
-      mb_activity = mb_activity_measure(cpi, x, mb_row, mb_col);
+      mb_activity = mb_activity_measure(x, mb_row, mb_col);
 
       /* Keep frame sum */
       activity_sum += mb_activity;
@@ -343,8 +337,11 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row,
   const int nsync = cpi->mt_sync_range;
   vpx_atomic_int rightmost_col = VPX_ATOMIC_INIT(cm->mb_cols + nsync);
   const vpx_atomic_int *last_row_current_mb_col;
-  vpx_atomic_int *current_mb_col = &cpi->mt_current_mb_col[mb_row];
+  vpx_atomic_int *current_mb_col = NULL;
 
+  if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) != 0) {
+    current_mb_col = &cpi->mt_current_mb_col[mb_row];
+  }
   if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) != 0 && mb_row != 0) {
     last_row_current_mb_col = &cpi->mt_current_mb_col[mb_row - 1];
   } else {
@@ -450,13 +447,21 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row,
     x->active_ptr = cpi->active_map + map_index + mb_col;
 
     if (cm->frame_type == KEY_FRAME) {
-      *totalrate += vp8cx_encode_intra_macroblock(cpi, x, tp);
+      const int intra_rate_cost = vp8cx_encode_intra_macroblock(cpi, x, tp);
+      if (INT_MAX - *totalrate > intra_rate_cost)
+        *totalrate += intra_rate_cost;
+      else
+        *totalrate = INT_MAX;
 #ifdef MODE_STATS
       y_modes[xd->mbmi.mode]++;
 #endif
     } else {
-      *totalrate += vp8cx_encode_inter_macroblock(
+      const int inter_rate_cost = vp8cx_encode_inter_macroblock(
           cpi, x, tp, recon_yoffset, recon_uvoffset, mb_row, mb_col);
+      if (INT_MAX - *totalrate > inter_rate_cost)
+        *totalrate += inter_rate_cost;
+      else
+        *totalrate = INT_MAX;
 
 #ifdef MODE_STATS
       inter_y_modes[xd->mbmi.mode]++;
@@ -631,12 +636,13 @@ static void init_encode_frame_mb_context(VP8_COMP *cpi) {
                              cpi->prob_last_coded, cpi->prob_gf_coded);
   }
 
-  xd->fullpixel_mask = 0xffffffff;
-  if (cm->full_pixel) xd->fullpixel_mask = 0xfffffff8;
+  xd->fullpixel_mask = ~0;
+  if (cm->full_pixel) xd->fullpixel_mask = ~7;
 
   vp8_zero(x->coef_counts);
   vp8_zero(x->ymode_count);
-  vp8_zero(x->uv_mode_count) x->prediction_error = 0;
+  vp8_zero(x->uv_mode_count);
+  x->prediction_error = 0;
   x->intra_error = 0;
   vp8_zero(x->count_mb_ref_frame_usage);
 }
@@ -754,6 +760,15 @@ void vp8_encode_frame(VP8_COMP *cpi) {
       vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei,
                                 cpi->encoding_thread_count);
 
+      if (cpi->mt_current_mb_col_size != cm->mb_rows) {
+        vpx_free(cpi->mt_current_mb_col);
+        cpi->mt_current_mb_col = NULL;
+        cpi->mt_current_mb_col_size = 0;
+        CHECK_MEM_ERROR(
+            &cpi->common.error, cpi->mt_current_mb_col,
+            vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows));
+        cpi->mt_current_mb_col_size = cm->mb_rows;
+      }
       for (i = 0; i < cm->mb_rows; ++i)
         vpx_atomic_store_release(&cpi->mt_current_mb_col[i], -1);
 
@@ -763,12 +778,12 @@ void vp8_encode_frame(VP8_COMP *cpi) {
 
       for (mb_row = 0; mb_row < cm->mb_rows;
            mb_row += (cpi->encoding_thread_count + 1)) {
-        vp8_zero(cm->left_context)
+        vp8_zero(cm->left_context);
 
 #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
-            tp = cpi->tok;
+        tp = cpi->tok;
 #else
-            tp = cpi->tok + mb_row * (cm->mb_cols * 16 * 24);
+        tp = cpi->tok + mb_row * (cm->mb_cols * 16 * 24);
 #endif
 
         encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);
@@ -855,10 +870,10 @@ void vp8_encode_frame(VP8_COMP *cpi) {
 
       /* for each macroblock row in image */
       for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
-        vp8_zero(cm->left_context)
+        vp8_zero(cm->left_context);
 
 #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
-            tp = cpi->tok;
+        tp = cpi->tok;
 #endif
 
         encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);
diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c
index f89e7cb1fa..7d448c0ea0 100644
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c
@@ -18,10 +18,9 @@
 #include "vp8/common/invtrans.h"
 #include "encodeintra.h"
 
-int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred) {
+int vp8_encode_intra(MACROBLOCK *x, int use_dc_pred) {
   int i;
   int intra_pred_var = 0;
-  (void)cpi;
 
   if (use_dc_pred) {
     x->e_mbd.mode_info_context->mbmi.mode = DC_PRED;
diff --git a/vp8/encoder/encodeintra.h b/vp8/encoder/encodeintra.h
index 021dc5ed76..9a378abf49 100644
--- a/vp8/encoder/encodeintra.h
+++ b/vp8/encoder/encodeintra.h
@@ -16,7 +16,7 @@
 extern "C" {
 #endif
 
-int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred);
+int vp8_encode_intra(MACROBLOCK *x, int use_dc_pred);
 void vp8_encode_intra16x16mby(MACROBLOCK *x);
 void vp8_encode_intra16x16mbuv(MACROBLOCK *x);
 void vp8_encode_intra4x4mby(MACROBLOCK *mb);
diff --git a/vp8/encoder/encodemv.c b/vp8/encoder/encodemv.c
index 04adf105b9..384bb29389 100644
--- a/vp8/encoder/encodemv.c
+++ b/vp8/encoder/encodemv.c
@@ -31,17 +31,15 @@ static void encode_mvcomponent(vp8_writer *const w, const int v,
 
     vp8_write(w, 1, p[mvpis_short]);
 
-    do
+    do {
       vp8_write(w, (x >> i) & 1, p[MVPbits + i]);
-
-    while (++i < 3);
+    } while (++i < 3);
 
     i = mvlong_width - 1; /* Skip bit 3, which is sometimes implicit */
 
-    do
+    do {
       vp8_write(w, (x >> i) & 1, p[MVPbits + i]);
-
-    while (--i > 3);
+    } while (--i > 3);
 
     if (x & 0xFFF0) vp8_write(w, (x >> 3) & 1, p[MVPbits + 3]);
   }
@@ -160,7 +158,7 @@ static void calc_prob(vp8_prob *p, const unsigned int ct[2]) {
   const unsigned int tot = ct[0] + ct[1];
 
   if (tot) {
-    const vp8_prob x = ((ct[0] * 255) / tot) & -2;
+    const vp8_prob x = ((ct[0] * 255) / tot) & ~1u;
     *p = x ? x : 1;
   }
 }
@@ -205,8 +203,11 @@ static void write_component_probs(vp8_writer *const w,
   (void)rc;
   vp8_copy_array(Pnew, default_mvc, MVPcount);
 
-  vp8_zero(is_short_ct) vp8_zero(sign_ct) vp8_zero(bit_ct) vp8_zero(short_ct)
-      vp8_zero(short_bct)
+  vp8_zero(is_short_ct);
+  vp8_zero(sign_ct);
+  vp8_zero(bit_ct);
+  vp8_zero(short_ct);
+  vp8_zero(short_bct);
 
   /* j=0 */
   {
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index 55a1528b14..e2f8b89d46 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -7,6 +7,7 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include <stddef.h>
 
 #include "onyx_int.h"
 #include "vp8/common/threading.h"
@@ -402,7 +403,7 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc) {
     zd->subpixel_predict8x8 = xd->subpixel_predict8x8;
     zd->subpixel_predict16x16 = xd->subpixel_predict16x16;
     zd->segmentation_enabled = xd->segmentation_enabled;
-    zd->mb_segement_abs_delta = xd->mb_segement_abs_delta;
+    zd->mb_segment_abs_delta = xd->mb_segment_abs_delta;
     memcpy(zd->segment_feature_data, xd->segment_feature_data,
            sizeof(xd->segment_feature_data));
 
@@ -470,8 +471,8 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi, MACROBLOCK *x,
 
     setup_mbby_copy(&mbr_ei[i].mb, x);
 
-    mbd->fullpixel_mask = 0xffffffff;
-    if (cm->full_pixel) mbd->fullpixel_mask = 0xfffffff8;
+    mbd->fullpixel_mask = ~0;
+    if (cm->full_pixel) mbd->fullpixel_mask = ~7;
 
     vp8_zero(mb->coef_counts);
     vp8_zero(x->ymode_count);
@@ -487,15 +488,10 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi, MACROBLOCK *x,
 
 int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
   const VP8_COMMON *cm = &cpi->common;
-
-  vpx_atomic_init(&cpi->b_multi_threaded, 0);
-  cpi->encoding_thread_count = 0;
-  cpi->b_lpf_running = 0;
+  int th_count = 0;
 
   if (cm->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1) {
-    int ithread;
-    int th_count = cpi->oxcf.multi_threaded - 1;
-    int rc = 0;
+    th_count = cpi->oxcf.multi_threaded - 1;
 
     /* don't allocate more threads than cores available */
     if (cpi->oxcf.multi_threaded > cm->processor_core_count) {
@@ -507,19 +503,24 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
     if (th_count > ((cm->mb_cols / cpi->mt_sync_range) - 1)) {
       th_count = (cm->mb_cols / cpi->mt_sync_range) - 1;
     }
+  }
+  if (th_count == cpi->encoding_thread_count) return 0;
 
-    if (th_count == 0) return 0;
+  vp8cx_remove_encoder_threads(cpi);
+  if (th_count != 0) {
+    int ithread;
+    int rc = 0;
 
-    CHECK_MEM_ERROR(cpi->h_encoding_thread,
+    CHECK_MEM_ERROR(&cpi->common.error, cpi->h_encoding_thread,
                     vpx_malloc(sizeof(pthread_t) * th_count));
-    CHECK_MEM_ERROR(cpi->h_event_start_encoding,
+    CHECK_MEM_ERROR(&cpi->common.error, cpi->h_event_start_encoding,
                     vpx_malloc(sizeof(sem_t) * th_count));
-    CHECK_MEM_ERROR(cpi->h_event_end_encoding,
+    CHECK_MEM_ERROR(&cpi->common.error, cpi->h_event_end_encoding,
                     vpx_malloc(sizeof(sem_t) * th_count));
-    CHECK_MEM_ERROR(cpi->mb_row_ei,
+    CHECK_MEM_ERROR(&cpi->common.error, cpi->mb_row_ei,
                     vpx_memalign(32, sizeof(MB_ROW_COMP) * th_count));
     memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * th_count);
-    CHECK_MEM_ERROR(cpi->en_thread_data,
+    CHECK_MEM_ERROR(&cpi->common.error, cpi->en_thread_data,
                     vpx_malloc(sizeof(ENCODETHREAD_DATA) * th_count));
 
     vpx_atomic_store_release(&cpi->b_multi_threaded, 1);
@@ -553,6 +554,8 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
       /* shutdown other threads */
       vpx_atomic_store_release(&cpi->b_multi_threaded, 0);
       for (--ithread; ithread >= 0; ithread--) {
+        sem_post(&cpi->h_event_start_encoding[ithread]);
+        sem_post(&cpi->h_event_end_encoding[ithread]);
         pthread_join(cpi->h_encoding_thread[ithread], 0);
         sem_destroy(&cpi->h_event_start_encoding[ithread]);
         sem_destroy(&cpi->h_event_end_encoding[ithread]);
@@ -560,10 +563,16 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
 
       /* free thread related resources */
       vpx_free(cpi->h_event_start_encoding);
+      cpi->h_event_start_encoding = NULL;
       vpx_free(cpi->h_event_end_encoding);
+      cpi->h_event_end_encoding = NULL;
       vpx_free(cpi->h_encoding_thread);
+      cpi->h_encoding_thread = NULL;
       vpx_free(cpi->mb_row_ei);
+      cpi->mb_row_ei = NULL;
       vpx_free(cpi->en_thread_data);
+      cpi->en_thread_data = NULL;
+      cpi->encoding_thread_count = 0;
 
       return -1;
     }
@@ -592,10 +601,16 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
 
         /* free thread related resources */
         vpx_free(cpi->h_event_start_encoding);
+        cpi->h_event_start_encoding = NULL;
         vpx_free(cpi->h_event_end_encoding);
+        cpi->h_event_end_encoding = NULL;
         vpx_free(cpi->h_encoding_thread);
+        cpi->h_encoding_thread = NULL;
         vpx_free(cpi->mb_row_ei);
+        cpi->mb_row_ei = NULL;
         vpx_free(cpi->en_thread_data);
+        cpi->en_thread_data = NULL;
+        cpi->encoding_thread_count = 0;
 
         return -2;
       }
@@ -627,13 +642,23 @@ void vp8cx_remove_encoder_threads(VP8_COMP *cpi) {
 
     sem_destroy(&cpi->h_event_end_lpf);
     sem_destroy(&cpi->h_event_start_lpf);
+    cpi->b_lpf_running = 0;
 
     /* free thread related resources */
+    vpx_free(cpi->mt_current_mb_col);
+    cpi->mt_current_mb_col = NULL;
+    cpi->mt_current_mb_col_size = 0;
     vpx_free(cpi->h_event_start_encoding);
+    cpi->h_event_start_encoding = NULL;
     vpx_free(cpi->h_event_end_encoding);
+    cpi->h_event_end_encoding = NULL;
     vpx_free(cpi->h_encoding_thread);
+    cpi->h_encoding_thread = NULL;
     vpx_free(cpi->mb_row_ei);
+    cpi->mb_row_ei = NULL;
     vpx_free(cpi->en_thread_data);
+    cpi->en_thread_data = NULL;
+    cpi->encoding_thread_count = 0;
   }
 }
 #endif
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index 981c0fde35..4443f5e7cd 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -412,7 +412,7 @@ static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x,
   int_mv ref_mv_full;
 
   int tmp_err;
-  int step_param = 3; /* Dont search over full range for first pass */
+  int step_param = 3; /* Don't search over full range for first pass */
   int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
   int n;
   vp8_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
@@ -567,7 +567,7 @@ void vp8_first_pass(VP8_COMP *cpi) {
       vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
 
       /* do intra 16x16 prediction */
-      this_error = vp8_encode_intra(cpi, x, use_dc_pred);
+      this_error = vp8_encode_intra(x, use_dc_pred);
 
       /* "intrapenalty" below deals with situations where the intra
        * and inter error scores are very low (eg a plain black frame)
@@ -821,22 +821,6 @@ void vp8_first_pass(VP8_COMP *cpi) {
     vp8_yv12_copy_frame(lst_yv12, gld_yv12);
   }
 
-  /* use this to see what the first pass reconstruction looks like */
-  if (0) {
-    char filename[512];
-    FILE *recon_file;
-    sprintf(filename, "enc%04d.yuv", (int)cm->current_video_frame);
-
-    if (cm->current_video_frame == 0) {
-      recon_file = fopen(filename, "wb");
-    } else {
-      recon_file = fopen(filename, "ab");
-    }
-
-    (void)fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1, recon_file);
-    fclose(recon_file);
-  }
-
   cm->current_video_frame++;
 }
 extern const int vp8_bits_per_mb[2][QINDEX_RANGE];
@@ -903,9 +887,9 @@ static double calc_correction_factor(double err_per_mb, double err_devisor,
   correction_factor = pow(error_term, power_term);
 
   /* Clip range */
-  correction_factor = (correction_factor < 0.05)
-                          ? 0.05
-                          : (correction_factor > 5.0) ? 5.0 : correction_factor;
+  correction_factor = (correction_factor < 0.05)  ? 0.05
+                      : (correction_factor > 5.0) ? 5.0
+                                                  : correction_factor;
 
   return correction_factor;
 }
@@ -947,11 +931,10 @@ static int estimate_max_q(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats,
     }
 
     cpi->twopass.est_max_qcorrection_factor =
-        (cpi->twopass.est_max_qcorrection_factor < 0.1)
-            ? 0.1
-            : (cpi->twopass.est_max_qcorrection_factor > 10.0)
-                  ? 10.0
-                  : cpi->twopass.est_max_qcorrection_factor;
+        (cpi->twopass.est_max_qcorrection_factor < 0.1) ? 0.1
+        : (cpi->twopass.est_max_qcorrection_factor > 10.0)
+            ? 10.0
+            : cpi->twopass.est_max_qcorrection_factor;
   }
 
   /* Corrections for higher compression speed settings
@@ -1039,12 +1022,6 @@ static int estimate_cq(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats,
   double clip_iifactor;
   int overhead_bits_per_mb;
 
-  if (0) {
-    FILE *f = fopen("epmp.stt", "a");
-    fprintf(f, "%10.2f\n", err_per_mb);
-    fclose(f);
-  }
-
   target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20))
                                 ? (512 * section_target_bandwitdh) / num_mbs
                                 : 512 * (section_target_bandwitdh / num_mbs);
@@ -1178,10 +1155,9 @@ static int estimate_kf_group_q(VP8_COMP *cpi, double section_err,
   } else {
     current_spend_ratio = (double)cpi->long_rolling_actual_bits /
                           (double)cpi->long_rolling_target_bits;
-    current_spend_ratio =
-        (current_spend_ratio > 10.0)
-            ? 10.0
-            : (current_spend_ratio < 0.1) ? 0.1 : current_spend_ratio;
+    current_spend_ratio = (current_spend_ratio > 10.0)  ? 10.0
+                          : (current_spend_ratio < 0.1) ? 0.1
+                                                        : current_spend_ratio;
   }
 
   /* Calculate a correction factor based on the quality of prediction in
@@ -1232,17 +1208,6 @@ static int estimate_kf_group_q(VP8_COMP *cpi, double section_err,
     Q++;
   }
 
-  if (0) {
-    FILE *f = fopen("estkf_q.stt", "a");
-    fprintf(f, "%8d %8d %8d %8.2f %8.3f %8.2f %8.3f %8.3f %8.3f %8d\n",
-            cpi->common.current_video_frame, bits_per_mb_at_this_q,
-            target_norm_bits_per_mb, err_per_mb, err_correction_factor,
-            current_spend_ratio, group_iiratio, iiratio_correction_factor,
-            (double)cpi->buffer_level / (double)cpi->oxcf.optimal_buffer_level,
-            Q);
-    fclose(f);
-  }
-
   return Q;
 }
 
@@ -1539,7 +1504,7 @@ static int calc_arf_boost(VP8_COMP *cpi, int offset, int f_frames, int b_frames,
     /* Calculate the baseline boost number for this frame */
     r = calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out);
 
-    /* We want to discount the the flash frame itself and the recovery
+    /* We want to discount the flash frame itself and the recovery
      * frame that follows as both will have poor scores.
      */
     flash_detected =
@@ -1583,7 +1548,7 @@ static int calc_arf_boost(VP8_COMP *cpi, int offset, int f_frames, int b_frames,
     /* Calculate the baseline boost number for this frame */
     r = calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out);
 
-    /* We want to discount the the flash frame itself and the recovery
+    /* We want to discount the flash frame itself and the recovery
      * frame that follows as both will have poor scores.
      */
     flash_detected =
@@ -1631,7 +1596,6 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   double this_frame_mv_in_out = 0.0;
   double mv_in_out_accumulator = 0.0;
   double abs_mv_in_out_accumulator = 0.0;
-  double mod_err_per_mb_accumulator = 0.0;
 
   int max_bits = frame_max_bits(cpi); /* Max for a single frame */
 
@@ -1682,9 +1646,6 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
 
     gf_group_err += mod_frame_err;
 
-    mod_err_per_mb_accumulator +=
-        mod_frame_err / DOUBLE_DIVIDE_CHECK((double)cpi->common.MBs);
-
     if (EOF == input_stats(cpi, &next_frame)) break;
 
     /* Test for the case where there is a brief flash but the prediction
@@ -1723,9 +1684,9 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
         /* Break at cpi->max_gf_interval unless almost totally static */
         (i >= cpi->max_gf_interval && (decay_accumulator < 0.995)) ||
         (
-            /* Dont break out with a very short interval */
+            /* Don't break out with a very short interval */
             (i > MIN_GF_INTERVAL) &&
-            /* Dont break out very close to a key frame */
+            /* Don't break out very close to a key frame */
             ((cpi->twopass.frames_to_key - i) >= MIN_GF_INTERVAL) &&
             ((boost_score > 20.0) || (next_frame.pcnt_inter < 0.75)) &&
             (!flash_detected) &&
@@ -1771,7 +1732,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     if (boost_score > max_boost) boost_score = max_boost;
   }
 
-  /* Dont allow conventional gf too near the next kf */
+  /* Don't allow conventional gf too near the next kf */
   if ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL) {
     while (i < cpi->twopass.frames_to_key) {
       i++;
@@ -1792,9 +1753,9 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   alt_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost);
 #endif
 
-  /* Should we use the alternate refernce frame */
+  /* Should we use the alternate reference frame */
   if (allow_alt_ref && (i >= MIN_GF_INTERVAL) &&
-      /* dont use ARF very near next kf */
+      /* don't use ARF very near next kf */
       (i <= (cpi->twopass.frames_to_key - MIN_GF_INTERVAL)) &&
 #if NEW_BOOST
       ((next_frame.pcnt_inter > 0.75) || (next_frame.pcnt_second_ref > 0.5)) &&
@@ -1972,11 +1933,10 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   }
 
   cpi->twopass.gf_group_bits =
-      (cpi->twopass.gf_group_bits < 0)
-          ? 0
-          : (cpi->twopass.gf_group_bits > cpi->twopass.kf_group_bits)
-                ? cpi->twopass.kf_group_bits
-                : cpi->twopass.gf_group_bits;
+      (cpi->twopass.gf_group_bits < 0) ? 0
+      : (cpi->twopass.gf_group_bits > cpi->twopass.kf_group_bits)
+          ? cpi->twopass.kf_group_bits
+          : cpi->twopass.gf_group_bits;
 
   /* Clip cpi->twopass.gf_group_bits based on user supplied data rate
    * variability limit (cpi->oxcf.two_pass_vbrmax_section)
@@ -2089,7 +2049,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
       }
     }
 
-    /* Dont allow a negative value for gf_bits */
+    /* Don't allow a negative value for gf_bits */
     if (gf_bits < 0) gf_bits = 0;
 
     /* Add in minimum for a frame */
@@ -2130,7 +2090,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     if (cpi->twopass.gf_group_bits < 0) cpi->twopass.gf_group_bits = 0;
 
     /* This condition could fail if there are two kfs very close together
-     * despite (MIN_GF_INTERVAL) and would cause a devide by 0 in the
+     * despite (MIN_GF_INTERVAL) and would cause a divide by 0 in the
      * calculation of cpi->twopass.alt_extra_bits.
      */
     if (cpi->baseline_gf_interval >= 3) {
@@ -2400,7 +2360,7 @@ void vp8_second_pass(VP8_COMP *cpi) {
   }
 
   /* The last few frames of a clip almost always have to few or too many
-   * bits and for the sake of over exact rate control we dont want to make
+   * bits and for the sake of over exact rate control we don't want to make
    * radical adjustments to the allowed quantizer range just to use up a
    * few surplus bits or get beneath the target rate.
    */
@@ -2997,8 +2957,8 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     }
 
     /* Set back to unscaled by defaults */
-    cpi->common.horiz_scale = NORMAL;
-    cpi->common.vert_scale = NORMAL;
+    cpi->common.horiz_scale = VP8E_NORMAL;
+    cpi->common.vert_scale = VP8E_NORMAL;
 
     /* Calculate Average bits per frame. */
     av_bits_per_frame = cpi->oxcf.target_bandwidth /
@@ -3018,7 +2978,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
       bits_per_frame =
           (double)(cpi->twopass.kf_group_bits / cpi->twopass.frames_to_key);
 
-      /* Dont turn to resampling in easy sections just because they
+      /* Don't turn to resampling in easy sections just because they
        * have been assigned a small number of bits
        */
       if (bits_per_frame < av_bits_per_frame) {
@@ -3054,16 +3014,6 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
               (int)((projected_bits_perframe - av_bits_per_frame) *
                     cpi->twopass.frames_to_key));
 
-    if (0) {
-      FILE *f = fopen("Subsamle.stt", "a");
-      fprintf(f, " %8d %8d %8d %8d %12.0f %8d %8d %8d\n",
-              cpi->common.current_video_frame, kf_q, cpi->common.horiz_scale,
-              cpi->common.vert_scale, kf_group_err / cpi->twopass.frames_to_key,
-              (int)(cpi->twopass.kf_group_bits / cpi->twopass.frames_to_key),
-              new_height, new_width);
-      fclose(f);
-    }
-
     /* The trigger for spatial resampling depends on the various
      * parameters such as whether we are streaming (CBR) or VBR.
      */
@@ -3127,17 +3077,6 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
          */
         kf_q = estimate_kf_group_q(cpi, err_per_frame * effective_size_ratio,
                                    (int)bits_per_frame, group_iiratio);
-
-        if (0) {
-          FILE *f = fopen("Subsamle.stt", "a");
-          fprintf(
-              f, "******** %8d %8d %8d %12.0f %8d %8d %8d\n", kf_q,
-              cpi->common.horiz_scale, cpi->common.vert_scale,
-              kf_group_err / cpi->twopass.frames_to_key,
-              (int)(cpi->twopass.kf_group_bits / cpi->twopass.frames_to_key),
-              new_height, new_width);
-          fclose(f);
-        }
       }
     }
 
diff --git a/vp8/encoder/lookahead.c b/vp8/encoder/lookahead.c
index 37aa9eee84..49f851d019 100644
--- a/vp8/encoder/lookahead.c
+++ b/vp8/encoder/lookahead.c
@@ -66,8 +66,8 @@ struct lookahead_ctx *vp8_lookahead_init(unsigned int width,
   depth += 1;
 
   /* Align the buffer dimensions */
-  width = (width + 15) & ~15;
-  height = (height + 15) & ~15;
+  width = (width + 15) & ~15u;
+  height = (height + 15) & ~15u;
 
   /* Allocate the lookahead structures */
   ctx = calloc(1, sizeof(*ctx));
diff --git a/vp8/encoder/loongarch/dct_lsx.c b/vp8/encoder/loongarch/dct_lsx.c
new file mode 100644
index 0000000000..a08d4d3f63
--- /dev/null
+++ b/vp8/encoder/loongarch/dct_lsx.c
@@ -0,0 +1,161 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdint.h>
+#include "./vp8_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+#define LSX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                            \
+    __m128i _s0, _s1, _s2, _s3, _t0, _t1, _t2, _t3;                            \
+                                                                               \
+    DUP2_ARG2(__lsx_vilvl_h, _in2, _in0, _in3, _in1, _s0, _s1);                \
+    DUP2_ARG2(__lsx_vilvh_h, _in2, _in0, _in3, _in1, _s2, _s3);                \
+    _t0 = __lsx_vilvl_h(_s1, _s0);                                             \
+    _t1 = __lsx_vilvh_h(_s1, _s0);                                             \
+    _t2 = __lsx_vilvl_h(_s3, _s2);                                             \
+    _t3 = __lsx_vilvh_h(_s3, _s2);                                             \
+    DUP2_ARG2(__lsx_vpickev_d, _t2, _t0, _t3, _t1, _out0, _out2);              \
+    DUP2_ARG2(__lsx_vpickod_d, _t2, _t0, _t3, _t1, _out1, _out3);              \
+  }
+
+#define SET_DOTP_VALUES(coeff, val0, val1, val2, const1, const2)           \
+  {                                                                        \
+    __m128i tmp0_m, tmp1_m, tmp2_m;                                        \
+                                                                           \
+    tmp0_m = __lsx_vreplvei_h(coeff, val0);                                \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff, val1, coeff, val2, tmp1_m, tmp2_m); \
+    DUP2_ARG2(__lsx_vpackev_h, tmp1_m, tmp0_m, tmp0_m, tmp2_m, const1,     \
+              const2);                                                     \
+  }
+
+#define RET_1_IF_NZERO_H(_in)           \
+  ({                                    \
+    __m128i tmp_m;                      \
+    __m128i one_m = __lsx_vldi(0x401);  \
+    __m128i max_m = __lsx_vldi(0xFF);   \
+                                        \
+    tmp_m = __lsx_vseqi_h(_in, 0);      \
+    tmp_m = __lsx_vxor_v(tmp_m, max_m); \
+    tmp_m = __lsx_vand_v(tmp_m, one_m); \
+                                        \
+    tmp_m;                              \
+  })
+
+void vp8_short_fdct4x4_lsx(int16_t *input, int16_t *output, int32_t pitch) {
+  __m128i in0, in1, in2, in3;
+  __m128i tmp0, tmp1, tmp2, tmp3, const0, const1;
+  __m128i coeff = { 0x38a4eb1814e808a9, 0x659061a82ee01d4c };
+  __m128i out0, out1, out2, out3;
+  __m128i zero = __lsx_vldi(0);
+  int32_t pitch2 = pitch << 1;
+  int32_t pitch3 = pitch2 + pitch;
+
+  in0 = __lsx_vld(input, 0);
+  DUP2_ARG2(__lsx_vldx, input, pitch, input, pitch2, in1, in2);
+  in3 = __lsx_vldx(input, pitch3);
+
+  LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+  LSX_BUTTERFLY_4_H(in0, in1, in2, in3, tmp0, tmp1, in1, in3);
+  DUP4_ARG2(__lsx_vslli_h, tmp0, 3, tmp1, 3, in1, 3, in3, 3, tmp0, tmp1, in1,
+            in3);
+  in0 = __lsx_vadd_h(tmp0, tmp1);
+  in2 = __lsx_vsub_h(tmp0, tmp1);
+  SET_DOTP_VALUES(coeff, 0, 1, 2, const0, const1);
+  tmp0 = __lsx_vilvl_h(in3, in1);
+  in1 = __lsx_vreplvei_h(coeff, 3);
+  out0 = __lsx_vpackev_h(zero, in1);
+  coeff = __lsx_vilvl_h(zero, coeff);
+  out1 = __lsx_vreplvei_w(coeff, 0);
+  DUP2_ARG3(__lsx_vdp2add_w_h, out0, tmp0, const0, out1, tmp0, const1, out0,
+            out1);
+  DUP2_ARG3(__lsx_vsrani_h_w, out0, out0, 12, out1, out1, 12, in1, in3);
+  LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+  LSX_BUTTERFLY_4_H(in0, in1, in2, in3, tmp0, tmp1, in1, in3);
+  tmp2 = __lsx_vadd_h(tmp0, tmp1);
+  tmp3 = __lsx_vsub_h(tmp0, tmp1);
+  DUP2_ARG2(__lsx_vaddi_hu, tmp2, 7, tmp3, 7, in0, in2);
+  DUP2_ARG2(__lsx_vsrai_h, in0, 4, in2, 4, in0, in2);
+  DUP2_ARG2(__lsx_vilvl_h, zero, in0, zero, in2, out0, out2);
+  tmp1 = RET_1_IF_NZERO_H(in3);
+  DUP2_ARG2(__lsx_vilvl_h, zero, tmp1, in3, in1, tmp1, tmp0);
+  DUP2_ARG2(__lsx_vreplvei_w, coeff, 2, coeff, 3, out3, out1);
+  out3 = __lsx_vadd_w(out3, out1);
+  out1 = __lsx_vreplvei_w(coeff, 1);
+  DUP2_ARG3(__lsx_vdp2add_w_h, out1, tmp0, const0, out3, tmp0, const1, out1,
+            out3);
+  DUP2_ARG2(__lsx_vsrai_w, out1, 16, out3, 16, out1, out3);
+  out1 = __lsx_vadd_w(out1, tmp1);
+  DUP2_ARG2(__lsx_vpickev_h, out1, out0, out3, out2, in0, in2);
+  __lsx_vst(in0, output, 0);
+  __lsx_vst(in2, output, 16);
+}
+
+void vp8_short_fdct8x4_lsx(int16_t *input, int16_t *output, int32_t pitch) {
+  __m128i in0, in1, in2, in3, temp0, temp1, tmp0, tmp1;
+  __m128i const0, const1, const2, vec0_w, vec1_w, vec2_w, vec3_w;
+  __m128i coeff = { 0x38a4eb1814e808a9, 0x659061a82ee01d4c };
+  __m128i zero = __lsx_vldi(0);
+  int32_t pitch2 = pitch << 1;
+  int32_t pitch3 = pitch2 + pitch;
+
+  in0 = __lsx_vld(input, 0);
+  DUP2_ARG2(__lsx_vldx, input, pitch, input, pitch2, in1, in2);
+  in3 = __lsx_vldx(input, pitch3);
+  LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+
+  LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, temp1, in1, in3);
+  DUP4_ARG2(__lsx_vslli_h, temp0, 3, temp1, 3, in1, 3, in3, 3, temp0, temp1,
+            in1, in3);
+  in0 = __lsx_vadd_h(temp0, temp1);
+  in2 = __lsx_vsub_h(temp0, temp1);
+  SET_DOTP_VALUES(coeff, 0, 1, 2, const1, const2);
+  temp0 = __lsx_vreplvei_h(coeff, 3);
+  vec1_w = __lsx_vpackev_h(zero, temp0);
+  coeff = __lsx_vilvh_h(zero, coeff);
+  vec3_w = __lsx_vreplvei_w(coeff, 0);
+  tmp1 = __lsx_vilvl_h(in3, in1);
+  tmp0 = __lsx_vilvh_h(in3, in1);
+  vec0_w = vec1_w;
+  vec2_w = vec3_w;
+  DUP4_ARG3(__lsx_vdp2add_w_h, vec0_w, tmp1, const1, vec1_w, tmp0, const1,
+            vec2_w, tmp1, const2, vec3_w, tmp0, const2, vec0_w, vec1_w, vec2_w,
+            vec3_w);
+  DUP2_ARG3(__lsx_vsrani_h_w, vec1_w, vec0_w, 12, vec3_w, vec2_w, 12, in1, in3);
+  LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+
+  LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, temp1, in1, in3);
+  in0 = __lsx_vadd_h(temp0, temp1);
+  in0 = __lsx_vaddi_hu(in0, 7);
+  in2 = __lsx_vsub_h(temp0, temp1);
+  in2 = __lsx_vaddi_hu(in2, 7);
+  in0 = __lsx_vsrai_h(in0, 4);
+  in2 = __lsx_vsrai_h(in2, 4);
+  DUP2_ARG2(__lsx_vreplvei_w, coeff, 2, coeff, 3, vec3_w, vec1_w);
+  vec3_w = __lsx_vadd_w(vec3_w, vec1_w);
+  vec1_w = __lsx_vreplvei_w(coeff, 1);
+  const0 = RET_1_IF_NZERO_H(in3);
+  tmp1 = __lsx_vilvl_h(in3, in1);
+  tmp0 = __lsx_vilvh_h(in3, in1);
+  vec0_w = vec1_w;
+  vec2_w = vec3_w;
+  DUP4_ARG3(__lsx_vdp2add_w_h, vec0_w, tmp1, const1, vec1_w, tmp0, const1,
+            vec2_w, tmp1, const2, vec3_w, tmp0, const2, vec0_w, vec1_w, vec2_w,
+            vec3_w);
+  DUP2_ARG3(__lsx_vsrani_h_w, vec1_w, vec0_w, 16, vec3_w, vec2_w, 16, in1, in3);
+  in1 = __lsx_vadd_h(in1, const0);
+  DUP2_ARG2(__lsx_vpickev_d, in1, in0, in3, in2, temp0, temp1);
+  __lsx_vst(temp0, output, 0);
+  __lsx_vst(temp1, output, 16);
+
+  DUP2_ARG2(__lsx_vpickod_d, in1, in0, in3, in2, in0, in2);
+  __lsx_vst(in0, output, 32);
+  __lsx_vst(in2, output, 48);
+}
diff --git a/vp8/encoder/loongarch/encodeopt_lsx.c b/vp8/encoder/loongarch/encodeopt_lsx.c
new file mode 100644
index 0000000000..4ad4caba60
--- /dev/null
+++ b/vp8/encoder/loongarch/encodeopt_lsx.c
@@ -0,0 +1,82 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+#include "vp8/encoder/block.h"
+
+int32_t vp8_block_error_lsx(int16_t *coeff_ptr, int16_t *dq_coeff_ptr) {
+  int32_t err = 0;
+  __m128i dq_coeff0, dq_coeff1, coeff0, coeff1;
+  __m128i reg0, reg1, reg2, reg3, error;
+
+  DUP4_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, dq_coeff_ptr, 0,
+            dq_coeff_ptr, 16, coeff0, coeff1, dq_coeff0, dq_coeff1);
+  DUP2_ARG2(__lsx_vsubwev_w_h, coeff0, dq_coeff0, coeff1, dq_coeff1, reg0,
+            reg2);
+  DUP2_ARG2(__lsx_vsubwod_w_h, coeff0, dq_coeff0, coeff1, dq_coeff1, reg1,
+            reg3);
+  error = __lsx_vmul_w(reg0, reg0);
+  DUP2_ARG3(__lsx_vmadd_w, error, reg1, reg1, error, reg2, reg2, error, error);
+  error = __lsx_vmadd_w(error, reg3, reg3);
+  error = __lsx_vhaddw_d_w(error, error);
+  err = __lsx_vpickve2gr_w(error, 0);
+  err += __lsx_vpickve2gr_w(error, 2);
+  return err;
+}
+
+int32_t vp8_mbblock_error_lsx(MACROBLOCK *mb, int32_t dc) {
+  BLOCK *be;
+  BLOCKD *bd;
+  int16_t *coeff, *dq_coeff;
+  int32_t err = 0;
+  uint32_t loop_cnt;
+  __m128i src0, src1, src2, src3;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, error;
+  __m128i mask0 = __lsx_vldi(0xFF);
+  __m128i zero = __lsx_vldi(0);
+
+  if (dc == 1) {
+    mask0 = __lsx_vinsgr2vr_w(mask0, 0, 0);
+  }
+
+  for (loop_cnt = 0; loop_cnt < 8; loop_cnt++) {
+    int32_t loop_tmp = loop_cnt << 1;
+    be = &mb->block[loop_tmp];
+    bd = &mb->e_mbd.block[loop_tmp];
+    coeff = be->coeff;
+    dq_coeff = bd->dqcoeff;
+    DUP4_ARG2(__lsx_vld, coeff, 0, coeff, 16, dq_coeff, 0, dq_coeff, 16, src0,
+              src1, tmp0, tmp1);
+    be = &mb->block[loop_tmp + 1];
+    bd = &mb->e_mbd.block[loop_tmp + 1];
+    coeff = be->coeff;
+    dq_coeff = bd->dqcoeff;
+    DUP4_ARG2(__lsx_vld, coeff, 0, coeff, 16, dq_coeff, 0, dq_coeff, 16, src2,
+              src3, tmp2, tmp3);
+    DUP4_ARG2(__lsx_vsubwev_w_h, src0, tmp0, src1, tmp1, src2, tmp2, src3, tmp3,
+              reg0, reg2, reg4, reg6);
+    DUP4_ARG2(__lsx_vsubwod_w_h, src0, tmp0, src1, tmp1, src2, tmp2, src3, tmp3,
+              reg1, reg3, reg5, reg7);
+    DUP2_ARG3(__lsx_vbitsel_v, zero, reg0, mask0, zero, reg4, mask0, reg0,
+              reg4);
+    error = __lsx_vmul_w(reg0, reg0);
+    DUP4_ARG3(__lsx_vmadd_w, error, reg1, reg1, error, reg2, reg2, error, reg3,
+              reg3, error, reg4, reg4, error, error, error, error);
+    DUP2_ARG3(__lsx_vmadd_w, error, reg5, reg5, error, reg6, reg6, error,
+              error);
+    error = __lsx_vmadd_w(error, reg7, reg7);
+    error = __lsx_vhaddw_d_w(error, error);
+    error = __lsx_vhaddw_q_d(error, error);
+    err += __lsx_vpickve2gr_w(error, 0);
+  }
+  return err;
+}
diff --git a/vp8/encoder/loongarch/vp8_quantize_lsx.c b/vp8/encoder/loongarch/vp8_quantize_lsx.c
new file mode 100644
index 0000000000..75889192a7
--- /dev/null
+++ b/vp8/encoder/loongarch/vp8_quantize_lsx.c
@@ -0,0 +1,145 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdint.h>
+#include "./vp8_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+#include "vp8/encoder/block.h"
+
+#define BOOST_QUANT1(_in0, _in1, _in2, _ui)               \
+  {                                                       \
+    if (boost_temp[0] <= __lsx_vpickve2gr_h(_in0, _ui)) { \
+      if (__lsx_vpickve2gr_h(_in1, _ui)) {                \
+        eob = _ui;                                        \
+        boost_temp = zbin_boost;                          \
+      } else {                                            \
+        boost_temp++;                                     \
+      }                                                   \
+    } else {                                              \
+      _in2 = __lsx_vinsgr2vr_h(_in2, 0, _ui);             \
+      boost_temp++;                                       \
+    }                                                     \
+  }
+
+#define BOOST_QUANT2(_in0, _in1, _in2, _ui)               \
+  {                                                       \
+    if (boost_temp[0] <= __lsx_vpickve2gr_h(_in0, _ui)) { \
+      if (__lsx_vpickve2gr_h(_in1, _ui)) {                \
+        eob = _ui + 8;                                    \
+        boost_temp = zbin_boost;                          \
+      } else {                                            \
+        boost_temp++;                                     \
+      }                                                   \
+    } else {                                              \
+      _in2 = __lsx_vinsgr2vr_h(_in2, 0, _ui);             \
+      boost_temp++;                                       \
+    }                                                     \
+  }
+
+static int8_t exact_regular_quantize_b_lsx(
+    int16_t *zbin_boost, int16_t *coeff_ptr, int16_t *zbin, int16_t *round,
+    int16_t *quant, int16_t *quant_shift, int16_t *de_quant, int16_t zbin_oq_in,
+    int16_t *q_coeff, int16_t *dq_coeff) {
+  int32_t eob;
+  int16_t *boost_temp = zbin_boost;
+  __m128i inv_zig_zag = { 0x0C07040206050100, 0x0F0E0A090D0B0803 };
+  __m128i sign_z0, sign_z1, q_coeff0, q_coeff1;
+  __m128i z_bin0, z_bin1, zbin_o_q, x0, x1, sign_x0, sign_x1, de_quant0,
+      de_quant1;
+  __m128i z0, z1, round0, round1, quant0, quant2;
+  __m128i inv_zig_zag0, inv_zig_zag1;
+  __m128i zigzag_mask0 = { 0x0008000400010000, 0x0006000300020005 };
+  __m128i zigzag_mask1 = { 0x000A000D000C0009, 0X000F000E000B0007 };
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  __m128i zero = __lsx_vldi(0);
+
+  zbin_o_q = __lsx_vreplgr2vr_h(zbin_oq_in);
+  inv_zig_zag0 = __lsx_vilvl_b(zero, inv_zig_zag);
+  inv_zig_zag1 = __lsx_vilvh_b(zero, inv_zig_zag);
+  eob = -1;
+  DUP4_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, round, 0, round, 16, tmp0,
+            tmp1, tmp2, tmp3);
+  DUP4_ARG3(__lsx_vshuf_h, zigzag_mask0, tmp1, tmp0, zigzag_mask1, tmp1, tmp0,
+            zigzag_mask0, tmp3, tmp2, zigzag_mask1, tmp3, tmp2, z0, z1, round0,
+            round1);
+  DUP4_ARG2(__lsx_vld, quant, 0, quant, 16, zbin, 0, zbin, 16, tmp0, tmp1, tmp2,
+            tmp3);
+  DUP4_ARG3(__lsx_vshuf_h, zigzag_mask0, tmp1, tmp0, zigzag_mask1, tmp1, tmp0,
+            zigzag_mask0, tmp3, tmp2, zigzag_mask1, tmp3, tmp2, quant0, quant2,
+            z_bin0, z_bin1);
+  DUP2_ARG2(__lsx_vsrai_h, z0, 15, z1, 15, sign_z0, sign_z1);
+  DUP2_ARG2(__lsx_vadda_h, z0, zero, z1, zero, x0, x1);
+  DUP2_ARG2(__lsx_vsub_h, x0, z_bin0, x1, z_bin1, z_bin0, z_bin1);
+  DUP2_ARG2(__lsx_vsub_h, z_bin0, zbin_o_q, z_bin1, zbin_o_q, z_bin0, z_bin1);
+  DUP2_ARG2(__lsx_vmulwev_w_h, quant0, round0, quant2, round1, tmp0, tmp2);
+  DUP2_ARG2(__lsx_vmulwod_w_h, quant0, round0, quant2, round1, tmp1, tmp3);
+  DUP2_ARG3(__lsx_vmaddwev_w_h, tmp0, quant0, x0, tmp2, quant2, x1, tmp0, tmp2);
+  DUP2_ARG3(__lsx_vmaddwod_w_h, tmp1, quant0, x0, tmp3, quant2, x1, tmp1, tmp3);
+  DUP2_ARG2(__lsx_vpackod_h, tmp1, tmp0, tmp3, tmp2, q_coeff0, q_coeff1);
+
+  DUP2_ARG2(__lsx_vld, quant_shift, 0, quant_shift, 16, tmp1, tmp3);
+  DUP2_ARG3(__lsx_vshuf_h, zigzag_mask0, tmp3, tmp1, zigzag_mask1, tmp3, tmp1,
+            quant0, quant2);
+  DUP2_ARG2(__lsx_vadd_h, x0, round0, x1, round1, x0, x1);
+  DUP2_ARG2(__lsx_vmulwev_w_h, quant0, q_coeff0, quant2, q_coeff1, tmp0, tmp2);
+  DUP2_ARG2(__lsx_vmulwod_w_h, quant0, q_coeff0, quant2, q_coeff1, tmp1, tmp3);
+  DUP2_ARG3(__lsx_vmaddwev_w_h, tmp0, quant0, x0, tmp2, quant2, x1, tmp0, tmp2);
+  DUP2_ARG3(__lsx_vmaddwod_w_h, tmp1, quant0, x0, tmp3, quant2, x1, tmp1, tmp3);
+  DUP2_ARG2(__lsx_vpackod_h, tmp1, tmp0, tmp3, tmp2, x0, x1);
+  DUP2_ARG2(__lsx_vxor_v, x0, sign_z0, x1, sign_z1, sign_x0, sign_x1);
+  DUP2_ARG2(__lsx_vsub_h, sign_x0, sign_z0, sign_x1, sign_z1, sign_x0, sign_x1);
+
+  BOOST_QUANT1(z_bin0, x0, sign_x0, 0);
+  BOOST_QUANT1(z_bin0, x0, sign_x0, 1);
+  BOOST_QUANT1(z_bin0, x0, sign_x0, 2);
+  BOOST_QUANT1(z_bin0, x0, sign_x0, 3);
+  BOOST_QUANT1(z_bin0, x0, sign_x0, 4);
+  BOOST_QUANT1(z_bin0, x0, sign_x0, 5);
+  BOOST_QUANT1(z_bin0, x0, sign_x0, 6);
+  BOOST_QUANT1(z_bin0, x0, sign_x0, 7);
+
+  BOOST_QUANT2(z_bin1, x1, sign_x1, 0);
+  BOOST_QUANT2(z_bin1, x1, sign_x1, 1);
+  BOOST_QUANT2(z_bin1, x1, sign_x1, 2);
+  BOOST_QUANT2(z_bin1, x1, sign_x1, 3);
+  BOOST_QUANT2(z_bin1, x1, sign_x1, 4);
+  BOOST_QUANT2(z_bin1, x1, sign_x1, 5);
+  BOOST_QUANT2(z_bin1, x1, sign_x1, 6);
+  BOOST_QUANT2(z_bin1, x1, sign_x1, 7);
+
+  DUP2_ARG2(__lsx_vld, de_quant, 0, de_quant, 16, de_quant0, de_quant1);
+  DUP2_ARG3(__lsx_vshuf_h, inv_zig_zag0, sign_x1, sign_x0, inv_zig_zag1,
+            sign_x1, sign_x0, q_coeff0, q_coeff1);
+  DUP2_ARG2(__lsx_vmul_h, de_quant0, q_coeff0, de_quant1, q_coeff1, de_quant0,
+            de_quant1);
+  __lsx_vst(q_coeff0, q_coeff, 0);
+  __lsx_vst(q_coeff1, q_coeff, 16);
+  __lsx_vst(de_quant0, dq_coeff, 0);
+  __lsx_vst(de_quant1, dq_coeff, 16);
+
+  return (int8_t)(eob + 1);
+}
+
+void vp8_regular_quantize_b_lsx(BLOCK *b, BLOCKD *d) {
+  int16_t *zbin_boost_ptr = b->zrun_zbin_boost;
+  int16_t *coeff_ptr = b->coeff;
+  int16_t *zbin_ptr = b->zbin;
+  int16_t *round_ptr = b->round;
+  int16_t *quant_ptr = b->quant;
+  int16_t *quant_shift_ptr = b->quant_shift;
+  int16_t *qcoeff_ptr = d->qcoeff;
+  int16_t *dqcoeff_ptr = d->dqcoeff;
+  int16_t *dequant_ptr = d->dequant;
+  int16_t zbin_oq_value = b->zbin_extra;
+
+  *d->eob = exact_regular_quantize_b_lsx(
+      zbin_boost_ptr, coeff_ptr, zbin_ptr, round_ptr, quant_ptr,
+      quant_shift_ptr, dequant_ptr, zbin_oq_value, qcoeff_ptr, dqcoeff_ptr);
+}
diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c
index 9e7f5c7ace..bc150e482b 100644
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -204,19 +204,22 @@ void vp8_init3smotion_compensation(MACROBLOCK *x, int stride) {
 /* returns distortion + motion vector cost */
 #define ERR(r, c) (MVC(r, c) + DIST(r, c))
 /* checks if (r,c) has better score than previous best */
-#define CHECK_BETTER(v, r, c)                           \
-  IFMVCV(r, c,                                          \
-         {                                              \
-           thismse = DIST(r, c);                        \
-           if ((v = (MVC(r, c) + thismse)) < besterr) { \
-             besterr = v;                               \
-             br = r;                                    \
-             bc = c;                                    \
-             *distortion = thismse;                     \
-             *sse1 = sse;                               \
-           }                                            \
-         },                                             \
-         v = UINT_MAX;)
+#define CHECK_BETTER(v, r, c)                          \
+  do {                                                 \
+    IFMVCV(                                            \
+        r, c,                                          \
+        {                                              \
+          thismse = DIST(r, c);                        \
+          if ((v = (MVC(r, c) + thismse)) < besterr) { \
+            besterr = v;                               \
+            br = r;                                    \
+            bc = c;                                    \
+            *distortion = thismse;                     \
+            *sse1 = sse;                               \
+          }                                            \
+        },                                             \
+        v = UINT_MAX;)                                 \
+  } while (0)
 
 int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
                                              int_mv *bestmv, int_mv *ref_mv,
@@ -800,13 +803,13 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
 }
 
 #define CHECK_BOUNDS(range)                    \
-  {                                            \
+  do {                                         \
     all_in = 1;                                \
     all_in &= ((br - range) >= x->mv_row_min); \
     all_in &= ((br + range) <= x->mv_row_max); \
     all_in &= ((bc - range) >= x->mv_col_min); \
     all_in &= ((bc + range) <= x->mv_col_max); \
-  }
+  } while (0)
 
 #define CHECK_POINT                                  \
   {                                                  \
@@ -817,7 +820,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
   }
 
 #define CHECK_BETTER                                                     \
-  {                                                                      \
+  do {                                                                   \
     if (thissad < bestsad) {                                             \
       thissad +=                                                         \
           mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit); \
@@ -826,7 +829,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
         best_site = i;                                                   \
       }                                                                  \
     }                                                                    \
-  }
+  } while (0)
 
 static const MV next_chkpts[6][3] = {
   { { -2, 0 }, { -1, -2 }, { 1, -2 } }, { { -1, -2 }, { 1, -2 }, { 2, 0 } },
@@ -901,7 +904,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
 #endif
 
   /* hex search */
-  CHECK_BOUNDS(2)
+  CHECK_BOUNDS(2);
 
   if (all_in) {
     for (i = 0; i < 6; ++i) {
@@ -910,7 +913,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
       this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) +
                     this_mv.as_mv.col;
       thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
-      CHECK_BETTER
+      CHECK_BETTER;
     }
   } else {
     for (i = 0; i < 6; ++i) {
@@ -920,7 +923,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
       this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) +
                     this_mv.as_mv.col;
       thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
-      CHECK_BETTER
+      CHECK_BETTER;
     }
   }
 
@@ -934,7 +937,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
 
   for (j = 1; j < hex_range; ++j) {
     best_site = -1;
-    CHECK_BOUNDS(2)
+    CHECK_BOUNDS(2);
 
     if (all_in) {
       for (i = 0; i < 3; ++i) {
@@ -943,7 +946,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
         this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) +
                       this_mv.as_mv.col;
         thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
-        CHECK_BETTER
+        CHECK_BETTER;
       }
     } else {
       for (i = 0; i < 3; ++i) {
@@ -953,7 +956,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
         this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) +
                       this_mv.as_mv.col;
         thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
-        CHECK_BETTER
+        CHECK_BETTER;
       }
     }
 
@@ -975,7 +978,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
 cal_neighbors:
   for (j = 0; j < dia_range; ++j) {
     best_site = -1;
-    CHECK_BOUNDS(1)
+    CHECK_BOUNDS(1);
 
     if (all_in) {
       for (i = 0; i < 4; ++i) {
@@ -984,7 +987,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
         this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) +
                       this_mv.as_mv.col;
         thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
-        CHECK_BETTER
+        CHECK_BETTER;
       }
     } else {
       for (i = 0; i < 4; ++i) {
@@ -994,7 +997,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
         this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) +
                       this_mv.as_mv.col;
         thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
-        CHECK_BETTER
+        CHECK_BETTER;
       }
     }
 
@@ -1120,14 +1123,14 @@ int vp8_diamond_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
     }
   }
 
-  this_mv.as_mv.row = best_mv->as_mv.row << 3;
-  this_mv.as_mv.col = best_mv->as_mv.col << 3;
+  this_mv.as_mv.row = best_mv->as_mv.row * 8;
+  this_mv.as_mv.col = best_mv->as_mv.col * 8;
 
   return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad) +
          mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
 }
 
-#if HAVE_SSE2 || HAVE_MSA
+#if HAVE_SSE2 || HAVE_MSA || HAVE_LSX
 int vp8_diamond_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
                              int_mv *best_mv, int search_param, int sad_per_bit,
                              int *num00, vp8_variance_fn_ptr_t *fn_ptr,
@@ -1276,99 +1279,12 @@ int vp8_diamond_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
   return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad) +
          mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
 }
-#endif  // HAVE_SSE2 || HAVE_MSA
-
-int vp8_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
-                          int sad_per_bit, int distance,
-                          vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2],
-                          int_mv *center_mv) {
-  unsigned char *what = (*(b->base_src) + b->src);
-  int what_stride = b->src_stride;
-  unsigned char *in_what;
-  int pre_stride = x->e_mbd.pre.y_stride;
-  unsigned char *base_pre = x->e_mbd.pre.y_buffer;
-  int in_what_stride = pre_stride;
-  int mv_stride = pre_stride;
-  unsigned char *bestaddress;
-  int_mv *best_mv = &d->bmi.mv;
-  int_mv this_mv;
-  unsigned int bestsad;
-  unsigned int thissad;
-  int r, c;
-
-  unsigned char *check_here;
-
-  int ref_row = ref_mv->as_mv.row;
-  int ref_col = ref_mv->as_mv.col;
-
-  int row_min = ref_row - distance;
-  int row_max = ref_row + distance;
-  int col_min = ref_col - distance;
-  int col_max = ref_col + distance;
-
-  int *mvsadcost[2];
-  int_mv fcenter_mv;
-
-  mvsadcost[0] = x->mvsadcost[0];
-  mvsadcost[1] = x->mvsadcost[1];
-  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
-  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
-  /* Work out the mid point for the search */
-  in_what = base_pre + d->offset;
-  bestaddress = in_what + (ref_row * pre_stride) + ref_col;
-
-  best_mv->as_mv.row = ref_row;
-  best_mv->as_mv.col = ref_col;
-
-  /* Baseline value at the centre */
-  bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride) +
-            mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
-
-  /* Apply further limits to prevent us looking using vectors that
-   * stretch beyiond the UMV border
-   */
-  if (col_min < x->mv_col_min) col_min = x->mv_col_min;
-
-  if (col_max > x->mv_col_max) col_max = x->mv_col_max;
-
-  if (row_min < x->mv_row_min) row_min = x->mv_row_min;
-
-  if (row_max > x->mv_row_max) row_max = x->mv_row_max;
-
-  for (r = row_min; r < row_max; ++r) {
-    this_mv.as_mv.row = r;
-    check_here = r * mv_stride + in_what + col_min;
-
-    for (c = col_min; c < col_max; ++c) {
-      thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
-
-      this_mv.as_mv.col = c;
-      thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit);
-
-      if (thissad < bestsad) {
-        bestsad = thissad;
-        best_mv->as_mv.row = r;
-        best_mv->as_mv.col = c;
-        bestaddress = check_here;
-      }
-
-      check_here++;
-    }
-  }
-
-  this_mv.as_mv.row = best_mv->as_mv.row << 3;
-  this_mv.as_mv.col = best_mv->as_mv.col << 3;
-
-  return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad) +
-         mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
-}
+#endif  // HAVE_SSE2 || HAVE_MSA || HAVE_LSX
 
-#if HAVE_SSSE3
-int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
-                          int sad_per_bit, int distance,
-                          vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2],
-                          int_mv *center_mv) {
+int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+                        int sad_per_bit, int distance,
+                        vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2],
+                        int_mv *center_mv) {
   unsigned char *what = (*(b->base_src) + b->src);
   int what_stride = b->src_stride;
   unsigned char *in_what;
@@ -1393,130 +1309,6 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
   int col_min = ref_col - distance;
   int col_max = ref_col + distance;
 
-  unsigned int sad_array[3];
-
-  int *mvsadcost[2];
-  int_mv fcenter_mv;
-
-  mvsadcost[0] = x->mvsadcost[0];
-  mvsadcost[1] = x->mvsadcost[1];
-  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
-  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
-  /* Work out the mid point for the search */
-  in_what = base_pre + d->offset;
-  bestaddress = in_what + (ref_row * pre_stride) + ref_col;
-
-  best_mv->as_mv.row = ref_row;
-  best_mv->as_mv.col = ref_col;
-
-  /* Baseline value at the centre */
-  bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride) +
-            mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
-
-  /* Apply further limits to prevent us looking using vectors that stretch
-   * beyond the UMV border
-   */
-  if (col_min < x->mv_col_min) col_min = x->mv_col_min;
-
-  if (col_max > x->mv_col_max) col_max = x->mv_col_max;
-
-  if (row_min < x->mv_row_min) row_min = x->mv_row_min;
-
-  if (row_max > x->mv_row_max) row_max = x->mv_row_max;
-
-  for (r = row_min; r < row_max; ++r) {
-    this_mv.as_mv.row = r;
-    check_here = r * mv_stride + in_what + col_min;
-    c = col_min;
-
-    while ((c + 2) < col_max) {
-      int i;
-
-      fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array);
-
-      for (i = 0; i < 3; ++i) {
-        thissad = sad_array[i];
-
-        if (thissad < bestsad) {
-          this_mv.as_mv.col = c;
-          thissad +=
-              mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit);
-
-          if (thissad < bestsad) {
-            bestsad = thissad;
-            best_mv->as_mv.row = r;
-            best_mv->as_mv.col = c;
-            bestaddress = check_here;
-          }
-        }
-
-        check_here++;
-        c++;
-      }
-    }
-
-    while (c < col_max) {
-      thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
-
-      if (thissad < bestsad) {
-        this_mv.as_mv.col = c;
-        thissad +=
-            mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit);
-
-        if (thissad < bestsad) {
-          bestsad = thissad;
-          best_mv->as_mv.row = r;
-          best_mv->as_mv.col = c;
-          bestaddress = check_here;
-        }
-      }
-
-      check_here++;
-      c++;
-    }
-  }
-
-  this_mv.as_mv.row = best_mv->as_mv.row << 3;
-  this_mv.as_mv.col = best_mv->as_mv.col << 3;
-
-  return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad) +
-         mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
-}
-#endif  // HAVE_SSSE3
-
-#if HAVE_SSE4_1
-int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
-                          int sad_per_bit, int distance,
-                          vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2],
-                          int_mv *center_mv) {
-  unsigned char *what = (*(b->base_src) + b->src);
-  int what_stride = b->src_stride;
-  int pre_stride = x->e_mbd.pre.y_stride;
-  unsigned char *base_pre = x->e_mbd.pre.y_buffer;
-  unsigned char *in_what;
-  int in_what_stride = pre_stride;
-  int mv_stride = pre_stride;
-  unsigned char *bestaddress;
-  int_mv *best_mv = &d->bmi.mv;
-  int_mv this_mv;
-  unsigned int bestsad;
-  unsigned int thissad;
-  int r, c;
-
-  unsigned char *check_here;
-
-  int ref_row = ref_mv->as_mv.row;
-  int ref_col = ref_mv->as_mv.col;
-
-  int row_min = ref_row - distance;
-  int row_max = ref_row + distance;
-  int col_min = ref_col - distance;
-  int col_max = ref_col + distance;
-
-  DECLARE_ALIGNED(16, unsigned int, sad_array8[8]);
-  unsigned int sad_array[3];
-
   int *mvsadcost[2];
   int_mv fcenter_mv;
 
@@ -1550,61 +1342,8 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
   for (r = row_min; r < row_max; ++r) {
     this_mv.as_mv.row = r;
     check_here = r * mv_stride + in_what + col_min;
-    c = col_min;
-
-    while ((c + 7) < col_max) {
-      int i;
-
-      fn_ptr->sdx8f(what, what_stride, check_here, in_what_stride, sad_array8);
 
-      for (i = 0; i < 8; ++i) {
-        thissad = sad_array8[i];
-
-        if (thissad < bestsad) {
-          this_mv.as_mv.col = c;
-          thissad +=
-              mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit);
-
-          if (thissad < bestsad) {
-            bestsad = thissad;
-            best_mv->as_mv.row = r;
-            best_mv->as_mv.col = c;
-            bestaddress = check_here;
-          }
-        }
-
-        check_here++;
-        c++;
-      }
-    }
-
-    while ((c + 2) < col_max) {
-      int i;
-
-      fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array);
-
-      for (i = 0; i < 3; ++i) {
-        thissad = sad_array[i];
-
-        if (thissad < bestsad) {
-          this_mv.as_mv.col = c;
-          thissad +=
-              mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit);
-
-          if (thissad < bestsad) {
-            bestsad = thissad;
-            best_mv->as_mv.row = r;
-            best_mv->as_mv.col = c;
-            bestaddress = check_here;
-          }
-        }
-
-        check_here++;
-        c++;
-      }
-    }
-
-    while (c < col_max) {
+    for (c = col_min; c < col_max; ++c) {
       thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
 
       if (thissad < bestsad) {
@@ -1621,7 +1360,6 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
       }
 
       check_here++;
-      c++;
     }
   }
 
@@ -1631,7 +1369,6 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
   return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad) +
          mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
 }
-#endif  // HAVE_SSE4_1
 
 int vp8_refining_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
                               int_mv *ref_mv, int error_per_bit,
@@ -1704,8 +1441,8 @@ int vp8_refining_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
     }
   }
 
-  this_mv.as_mv.row = ref_mv->as_mv.row << 3;
-  this_mv.as_mv.col = ref_mv->as_mv.col << 3;
+  this_mv.as_mv.row = ref_mv->as_mv.row * 8;
+  this_mv.as_mv.col = ref_mv->as_mv.col * 8;
 
   return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad) +
          mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
diff --git a/vp8/encoder/mcomp.h b/vp8/encoder/mcomp.h
index 57c18f523f..1ee6fe5dd6 100644
--- a/vp8/encoder/mcomp.h
+++ b/vp8/encoder/mcomp.h
@@ -50,10 +50,10 @@ fractional_mv_step_fp vp8_find_best_sub_pixel_step;
 fractional_mv_step_fp vp8_find_best_half_pixel_step;
 fractional_mv_step_fp vp8_skip_fractional_mv_step;
 
-typedef int (*vp8_full_search_fn_t)(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
-                                    int_mv *ref_mv, int sad_per_bit,
-                                    int distance, vp8_variance_fn_ptr_t *fn_ptr,
-                                    int *mvcost[2], int_mv *center_mv);
+int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+                        int sad_per_bit, int distance,
+                        vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2],
+                        int_mv *center_mv);
 
 typedef int (*vp8_refining_search_fn_t)(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
                                         int_mv *ref_mv, int sad_per_bit,
diff --git a/vp8/encoder/mips/mmi/dct_mmi.c b/vp8/encoder/mips/mmi/dct_mmi.c
index 1f60a692d0..0fd25fcda5 100644
--- a/vp8/encoder/mips/mmi/dct_mmi.c
+++ b/vp8/encoder/mips/mmi/dct_mmi.c
@@ -24,19 +24,19 @@
   "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp0]         \n\t" \
   "punpcklhw  %[ftmp9],   %[ftmp2],   %[ftmp0]         \n\t" \
   "pshufh     %[ftmp9],   %[ftmp9],   %[ftmp10]        \n\t" \
-  "or         %[ftmp5],   %[ftmp5],   %[ftmp9]         \n\t" \
+  "por        %[ftmp5],   %[ftmp5],   %[ftmp9]         \n\t" \
   "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp0]         \n\t" \
   "punpckhhw  %[ftmp9],   %[ftmp2],   %[ftmp0]         \n\t" \
   "pshufh     %[ftmp9],   %[ftmp9],   %[ftmp10]        \n\t" \
-  "or         %[ftmp6],   %[ftmp6],   %[ftmp9]         \n\t" \
+  "por        %[ftmp6],   %[ftmp6],   %[ftmp9]         \n\t" \
   "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp0]         \n\t" \
   "punpcklhw  %[ftmp9],   %[ftmp4],   %[ftmp0]         \n\t" \
   "pshufh     %[ftmp9],   %[ftmp9],   %[ftmp10]        \n\t" \
-  "or         %[ftmp7],   %[ftmp7],   %[ftmp9]         \n\t" \
+  "por        %[ftmp7],   %[ftmp7],   %[ftmp9]         \n\t" \
   "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp0]         \n\t" \
   "punpckhhw  %[ftmp9],   %[ftmp4],   %[ftmp0]         \n\t" \
   "pshufh     %[ftmp9],   %[ftmp9],   %[ftmp10]        \n\t" \
-  "or         %[ftmp8],   %[ftmp8],   %[ftmp9]         \n\t" \
+  "por        %[ftmp8],   %[ftmp8],   %[ftmp9]         \n\t" \
   "punpcklwd  %[ftmp1],   %[ftmp5],   %[ftmp7]         \n\t" \
   "punpckhwd  %[ftmp2],   %[ftmp5],   %[ftmp7]         \n\t" \
   "punpcklwd  %[ftmp3],   %[ftmp6],   %[ftmp8]         \n\t" \
@@ -46,6 +46,7 @@
 void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) {
   uint64_t tmp[1];
   int16_t *ip = input;
+  double ff_ph_op1, ff_ph_op3;
 
 #if _MIPS_SIM == _ABIO32
   register double ftmp0 asm("$f0");
@@ -83,14 +84,17 @@ void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) {
   DECLARE_ALIGNED(8, const uint64_t, ff_pw_51000) = { 0x0000c7380000c738ULL };
   DECLARE_ALIGNED(8, const uint64_t, ff_pw_14500) = { 0x000038a4000038a4ULL };
   DECLARE_ALIGNED(8, const uint64_t, ff_pw_7500) = { 0x00001d4c00001d4cULL };
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_op1) = { 0x14e808a914e808a9ULL };
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_op3) = { 0xeb1808a9eb1808a9ULL };
   DECLARE_ALIGNED(8, const uint64_t, ff_pw_5352) = { 0x000014e8000014e8ULL };
   DECLARE_ALIGNED(8, const uint64_t, ff_pw_2217) = { 0x000008a9000008a9ULL };
   DECLARE_ALIGNED(8, const uint64_t, ff_ph_8) = { 0x0008000800080008ULL };
 
+  /* clang-format off */
   __asm__ volatile (
-    "xor        %[ftmp0],   %[ftmp0],      %[ftmp0]         \n\t"
+    "dli        %[tmp0],    0x14e808a914e808a9              \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_op1]                    \n\t"
+    "dli        %[tmp0],    0xeb1808a9eb1808a9              \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_op3]                    \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],      %[ftmp0]         \n\t"
     "gsldlc1    %[ftmp1],   0x07(%[ip])                     \n\t"
     "gsldrc1    %[ftmp1],   0x00(%[ip])                     \n\t"
     MMI_ADDU(%[ip], %[ip], %[pitch])
@@ -129,7 +133,7 @@ void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) {
 
     // op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12
     MMI_LI(%[tmp0], 0x0c)
-    "mtc1       %[tmp0],    %[ftmp11]                       \n\t"
+    "dmtc1      %[tmp0],    %[ftmp11]                       \n\t"
     "ldc1       %[ftmp12],  %[ff_pw_14500]                  \n\t"
     "punpcklhw  %[ftmp9],   %[ftmp7],       %[ftmp8]        \n\t"
     "pmaddhw    %[ftmp5],   %[ftmp9],       %[ff_ph_op1]    \n\t"
@@ -169,7 +173,7 @@ void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) {
     "paddh      %[ftmp1],   %[ftmp1],       %[ftmp9]        \n\t"
     "paddh      %[ftmp2],   %[ftmp2],       %[ftmp9]        \n\t"
     MMI_LI(%[tmp0], 0x04)
-    "mtc1       %[tmp0],    %[ftmp9]                        \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                        \n\t"
     "psrah      %[ftmp1],   %[ftmp1],       %[ftmp9]        \n\t"
     "psrah      %[ftmp2],   %[ftmp2],       %[ftmp9]        \n\t"
 
@@ -211,15 +215,16 @@ void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) {
       [ftmp3] "=&f"(ftmp3), [ftmp4] "=&f"(ftmp4), [ftmp5] "=&f"(ftmp5),
       [ftmp6] "=&f"(ftmp6), [ftmp7] "=&f"(ftmp7), [ftmp8] "=&f"(ftmp8),
       [ftmp9] "=&f"(ftmp9), [ftmp10] "=&f"(ftmp10), [ftmp11] "=&f"(ftmp11),
-      [ftmp12] "=&f"(ftmp12), [tmp0] "=&r"(tmp[0]), [ip]"+&r"(ip)
+      [ftmp12] "=&f"(ftmp12), [tmp0] "=&r"(tmp[0]), [ip]"+&r"(ip),
+      [ff_ph_op1] "=&f"(ff_ph_op1), [ff_ph_op3] "=&f"(ff_ph_op3)
     : [ff_ph_01] "m"(ff_ph_01), [ff_ph_07] "m"(ff_ph_07),
-      [ff_ph_op1] "f"(ff_ph_op1), [ff_ph_op3] "f"(ff_ph_op3),
       [ff_pw_14500] "m"(ff_pw_14500), [ff_pw_7500] "m"(ff_pw_7500),
       [ff_pw_12000] "m"(ff_pw_12000), [ff_pw_51000] "m"(ff_pw_51000),
       [ff_pw_5352]"m"(ff_pw_5352), [ff_pw_2217]"m"(ff_pw_2217),
       [ff_ph_8]"m"(ff_ph_8), [pitch]"r"(pitch), [output] "r"(output)
     : "memory"
   );
+  /* clang-format on */
 }
 
 void vp8_short_fdct8x4_mmi(int16_t *input, int16_t *output, int pitch) {
@@ -228,17 +233,22 @@ void vp8_short_fdct8x4_mmi(int16_t *input, int16_t *output, int pitch) {
 }
 
 void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) {
-  double ftmp[13];
-  uint32_t tmp[1];
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_01) = { 0x0001000100010001ULL };
-  DECLARE_ALIGNED(8, const uint64_t, ff_pw_01) = { 0x0000000100000001ULL };
-  DECLARE_ALIGNED(8, const uint64_t, ff_pw_03) = { 0x0000000300000003ULL };
-  DECLARE_ALIGNED(8, const uint64_t, ff_pw_mask) = { 0x0001000000010000ULL };
+  double ftmp[13], ff_ph_01, ff_pw_01, ff_pw_03, ff_pw_mask;
+  uint64_t tmp[1];
 
+  /* clang-format off */
   __asm__ volatile (
+    "dli        %[tmp0],    0x0001000100010001                  \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_01]                         \n\t"
+    "dli        %[tmp0],    0x0000000100000001                  \n\t"
+    "dmtc1      %[tmp0],    %[ff_pw_01]                         \n\t"
+    "dli        %[tmp0],    0x0000000300000003                  \n\t"
+    "dmtc1      %[tmp0],    %[ff_pw_03]                         \n\t"
+    "dli        %[tmp0],    0x0001000000010000                  \n\t"
+    "dmtc1      %[tmp0],    %[ff_pw_mask]                       \n\t"
     MMI_LI(%[tmp0], 0x02)
-    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
-    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    "dmtc1      %[tmp0],    %[ftmp11]                           \n\t"
 
     "gsldlc1    %[ftmp1],   0x07(%[ip])                         \n\t"
     "gsldrc1    %[ftmp1],   0x00(%[ip])                         \n\t"
@@ -337,52 +347,52 @@ void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) {
     "psubw      %[ftmp4],   %[ftmp9],       %[ftmp10]           \n\t"
 
     MMI_LI(%[tmp0], 0x03)
-    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
+    "dmtc1      %[tmp0],    %[ftmp11]                           \n\t"
 
     "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp1]            \n\t"
-    "and        %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
+    "pand       %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
     "paddw      %[ftmp1],   %[ftmp1],       %[ftmp9]            \n\t"
     "paddw      %[ftmp1],   %[ftmp1],       %[ff_pw_03]         \n\t"
     "psraw      %[ftmp1],   %[ftmp1],       %[ftmp11]           \n\t"
 
     "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp2]            \n\t"
-    "and        %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
+    "pand       %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
     "paddw      %[ftmp2],   %[ftmp2],       %[ftmp9]            \n\t"
     "paddw      %[ftmp2],   %[ftmp2],       %[ff_pw_03]         \n\t"
     "psraw      %[ftmp2],   %[ftmp2],       %[ftmp11]           \n\t"
 
     "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp3]            \n\t"
-    "and        %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
+    "pand       %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
     "paddw      %[ftmp3],   %[ftmp3],       %[ftmp9]            \n\t"
     "paddw      %[ftmp3],   %[ftmp3],       %[ff_pw_03]         \n\t"
     "psraw      %[ftmp3],   %[ftmp3],       %[ftmp11]           \n\t"
 
     "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp4]            \n\t"
-    "and        %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
+    "pand       %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
     "paddw      %[ftmp4],   %[ftmp4],       %[ftmp9]            \n\t"
     "paddw      %[ftmp4],   %[ftmp4],       %[ff_pw_03]         \n\t"
     "psraw      %[ftmp4],   %[ftmp4],       %[ftmp11]           \n\t"
 
     "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp5]            \n\t"
-    "and        %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
+    "pand       %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
     "paddw      %[ftmp5],   %[ftmp5],       %[ftmp9]            \n\t"
     "paddw      %[ftmp5],   %[ftmp5],       %[ff_pw_03]         \n\t"
     "psraw      %[ftmp5],   %[ftmp5],       %[ftmp11]           \n\t"
 
     "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp6]            \n\t"
-    "and        %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
+    "pand       %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
     "paddw      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
     "paddw      %[ftmp6],   %[ftmp6],       %[ff_pw_03]         \n\t"
     "psraw      %[ftmp6],   %[ftmp6],       %[ftmp11]           \n\t"
 
     "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp7]            \n\t"
-    "and        %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
+    "pand       %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
     "paddw      %[ftmp7],   %[ftmp7],       %[ftmp9]            \n\t"
     "paddw      %[ftmp7],   %[ftmp7],       %[ff_pw_03]         \n\t"
     "psraw      %[ftmp7],   %[ftmp7],       %[ftmp11]           \n\t"
 
     "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp8]            \n\t"
-    "and        %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
+    "pand       %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
     "paddw      %[ftmp8],   %[ftmp8],       %[ftmp9]            \n\t"
     "paddw      %[ftmp8],   %[ftmp8],       %[ff_pw_03]         \n\t"
     "psraw      %[ftmp8],   %[ftmp8],       %[ftmp11]           \n\t"
@@ -393,7 +403,7 @@ void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) {
     "packsswh   %[ftmp4],   %[ftmp4],       %[ftmp8]            \n\t"
 
     MMI_LI(%[tmp0], 0x72)
-    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
+    "dmtc1      %[tmp0],    %[ftmp11]                           \n\t"
     "pshufh     %[ftmp1],   %[ftmp1],       %[ftmp11]           \n\t"
     "pshufh     %[ftmp2],   %[ftmp2],       %[ftmp11]           \n\t"
     "pshufh     %[ftmp3],   %[ftmp3],       %[ftmp11]           \n\t"
@@ -413,13 +423,12 @@ void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) {
       [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
       [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
       [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
-      [ftmp12]"=&f"(ftmp[12]),
-      [tmp0]"=&r"(tmp[0]),
-      [ip]"+&r"(input)
-    : [op]"r"(output),
-      [ff_pw_01]"f"(ff_pw_01),          [pitch]"r"((mips_reg)pitch),
-      [ff_pw_03]"f"(ff_pw_03),          [ff_pw_mask]"f"(ff_pw_mask),
-      [ff_ph_01]"f"(ff_ph_01)
+      [ftmp12]"=&f"(ftmp[12]),          [ff_pw_mask]"=&f"(ff_pw_mask),
+      [tmp0]"=&r"(tmp[0]),              [ff_pw_01]"=&f"(ff_pw_01),
+      [ip]"+&r"(input),                 [ff_pw_03]"=&f"(ff_pw_03),
+      [ff_ph_01]"=&f"(ff_ph_01)
+    : [op]"r"(output),                  [pitch]"r"((mips_reg)pitch)
     : "memory"
   );
+  /* clang-format on */
 }
diff --git a/vp8/encoder/mips/mmi/vp8_quantize_mmi.c b/vp8/encoder/mips/mmi/vp8_quantize_mmi.c
index 3ccb196ffb..1986444aa3 100644
--- a/vp8/encoder/mips/mmi/vp8_quantize_mmi.c
+++ b/vp8/encoder/mips/mmi/vp8_quantize_mmi.c
@@ -42,24 +42,25 @@ void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) {
 
   double ftmp[13];
   uint64_t tmp[1];
-  DECLARE_ALIGNED(8, const uint64_t, ones) = { 0xffffffffffffffffULL };
-  int eob = 0;
+  int64_t eob = 0;
+  double ones;
 
   __asm__ volatile(
       // loop 0 ~ 7
-      "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]        \n\t"
+      "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]        \n\t"
+      "pcmpeqh    %[ones],    %[ones],        %[ones]         \n\t"
       "gsldlc1    %[ftmp1],   0x07(%[coeff_ptr])              \n\t"
       "gsldrc1    %[ftmp1],   0x00(%[coeff_ptr])              \n\t"
-      "li         %[tmp0],    0x0f                            \n\t"
-      "mtc1       %[tmp0],    %[ftmp9]                        \n\t"
+      "dli        %[tmp0],    0x0f                            \n\t"
+      "dmtc1      %[tmp0],    %[ftmp9]                        \n\t"
       "gsldlc1    %[ftmp2],   0x0f(%[coeff_ptr])              \n\t"
       "gsldrc1    %[ftmp2],   0x08(%[coeff_ptr])              \n\t"
 
       "psrah      %[ftmp3],   %[ftmp1],       %[ftmp9]        \n\t"
-      "xor        %[ftmp1],   %[ftmp3],       %[ftmp1]        \n\t"
+      "pxor       %[ftmp1],   %[ftmp3],       %[ftmp1]        \n\t"
       "psubh      %[ftmp1],   %[ftmp1],       %[ftmp3]        \n\t"
       "psrah      %[ftmp4],   %[ftmp2],       %[ftmp9]        \n\t"
-      "xor        %[ftmp2],   %[ftmp4],       %[ftmp2]        \n\t"
+      "pxor       %[ftmp2],   %[ftmp4],       %[ftmp2]        \n\t"
       "psubh      %[ftmp2],   %[ftmp2],       %[ftmp4]        \n\t"
 
       "gsldlc1    %[ftmp5],   0x07(%[round_ptr])              \n\t"
@@ -75,8 +76,8 @@ void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) {
       "pmulhuh    %[ftmp5],   %[ftmp5],       %[ftmp7]        \n\t"
       "pmulhuh    %[ftmp6],   %[ftmp6],       %[ftmp8]        \n\t"
 
-      "xor        %[ftmp7],   %[ftmp5],       %[ftmp3]        \n\t"
-      "xor        %[ftmp8],   %[ftmp6],       %[ftmp4]        \n\t"
+      "pxor       %[ftmp7],   %[ftmp5],       %[ftmp3]        \n\t"
+      "pxor       %[ftmp8],   %[ftmp6],       %[ftmp4]        \n\t"
       "psubh      %[ftmp7],   %[ftmp7],       %[ftmp3]        \n\t"
       "psubh      %[ftmp8],   %[ftmp8],       %[ftmp4]        \n\t"
       "gssdlc1    %[ftmp7],   0x07(%[qcoeff_ptr])             \n\t"
@@ -90,10 +91,10 @@ void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) {
       "gsldrc1    %[ftmp2],   0x08(%[inv_zig_zag])            \n\t"
       "pcmpeqh    %[ftmp5],   %[ftmp5],       %[ftmp0]        \n\t"
       "pcmpeqh    %[ftmp6],   %[ftmp6],       %[ftmp0]        \n\t"
-      "xor        %[ftmp5],   %[ftmp5],       %[ones]         \n\t"
-      "xor        %[ftmp6],   %[ftmp6],       %[ones]         \n\t"
-      "and        %[ftmp5],   %[ftmp5],       %[ftmp1]        \n\t"
-      "and        %[ftmp6],   %[ftmp6],       %[ftmp2]        \n\t"
+      "pxor       %[ftmp5],   %[ftmp5],       %[ones]         \n\t"
+      "pxor       %[ftmp6],   %[ftmp6],       %[ones]         \n\t"
+      "pand       %[ftmp5],   %[ftmp5],       %[ftmp1]        \n\t"
+      "pand       %[ftmp6],   %[ftmp6],       %[ftmp2]        \n\t"
       "pmaxsh     %[ftmp10],  %[ftmp5],       %[ftmp6]        \n\t"
 
       "gsldlc1    %[ftmp5],   0x07(%[dequant_ptr])            \n\t"
@@ -114,10 +115,10 @@ void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) {
       "gsldrc1    %[ftmp2],   0x18(%[coeff_ptr])              \n\t"
 
       "psrah      %[ftmp3],   %[ftmp1],       %[ftmp9]        \n\t"
-      "xor        %[ftmp1],   %[ftmp3],       %[ftmp1]        \n\t"
+      "pxor       %[ftmp1],   %[ftmp3],       %[ftmp1]        \n\t"
       "psubh      %[ftmp1],   %[ftmp1],       %[ftmp3]        \n\t"
       "psrah      %[ftmp4],   %[ftmp2],       %[ftmp9]        \n\t"
-      "xor        %[ftmp2],   %[ftmp4],       %[ftmp2]        \n\t"
+      "pxor       %[ftmp2],   %[ftmp4],       %[ftmp2]        \n\t"
       "psubh      %[ftmp2],   %[ftmp2],       %[ftmp4]        \n\t"
 
       "gsldlc1    %[ftmp5],   0x17(%[round_ptr])              \n\t"
@@ -133,8 +134,8 @@ void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) {
       "pmulhuh    %[ftmp5],   %[ftmp5],       %[ftmp7]        \n\t"
       "pmulhuh    %[ftmp6],   %[ftmp6],       %[ftmp8]        \n\t"
 
-      "xor        %[ftmp7],   %[ftmp5],       %[ftmp3]        \n\t"
-      "xor        %[ftmp8],   %[ftmp6],       %[ftmp4]        \n\t"
+      "pxor       %[ftmp7],   %[ftmp5],       %[ftmp3]        \n\t"
+      "pxor       %[ftmp8],   %[ftmp6],       %[ftmp4]        \n\t"
       "psubh      %[ftmp7],   %[ftmp7],       %[ftmp3]        \n\t"
       "psubh      %[ftmp8],   %[ftmp8],       %[ftmp4]        \n\t"
       "gssdlc1    %[ftmp7],   0x17(%[qcoeff_ptr])             \n\t"
@@ -148,10 +149,10 @@ void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) {
       "gsldrc1    %[ftmp2],   0x18(%[inv_zig_zag])            \n\t"
       "pcmpeqh    %[ftmp5],   %[ftmp5],       %[ftmp0]        \n\t"
       "pcmpeqh    %[ftmp6],   %[ftmp6],       %[ftmp0]        \n\t"
-      "xor        %[ftmp5],   %[ftmp5],       %[ones]         \n\t"
-      "xor        %[ftmp6],   %[ftmp6],       %[ones]         \n\t"
-      "and        %[ftmp5],   %[ftmp5],       %[ftmp1]        \n\t"
-      "and        %[ftmp6],   %[ftmp6],       %[ftmp2]        \n\t"
+      "pxor       %[ftmp5],   %[ftmp5],       %[ones]         \n\t"
+      "pxor       %[ftmp6],   %[ftmp6],       %[ones]         \n\t"
+      "pand       %[ftmp5],   %[ftmp5],       %[ftmp1]        \n\t"
+      "pand       %[ftmp6],   %[ftmp6],       %[ftmp2]        \n\t"
       "pmaxsh     %[ftmp11],  %[ftmp5],       %[ftmp6]        \n\t"
 
       "gsldlc1    %[ftmp5],   0x17(%[dequant_ptr])            \n\t"
@@ -165,34 +166,34 @@ void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) {
       "gssdlc1    %[ftmp6],   0x1f(%[dqcoeff_ptr])            \n\t"
       "gssdrc1    %[ftmp6],   0x18(%[dqcoeff_ptr])            \n\t"
 
-      "li         %[tmp0],    0x10                            \n\t"
-      "mtc1       %[tmp0],    %[ftmp9]                        \n\t"
+      "dli        %[tmp0],    0x10                            \n\t"
+      "dmtc1      %[tmp0],    %[ftmp9]                        \n\t"
 
       "pmaxsh     %[ftmp10],  %[ftmp10],       %[ftmp11]      \n\t"
       "psrlw      %[ftmp11],  %[ftmp10],       %[ftmp9]       \n\t"
       "pmaxsh     %[ftmp10],  %[ftmp10],       %[ftmp11]      \n\t"
-      "li         %[tmp0],    0xaa                            \n\t"
-      "mtc1       %[tmp0],    %[ftmp9]                        \n\t"
+      "dli        %[tmp0],    0xaa                            \n\t"
+      "dmtc1      %[tmp0],    %[ftmp9]                        \n\t"
       "pshufh     %[ftmp11],  %[ftmp10],       %[ftmp9]       \n\t"
       "pmaxsh     %[ftmp10],  %[ftmp10],       %[ftmp11]      \n\t"
-      "li         %[tmp0],    0xffff                          \n\t"
-      "mtc1       %[tmp0],    %[ftmp9]                        \n\t"
-      "and        %[ftmp10],  %[ftmp10],       %[ftmp9]       \n\t"
+      "dli        %[tmp0],    0xffff                          \n\t"
+      "dmtc1      %[tmp0],    %[ftmp9]                        \n\t"
+      "pand       %[ftmp10],  %[ftmp10],       %[ftmp9]       \n\t"
       "gssdlc1    %[ftmp10],  0x07(%[eob])                    \n\t"
       "gssdrc1    %[ftmp10],  0x00(%[eob])                    \n\t"
       : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
         [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
         [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]),
         [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]),
-        [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
+        [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
+        [tmp0] "=&r"(tmp[0]), [ones] "=&f"(ones)
       : [coeff_ptr] "r"((mips_reg)coeff_ptr),
         [qcoeff_ptr] "r"((mips_reg)qcoeff_ptr),
         [dequant_ptr] "r"((mips_reg)dequant_ptr),
         [round_ptr] "r"((mips_reg)round_ptr),
         [quant_ptr] "r"((mips_reg)quant_ptr),
         [dqcoeff_ptr] "r"((mips_reg)dqcoeff_ptr),
-        [inv_zig_zag] "r"((mips_reg)inv_zig_zag), [eob] "r"((mips_reg)&eob),
-        [ones] "f"(ones)
+        [inv_zig_zag] "r"((mips_reg)inv_zig_zag), [eob] "r"((mips_reg)&eob)
       : "memory");
 
   *d->eob = eob;
@@ -217,7 +218,7 @@ void vp8_regular_quantize_b_mmi(BLOCK *b, BLOCKD *d) {
   //  memset(dqcoeff_ptr, 0, 32);
   /* clang-format off */
   __asm__ volatile (
-    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]        \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]        \n\t"
     "gssdlc1    %[ftmp0],   0x07(%[qcoeff_ptr])             \n\t"
     "gssdrc1    %[ftmp0],   0x00(%[qcoeff_ptr])             \n\t"
     "gssdlc1    %[ftmp0],   0x0f(%[qcoeff_ptr])             \n\t"
diff --git a/vp8/encoder/mr_dissim.c b/vp8/encoder/mr_dissim.c
index 011b62a08f..b1bfb4b54a 100644
--- a/vp8/encoder/mr_dissim.c
+++ b/vp8/encoder/mr_dissim.c
@@ -49,7 +49,6 @@ void vp8_cal_low_res_mb_cols(VP8_COMP *cpi) {
 
 void vp8_cal_dissimilarity(VP8_COMP *cpi) {
   VP8_COMMON *cm = &cpi->common;
-  int i;
 
   /* Note: The first row & first column in mip are outside the frame, which
    * were initialized to all 0.(ref_frame, mode, mv...)
@@ -67,6 +66,7 @@ void vp8_cal_dissimilarity(VP8_COMP *cpi) {
     store_info->frame_type = cm->frame_type;
 
     if (cm->frame_type != KEY_FRAME) {
+      int i;
       store_info->is_frame_dropped = 0;
       for (i = 1; i < MAX_REF_FRAMES; ++i)
         store_info->low_res_ref_frames[i] = cpi->current_ref_frames[i];
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index aeed719d1f..4e128e3c49 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -36,6 +36,7 @@
 #include "vp8/common/swapyv12buffer.h"
 #include "vp8/common/threading.h"
 #include "vpx_ports/system_state.h"
+#include "vpx_ports/vpx_once.h"
 #include "vpx_ports/vpx_timer.h"
 #include "vpx_util/vpx_write_yuv_frame.h"
 #if VPX_ARCH_ARM
@@ -183,7 +184,7 @@ static const unsigned char inter_minq[QINDEX_RANGE] = {
 extern FILE *vpxlogc;
 #endif
 
-static void save_layer_context(VP8_COMP *cpi) {
+void vp8_save_layer_context(VP8_COMP *cpi) {
   LAYER_CONTEXT *lc = &cpi->layer_context[cpi->current_layer];
 
   /* Save layer dependent coding state */
@@ -222,7 +223,7 @@ static void save_layer_context(VP8_COMP *cpi) {
          sizeof(cpi->mb.count_mb_ref_frame_usage));
 }
 
-static void restore_layer_context(VP8_COMP *cpi, const int layer) {
+void vp8_restore_layer_context(VP8_COMP *cpi, const int layer) {
   LAYER_CONTEXT *lc = &cpi->layer_context[layer];
 
   /* Restore layer dependent coding state */
@@ -269,9 +270,9 @@ static int rescale(int val, int num, int denom) {
   return (int)(llval * llnum / llden);
 }
 
-static void init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf,
-                                        const int layer,
-                                        double prev_layer_framerate) {
+void vp8_init_temporal_layer_context(VP8_COMP *cpi, const VP8_CONFIG *oxcf,
+                                     const int layer,
+                                     double prev_layer_framerate) {
   LAYER_CONTEXT *lc = &cpi->layer_context[layer];
 
   lc->framerate = cpi->output_framerate / cpi->oxcf.rate_decimator[layer];
@@ -301,9 +302,9 @@ static void init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf,
   /* Work out the average size of a frame within this layer */
   if (layer > 0) {
     lc->avg_frame_size_for_layer =
-        (int)((cpi->oxcf.target_bitrate[layer] -
-               cpi->oxcf.target_bitrate[layer - 1]) *
-              1000 / (lc->framerate - prev_layer_framerate));
+        (int)round((cpi->oxcf.target_bitrate[layer] -
+                    cpi->oxcf.target_bitrate[layer - 1]) *
+                   1000 / (lc->framerate - prev_layer_framerate));
   }
 
   lc->active_worst_quality = cpi->oxcf.worst_allowed_q;
@@ -327,8 +328,8 @@ static void init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf,
 // for any "new" layers. For "existing" layers, let them inherit the parameters
 // from the previous layer state (at the same layer #). In future we may want
 // to better map the previous layer state(s) to the "new" ones.
-static void reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf,
-                                        const int prev_num_layers) {
+void vp8_reset_temporal_layer_change(VP8_COMP *cpi, const VP8_CONFIG *oxcf,
+                                     const int prev_num_layers) {
   int i;
   double prev_layer_framerate = 0;
   const int curr_num_layers = cpi->oxcf.number_of_layers;
@@ -336,12 +337,12 @@ static void reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf,
   // We need this to set the layer context for the new layers below.
   if (prev_num_layers == 1) {
     cpi->current_layer = 0;
-    save_layer_context(cpi);
+    vp8_save_layer_context(cpi);
   }
   for (i = 0; i < curr_num_layers; ++i) {
     LAYER_CONTEXT *lc = &cpi->layer_context[i];
     if (i >= prev_num_layers) {
-      init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate);
+      vp8_init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate);
     }
     // The initial buffer levels are set based on their starting levels.
     // We could set the buffer levels based on the previous state (normalized
@@ -356,7 +357,7 @@ static void reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf,
     // state (to smooth-out quality dips/rate fluctuation at transition)?
 
     // We need to treat the 1 layer case separately: oxcf.target_bitrate[i]
-    // is not set for 1 layer, and the restore_layer_context/save_context()
+    // is not set for 1 layer, and the vp8_restore_layer_context/save_context()
     // are not called in the encoding loop, so we need to call it here to
     // pass the layer context state to |cpi|.
     if (curr_num_layers == 1) {
@@ -364,7 +365,7 @@ static void reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf,
       lc->buffer_level =
           cpi->oxcf.starting_buffer_level_in_ms * lc->target_bandwidth / 1000;
       lc->bits_off_target = lc->buffer_level;
-      restore_layer_context(cpi, 0);
+      vp8_restore_layer_context(cpi, 0);
     }
     prev_layer_framerate = cpi->output_framerate / cpi->oxcf.rate_decimator[i];
   }
@@ -394,16 +395,13 @@ static void setup_features(VP8_COMP *cpi) {
 
 static void dealloc_raw_frame_buffers(VP8_COMP *cpi);
 
-void vp8_initialize_enc(void) {
-  static volatile int init_done = 0;
-
-  if (!init_done) {
-    vpx_dsp_rtcd();
-    vp8_init_intra_predictors();
-    init_done = 1;
-  }
+static void initialize_enc(void) {
+  vpx_dsp_rtcd();
+  vp8_init_intra_predictors();
 }
 
+void vp8_initialize_enc(void) { once(initialize_enc); }
+
 static void dealloc_compressor_data(VP8_COMP *cpi) {
   vpx_free(cpi->tplist);
   cpi->tplist = NULL;
@@ -444,11 +442,6 @@ static void dealloc_compressor_data(VP8_COMP *cpi) {
 
   vpx_free(cpi->mb.pip);
   cpi->mb.pip = 0;
-
-#if CONFIG_MULTITHREAD
-  vpx_free(cpi->mt_current_mb_col);
-  cpi->mt_current_mb_col = NULL;
-#endif
 }
 
 static void enable_segmentation(VP8_COMP *cpi) {
@@ -490,7 +483,7 @@ static void set_segmentation_map(VP8_COMP *cpi,
  */
 static void set_segment_data(VP8_COMP *cpi, signed char *feature_data,
                              unsigned char abs_delta) {
-  cpi->mb.e_mbd.mb_segement_abs_delta = abs_delta;
+  cpi->mb.e_mbd.mb_segment_abs_delta = abs_delta;
   memcpy(cpi->segment_feature_data, feature_data,
          sizeof(cpi->segment_feature_data));
 }
@@ -1023,7 +1016,7 @@ void vp8_set_speed_features(VP8_COMP *cpi) {
 
       memset(cpi->mb.error_bins, 0, sizeof(cpi->mb.error_bins));
 
-  }; /* switch */
+  } /* switch */
 
   /* Slow quant, dct and trellis not worthwhile for first pass
    * so make sure they are always turned off.
@@ -1171,7 +1164,8 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) {
 #else
     unsigned int tokens = cm->mb_rows * cm->mb_cols * 24 * 16;
 #endif
-    CHECK_MEM_ERROR(cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok)));
+    CHECK_MEM_ERROR(&cpi->common.error, cpi->tok,
+                    vpx_calloc(tokens, sizeof(*cpi->tok)));
   }
 
   /* Data used for real time vc mode to see if gf needs refreshing */
@@ -1180,37 +1174,39 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) {
   /* Structures used to monitor GF usage */
   vpx_free(cpi->gf_active_flags);
   CHECK_MEM_ERROR(
-      cpi->gf_active_flags,
+      &cpi->common.error, cpi->gf_active_flags,
       vpx_calloc(sizeof(*cpi->gf_active_flags), cm->mb_rows * cm->mb_cols));
   cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
 
   vpx_free(cpi->mb_activity_map);
   CHECK_MEM_ERROR(
-      cpi->mb_activity_map,
+      &cpi->common.error, cpi->mb_activity_map,
       vpx_calloc(sizeof(*cpi->mb_activity_map), cm->mb_rows * cm->mb_cols));
 
   /* allocate memory for storing last frame's MVs for MV prediction. */
   vpx_free(cpi->lfmv);
-  CHECK_MEM_ERROR(cpi->lfmv, vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2),
-                                        sizeof(*cpi->lfmv)));
+  CHECK_MEM_ERROR(
+      &cpi->common.error, cpi->lfmv,
+      vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2), sizeof(*cpi->lfmv)));
   vpx_free(cpi->lf_ref_frame_sign_bias);
-  CHECK_MEM_ERROR(cpi->lf_ref_frame_sign_bias,
+  CHECK_MEM_ERROR(&cpi->common.error, cpi->lf_ref_frame_sign_bias,
                   vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2),
                              sizeof(*cpi->lf_ref_frame_sign_bias)));
   vpx_free(cpi->lf_ref_frame);
-  CHECK_MEM_ERROR(cpi->lf_ref_frame,
+  CHECK_MEM_ERROR(&cpi->common.error, cpi->lf_ref_frame,
                   vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2),
                              sizeof(*cpi->lf_ref_frame)));
 
   /* Create the encoder segmentation map and set all entries to 0 */
   vpx_free(cpi->segmentation_map);
   CHECK_MEM_ERROR(
-      cpi->segmentation_map,
+      &cpi->common.error, cpi->segmentation_map,
       vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(*cpi->segmentation_map)));
   cpi->cyclic_refresh_mode_index = 0;
   vpx_free(cpi->active_map);
-  CHECK_MEM_ERROR(cpi->active_map, vpx_calloc(cm->mb_rows * cm->mb_cols,
-                                              sizeof(*cpi->active_map)));
+  CHECK_MEM_ERROR(
+      &cpi->common.error, cpi->active_map,
+      vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(*cpi->active_map)));
   memset(cpi->active_map, 1, (cm->mb_rows * cm->mb_cols));
 
 #if CONFIG_MULTITHREAD
@@ -1223,21 +1219,11 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) {
   } else {
     cpi->mt_sync_range = 16;
   }
-
-  if (cpi->oxcf.multi_threaded > 1) {
-    int i;
-
-    vpx_free(cpi->mt_current_mb_col);
-    CHECK_MEM_ERROR(cpi->mt_current_mb_col,
-                    vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows));
-    for (i = 0; i < cm->mb_rows; ++i)
-      vpx_atomic_init(&cpi->mt_current_mb_col[i], 0);
-  }
-
 #endif
 
   vpx_free(cpi->tplist);
-  CHECK_MEM_ERROR(cpi->tplist, vpx_malloc(sizeof(TOKENLIST) * cm->mb_rows));
+  CHECK_MEM_ERROR(&cpi->common.error, cpi->tplist,
+                  vpx_malloc(sizeof(TOKENLIST) * cm->mb_rows));
 
 #if CONFIG_TEMPORAL_DENOISING
   if (cpi->oxcf.noise_sensitivity > 0) {
@@ -1274,7 +1260,7 @@ void vp8_new_framerate(VP8_COMP *cpi, double framerate) {
   cpi->framerate = framerate;
   cpi->output_framerate = framerate;
   cpi->per_frame_bandwidth =
-      (int)(cpi->oxcf.target_bandwidth / cpi->output_framerate);
+      (int)round(cpi->oxcf.target_bandwidth / cpi->output_framerate);
   cpi->av_per_frame_bandwidth = cpi->per_frame_bandwidth;
   cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth *
                                    cpi->oxcf.two_pass_vbrmin_section / 100);
@@ -1304,7 +1290,7 @@ void vp8_new_framerate(VP8_COMP *cpi, double framerate) {
   }
 }
 
-static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) {
+static void init_config(VP8_COMP *cpi, const VP8_CONFIG *oxcf) {
   VP8_COMMON *cm = &cpi->common;
 
   cpi->oxcf = *oxcf;
@@ -1365,7 +1351,7 @@ static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) {
     double prev_layer_framerate = 0;
 
     for (i = 0; i < cpi->oxcf.number_of_layers; ++i) {
-      init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate);
+      vp8_init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate);
       prev_layer_framerate =
           cpi->output_framerate / cpi->oxcf.rate_decimator[i];
     }
@@ -1382,7 +1368,7 @@ static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) {
 #endif
 }
 
-static void update_layer_contexts(VP8_COMP *cpi) {
+void vp8_update_layer_contexts(VP8_COMP *cpi) {
   VP8_CONFIG *oxcf = &cpi->oxcf;
 
   /* Update snapshots of the layer contexts to reflect new parameters */
@@ -1417,8 +1403,8 @@ static void update_layer_contexts(VP8_COMP *cpi) {
       /* Work out the average size of a frame within this layer */
       if (i > 0) {
         lc->avg_frame_size_for_layer =
-            (int)((oxcf->target_bitrate[i] - oxcf->target_bitrate[i - 1]) *
-                  1000 / (lc->framerate - prev_layer_framerate));
+            (int)round((oxcf->target_bitrate[i] - oxcf->target_bitrate[i - 1]) *
+                       1000 / (lc->framerate - prev_layer_framerate));
       }
 
       prev_layer_framerate = lc->framerate;
@@ -1426,11 +1412,11 @@ static void update_layer_contexts(VP8_COMP *cpi) {
   }
 }
 
-void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) {
+void vp8_change_config(VP8_COMP *cpi, const VP8_CONFIG *oxcf) {
   VP8_COMMON *cm = &cpi->common;
   int last_w, last_h;
   unsigned int prev_number_of_layers;
-  unsigned int raw_target_rate;
+  double raw_target_rate;
 
   if (!cpi) return;
 
@@ -1571,10 +1557,10 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) {
     cpi->oxcf.maximum_buffer_size_in_ms = 240000;
   }
 
-  raw_target_rate = (unsigned int)((int64_t)cpi->oxcf.Width * cpi->oxcf.Height *
-                                   8 * 3 * cpi->framerate / 1000);
+  raw_target_rate = ((int64_t)cpi->oxcf.Width * cpi->oxcf.Height * 8 * 3 *
+                     cpi->framerate / 1000.0);
   if (cpi->oxcf.target_bandwidth > raw_target_rate)
-    cpi->oxcf.target_bandwidth = raw_target_rate;
+    cpi->oxcf.target_bandwidth = (unsigned int)raw_target_rate;
   /* Convert target bandwidth from Kbit/s to Bit/s */
   cpi->oxcf.target_bandwidth *= 1000;
 
@@ -1645,7 +1631,7 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) {
       cpi->temporal_layer_id = 0;
     }
     cpi->temporal_pattern_counter = 0;
-    reset_temporal_layer_change(cpi, oxcf, prev_number_of_layers);
+    vp8_reset_temporal_layer_change(cpi, oxcf, prev_number_of_layers);
   }
 
   if (!cpi->initial_width) {
@@ -1669,7 +1655,7 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) {
 
   cm->sharpness_level = cpi->oxcf.Sharpness;
 
-  if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL) {
+  if (cm->horiz_scale != VP8E_NORMAL || cm->vert_scale != VP8E_NORMAL) {
     int hr, hs, vr, vs;
 
     Scale2Ratio(cm->horiz_scale, &hr, &hs);
@@ -1753,7 +1739,7 @@ static void cal_mvsadcosts(int *mvsadcost[2]) {
   } while (++i <= mvfp_max);
 }
 
-struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) {
+struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf) {
   int i;
 
   VP8_COMP *cpi;
@@ -1775,8 +1761,9 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) {
 
   cpi->common.error.setjmp = 1;
 
-  CHECK_MEM_ERROR(cpi->mb.ss, vpx_calloc(sizeof(search_site),
-                                         (MAX_MVSEARCH_STEPS * 8) + 1));
+  CHECK_MEM_ERROR(
+      &cpi->common.error, cpi->mb.ss,
+      vpx_calloc(sizeof(search_site), (MAX_MVSEARCH_STEPS * 8) + 1));
 
   vp8_create_common(&cpi->common);
 
@@ -1881,18 +1868,19 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) {
   }
 
   if (cpi->cyclic_refresh_mode_enabled) {
-    CHECK_MEM_ERROR(cpi->cyclic_refresh_map,
+    CHECK_MEM_ERROR(&cpi->common.error, cpi->cyclic_refresh_map,
                     vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
   } else {
     cpi->cyclic_refresh_map = (signed char *)NULL;
   }
 
-  CHECK_MEM_ERROR(cpi->skin_map, vpx_calloc(cm->mb_rows * cm->mb_cols,
-                                            sizeof(cpi->skin_map[0])));
+  CHECK_MEM_ERROR(
+      &cpi->common.error, cpi->skin_map,
+      vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(cpi->skin_map[0])));
 
-  CHECK_MEM_ERROR(cpi->consec_zero_last,
+  CHECK_MEM_ERROR(&cpi->common.error, cpi->consec_zero_last,
                   vpx_calloc(cm->mb_rows * cm->mb_cols, 1));
-  CHECK_MEM_ERROR(cpi->consec_zero_last_mvbias,
+  CHECK_MEM_ERROR(&cpi->common.error, cpi->consec_zero_last_mvbias,
                   vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
 
   /*Initialize the feed-forward activity masking.*/
@@ -1910,6 +1898,8 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) {
 
   cpi->force_maxqp = 0;
   cpi->frames_since_last_drop_overshoot = 0;
+  cpi->rt_always_update_correction_factor = 0;
+  cpi->rt_drop_recode_on_overshoot = 1;
 
   cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
 #if CONFIG_INTERNAL_STATS
@@ -2013,36 +2003,26 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) {
   cpi->fn_ptr[BLOCK_16X16].sdf = vpx_sad16x16;
   cpi->fn_ptr[BLOCK_16X16].vf = vpx_variance16x16;
   cpi->fn_ptr[BLOCK_16X16].svf = vpx_sub_pixel_variance16x16;
-  cpi->fn_ptr[BLOCK_16X16].sdx3f = vpx_sad16x16x3;
-  cpi->fn_ptr[BLOCK_16X16].sdx8f = vpx_sad16x16x8;
   cpi->fn_ptr[BLOCK_16X16].sdx4df = vpx_sad16x16x4d;
 
   cpi->fn_ptr[BLOCK_16X8].sdf = vpx_sad16x8;
   cpi->fn_ptr[BLOCK_16X8].vf = vpx_variance16x8;
   cpi->fn_ptr[BLOCK_16X8].svf = vpx_sub_pixel_variance16x8;
-  cpi->fn_ptr[BLOCK_16X8].sdx3f = vpx_sad16x8x3;
-  cpi->fn_ptr[BLOCK_16X8].sdx8f = vpx_sad16x8x8;
   cpi->fn_ptr[BLOCK_16X8].sdx4df = vpx_sad16x8x4d;
 
   cpi->fn_ptr[BLOCK_8X16].sdf = vpx_sad8x16;
   cpi->fn_ptr[BLOCK_8X16].vf = vpx_variance8x16;
   cpi->fn_ptr[BLOCK_8X16].svf = vpx_sub_pixel_variance8x16;
-  cpi->fn_ptr[BLOCK_8X16].sdx3f = vpx_sad8x16x3;
-  cpi->fn_ptr[BLOCK_8X16].sdx8f = vpx_sad8x16x8;
   cpi->fn_ptr[BLOCK_8X16].sdx4df = vpx_sad8x16x4d;
 
   cpi->fn_ptr[BLOCK_8X8].sdf = vpx_sad8x8;
   cpi->fn_ptr[BLOCK_8X8].vf = vpx_variance8x8;
   cpi->fn_ptr[BLOCK_8X8].svf = vpx_sub_pixel_variance8x8;
-  cpi->fn_ptr[BLOCK_8X8].sdx3f = vpx_sad8x8x3;
-  cpi->fn_ptr[BLOCK_8X8].sdx8f = vpx_sad8x8x8;
   cpi->fn_ptr[BLOCK_8X8].sdx4df = vpx_sad8x8x4d;
 
   cpi->fn_ptr[BLOCK_4X4].sdf = vpx_sad4x4;
   cpi->fn_ptr[BLOCK_4X4].vf = vpx_variance4x4;
   cpi->fn_ptr[BLOCK_4X4].svf = vpx_sub_pixel_variance4x4;
-  cpi->fn_ptr[BLOCK_4X4].sdx3f = vpx_sad4x4x3;
-  cpi->fn_ptr[BLOCK_4X4].sdx8f = vpx_sad4x4x8;
   cpi->fn_ptr[BLOCK_4X4].sdx4df = vpx_sad4x4x4d;
 
 #if VPX_ARCH_X86 || VPX_ARCH_X86_64
@@ -2053,7 +2033,6 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) {
   cpi->fn_ptr[BLOCK_4X4].copymem = vp8_copy32xn;
 #endif
 
-  cpi->full_search_sad = vp8_full_search_sad;
   cpi->diamond_search_sad = vp8_diamond_search_sad;
   cpi->refining_search_sad = vp8_refining_search_sad;
 
@@ -2121,7 +2100,6 @@ void vp8_remove_compressor(VP8_COMP **comp) {
       double time_encoded =
           (cpi->last_end_time_stamp_seen - cpi->first_time_stamp_ever) /
           10000000.000;
-      double dr = (double)cpi->bytes * 8.0 / 1000.0 / time_encoded;
 
       if (cpi->b_calculate_psnr) {
         if (cpi->oxcf.number_of_layers > 1) {
@@ -2150,6 +2128,7 @@ void vp8_remove_compressor(VP8_COMP **comp) {
                     total_psnr2, total_ssim);
           }
         } else {
+          double dr = (double)cpi->bytes * 8.0 / 1000.0 / time_encoded;
           double samples =
               3.0 / 2 * cpi->count * cpi->common.Width * cpi->common.Height;
           double total_psnr =
@@ -2257,7 +2236,7 @@ void vp8_remove_compressor(VP8_COMP **comp) {
 #if 0
         {
             printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000);
-            printf("\n_frames recive_data encod_mb_row compress_frame  Total\n");
+            printf("\n_frames receive_data encod_mb_row compress_frame  Total\n");
             printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, cpi->time_receive_data / 1000, cpi->time_encode_mb_row / 1000, cpi->time_compress_data / 1000, (cpi->time_receive_data + cpi->time_compress_data) / 1000);
         }
 #endif
@@ -2516,15 +2495,17 @@ static int resize_key_frame(VP8_COMP *cpi) {
     if (cpi->buffer_level < (cpi->oxcf.resample_down_water_mark *
                              cpi->oxcf.optimal_buffer_level / 100)) {
       cm->horiz_scale =
-          (cm->horiz_scale < ONETWO) ? cm->horiz_scale + 1 : ONETWO;
-      cm->vert_scale = (cm->vert_scale < ONETWO) ? cm->vert_scale + 1 : ONETWO;
+          (cm->horiz_scale < VP8E_ONETWO) ? cm->horiz_scale + 1 : VP8E_ONETWO;
+      cm->vert_scale =
+          (cm->vert_scale < VP8E_ONETWO) ? cm->vert_scale + 1 : VP8E_ONETWO;
     }
     /* Should we now start scaling back up */
     else if (cpi->buffer_level > (cpi->oxcf.resample_up_water_mark *
                                   cpi->oxcf.optimal_buffer_level / 100)) {
       cm->horiz_scale =
-          (cm->horiz_scale > NORMAL) ? cm->horiz_scale - 1 : NORMAL;
-      cm->vert_scale = (cm->vert_scale > NORMAL) ? cm->vert_scale - 1 : NORMAL;
+          (cm->horiz_scale > VP8E_NORMAL) ? cm->horiz_scale - 1 : VP8E_NORMAL;
+      cm->vert_scale =
+          (cm->vert_scale > VP8E_NORMAL) ? cm->vert_scale - 1 : VP8E_NORMAL;
     }
 
     /* Get the new height and width */
@@ -2755,7 +2736,7 @@ static int decide_key_frame(VP8_COMP *cpi) {
   }
   /* in addition if the following are true and this is not a golden frame
    * then code a key frame Note that on golden frames there often seems
-   * to be a pop in intra useage anyway hence this restriction is
+   * to be a pop in intra usage anyway hence this restriction is
    * designed to prevent spurious key frames. The Intra pop needs to be
    * investigated.
    */
@@ -3203,6 +3184,113 @@ void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm) {
 
   vp8_yv12_extend_frame_borders(cm->frame_to_show);
 }
+// Return 1 if frame is to be dropped. Update frame drop decimation
+// counters.
+int vp8_check_drop_buffer(VP8_COMP *cpi) {
+  VP8_COMMON *cm = &cpi->common;
+  int drop_mark = (int)(cpi->oxcf.drop_frames_water_mark *
+                        cpi->oxcf.optimal_buffer_level / 100);
+  int drop_mark75 = drop_mark * 2 / 3;
+  int drop_mark50 = drop_mark / 4;
+  int drop_mark25 = drop_mark / 8;
+  if (cpi->drop_frames_allowed) {
+    /* The reset to decimation 0 is only done here for one pass.
+     * Once it is set two pass leaves decimation on till the next kf.
+     */
+    if (cpi->buffer_level > drop_mark && cpi->decimation_factor > 0) {
+      cpi->decimation_factor--;
+    }
+
+    if (cpi->buffer_level > drop_mark75 && cpi->decimation_factor > 0) {
+      cpi->decimation_factor = 1;
+
+    } else if (cpi->buffer_level < drop_mark25 &&
+               (cpi->decimation_factor == 2 || cpi->decimation_factor == 3)) {
+      cpi->decimation_factor = 3;
+    } else if (cpi->buffer_level < drop_mark50 &&
+               (cpi->decimation_factor == 1 || cpi->decimation_factor == 2)) {
+      cpi->decimation_factor = 2;
+    } else if (cpi->buffer_level < drop_mark75 &&
+               (cpi->decimation_factor == 0 || cpi->decimation_factor == 1)) {
+      cpi->decimation_factor = 1;
+    }
+  }
+
+  /* The following decimates the frame rate according to a regular
+   * pattern (i.e. to 1/2 or 2/3 frame rate) This can be used to help
+   * prevent buffer under-run in CBR mode. Alternatively it might be
+   * desirable in some situations to drop frame rate but throw more bits
+   * at each frame.
+   *
+   * Note that dropping a key frame can be problematic if spatial
+   * resampling is also active
+   */
+  if (cpi->decimation_factor > 0 && cpi->drop_frames_allowed) {
+    switch (cpi->decimation_factor) {
+      case 1:
+        cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 3 / 2;
+        break;
+      case 2:
+        cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4;
+        break;
+      case 3:
+        cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4;
+        break;
+    }
+
+    /* Note that we should not throw out a key frame (especially when
+     * spatial resampling is enabled).
+     */
+    if (cm->frame_type == KEY_FRAME) {
+      cpi->decimation_count = cpi->decimation_factor;
+    } else if (cpi->decimation_count > 0) {
+      cpi->decimation_count--;
+
+      cpi->bits_off_target += cpi->av_per_frame_bandwidth;
+      if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size) {
+        cpi->bits_off_target = cpi->oxcf.maximum_buffer_size;
+      }
+
+#if CONFIG_MULTI_RES_ENCODING
+      vp8_store_drop_frame_info(cpi);
+#endif
+
+      cm->current_video_frame++;
+      cpi->frames_since_key++;
+      cpi->ext_refresh_frame_flags_pending = 0;
+      // We advance the temporal pattern for dropped frames.
+      cpi->temporal_pattern_counter++;
+
+#if CONFIG_INTERNAL_STATS
+      cpi->count++;
+#endif
+
+      cpi->buffer_level = cpi->bits_off_target;
+
+      if (cpi->oxcf.number_of_layers > 1) {
+        unsigned int i;
+
+        /* Propagate bits saved by dropping the frame to higher
+         * layers
+         */
+        for (i = cpi->current_layer + 1; i < cpi->oxcf.number_of_layers; ++i) {
+          LAYER_CONTEXT *lc = &cpi->layer_context[i];
+          lc->bits_off_target += (int)(lc->target_bandwidth / lc->framerate);
+          if (lc->bits_off_target > lc->maximum_buffer_size) {
+            lc->bits_off_target = lc->maximum_buffer_size;
+          }
+          lc->buffer_level = lc->bits_off_target;
+        }
+      }
+      return 1;
+    } else {
+      cpi->decimation_count = cpi->decimation_factor;
+    }
+  } else {
+    cpi->decimation_count = 0;
+  }
+  return 0;
+}
 
 static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
                                       unsigned char *dest,
@@ -3213,7 +3301,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
   int frame_under_shoot_limit;
 
   int Loop = 0;
-  int loop_count;
 
   VP8_COMMON *cm = &cpi->common;
   int active_worst_qchanged = 0;
@@ -3229,12 +3316,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
   int undershoot_seen = 0;
 #endif
 
-  int drop_mark = (int)(cpi->oxcf.drop_frames_water_mark *
-                        cpi->oxcf.optimal_buffer_level / 100);
-  int drop_mark75 = drop_mark * 2 / 3;
-  int drop_mark50 = drop_mark / 4;
-  int drop_mark25 = drop_mark / 8;
-
   /* Clear down mmx registers to allow floating point in what follows */
   vpx_clear_system_state();
 
@@ -3260,7 +3341,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
 #endif  // !CONFIG_REALTIME_ONLY
     default:
       cpi->per_frame_bandwidth =
-          (int)(cpi->target_bandwidth / cpi->output_framerate);
+          (int)round(cpi->target_bandwidth / cpi->output_framerate);
       break;
   }
 
@@ -3448,102 +3529,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
 
   update_rd_ref_frame_probs(cpi);
 
-  if (cpi->drop_frames_allowed) {
-    /* The reset to decimation 0 is only done here for one pass.
-     * Once it is set two pass leaves decimation on till the next kf.
-     */
-    if ((cpi->buffer_level > drop_mark) && (cpi->decimation_factor > 0)) {
-      cpi->decimation_factor--;
-    }
-
-    if (cpi->buffer_level > drop_mark75 && cpi->decimation_factor > 0) {
-      cpi->decimation_factor = 1;
-
-    } else if (cpi->buffer_level < drop_mark25 &&
-               (cpi->decimation_factor == 2 || cpi->decimation_factor == 3)) {
-      cpi->decimation_factor = 3;
-    } else if (cpi->buffer_level < drop_mark50 &&
-               (cpi->decimation_factor == 1 || cpi->decimation_factor == 2)) {
-      cpi->decimation_factor = 2;
-    } else if (cpi->buffer_level < drop_mark75 &&
-               (cpi->decimation_factor == 0 || cpi->decimation_factor == 1)) {
-      cpi->decimation_factor = 1;
-    }
-  }
-
-  /* The following decimates the frame rate according to a regular
-   * pattern (i.e. to 1/2 or 2/3 frame rate) This can be used to help
-   * prevent buffer under-run in CBR mode. Alternatively it might be
-   * desirable in some situations to drop frame rate but throw more bits
-   * at each frame.
-   *
-   * Note that dropping a key frame can be problematic if spatial
-   * resampling is also active
-   */
-  if (cpi->decimation_factor > 0) {
-    switch (cpi->decimation_factor) {
-      case 1:
-        cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 3 / 2;
-        break;
-      case 2:
-        cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4;
-        break;
-      case 3:
-        cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4;
-        break;
-    }
-
-    /* Note that we should not throw out a key frame (especially when
-     * spatial resampling is enabled).
-     */
-    if (cm->frame_type == KEY_FRAME) {
-      cpi->decimation_count = cpi->decimation_factor;
-    } else if (cpi->decimation_count > 0) {
-      cpi->decimation_count--;
-
-      cpi->bits_off_target += cpi->av_per_frame_bandwidth;
-      if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size) {
-        cpi->bits_off_target = cpi->oxcf.maximum_buffer_size;
-      }
-
-#if CONFIG_MULTI_RES_ENCODING
-      vp8_store_drop_frame_info(cpi);
-#endif
-
-      cm->current_video_frame++;
-      cpi->frames_since_key++;
-      cpi->ext_refresh_frame_flags_pending = 0;
-      // We advance the temporal pattern for dropped frames.
-      cpi->temporal_pattern_counter++;
-
-#if CONFIG_INTERNAL_STATS
-      cpi->count++;
-#endif
-
-      cpi->buffer_level = cpi->bits_off_target;
-
-      if (cpi->oxcf.number_of_layers > 1) {
-        unsigned int i;
-
-        /* Propagate bits saved by dropping the frame to higher
-         * layers
-         */
-        for (i = cpi->current_layer + 1; i < cpi->oxcf.number_of_layers; ++i) {
-          LAYER_CONTEXT *lc = &cpi->layer_context[i];
-          lc->bits_off_target += (int)(lc->target_bandwidth / lc->framerate);
-          if (lc->bits_off_target > lc->maximum_buffer_size) {
-            lc->bits_off_target = lc->maximum_buffer_size;
-          }
-          lc->buffer_level = lc->bits_off_target;
-        }
-      }
-
-      return;
-    } else {
-      cpi->decimation_count = cpi->decimation_factor;
-    }
-  } else {
-    cpi->decimation_count = 0;
+  if (vp8_check_drop_buffer(cpi)) {
+    return;
   }
 
   /* Decide how big to make the frame */
@@ -3642,7 +3629,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
         Q = cpi->avg_frame_qindex;
       }
 
-      /* For constrained quality dont allow Q less than the cq level */
+      /* For constrained quality don't allow Q less than the cq level */
       if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
           (Q < cpi->cq_target_quality)) {
         Q = cpi->cq_target_quality;
@@ -3669,7 +3656,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
     } else {
       cpi->active_best_quality = inter_minq[Q];
 
-      /* For the constant/constrained quality mode we dont want
+      /* For the constant/constrained quality mode we don't want
        * q to fall below the cq level.
        */
       if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
@@ -3690,7 +3677,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
      * higher quality on the frames to prevent bits just going to waste.
      */
     if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) {
-      /* Note that the use of >= here elliminates the risk of a devide
+      /* Note that the use of >= here elliminates the risk of a divide
        * by 0 error in the else if clause
        */
       if (cpi->buffer_level >= cpi->oxcf.maximum_buffer_size) {
@@ -3779,8 +3766,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
 
   vp8_save_coding_context(cpi);
 
-  loop_count = 0;
-
   scale_and_extend_source(cpi->un_scaled_source, cpi);
 
 #if CONFIG_TEMPORAL_DENOISING && CONFIG_POSTPROC
@@ -3953,7 +3938,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
     /* transform / motion compensation build reconstruction frame */
     vp8_encode_frame(cpi);
 
-    if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) {
+    if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER &&
+        cpi->rt_drop_recode_on_overshoot == 1) {
       if (vp8_drop_encodedframe_overshoot(cpi, Q)) {
         vpx_clear_system_state();
         return;
@@ -4003,7 +3989,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
         q_low = cpi->active_best_quality;
         q_high = cpi->active_worst_quality;
 
-        loop_count++;
         Loop = 1;
 
         continue;
@@ -4016,7 +4001,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
     if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1;
 
     /* Are we are overshooting and up against the limit of active max Q. */
-    if (((cpi->pass != 2) ||
+    if (!cpi->rt_always_update_correction_factor &&
+        ((cpi->pass != 2) ||
          (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)) &&
         (Q == cpi->active_worst_quality) &&
         (cpi->active_worst_quality < cpi->worst_quality) &&
@@ -4213,11 +4199,10 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
       }
 
       /* Clamp cpi->zbin_over_quant */
-      cpi->mb.zbin_over_quant = (cpi->mb.zbin_over_quant < zbin_oq_low)
-                                    ? zbin_oq_low
-                                    : (cpi->mb.zbin_over_quant > zbin_oq_high)
-                                          ? zbin_oq_high
-                                          : cpi->mb.zbin_over_quant;
+      cpi->mb.zbin_over_quant =
+          (cpi->mb.zbin_over_quant < zbin_oq_low)    ? zbin_oq_low
+          : (cpi->mb.zbin_over_quant > zbin_oq_high) ? zbin_oq_high
+                                                     : cpi->mb.zbin_over_quant;
 
       Loop = Q != last_q;
     } else {
@@ -4229,7 +4214,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
 
     if (Loop == 1) {
       vp8_restore_coding_context(cpi);
-      loop_count++;
 #if CONFIG_INTERNAL_STATS
       cpi->tot_recode_hits++;
 #endif
@@ -4331,12 +4315,12 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
   vp8_cal_dissimilarity(cpi);
 #endif
 
-  /* Update the GF useage maps.
+  /* Update the GF usage maps.
    * This is done after completing the compression of a frame when all
    * modes etc. are finalized but before loop filter
    */
   if (cpi->oxcf.number_of_layers == 1) {
-    vp8_update_gf_useage_maps(cpi, cm, &cpi->mb);
+    vp8_update_gf_usage_maps(cpi, cm, &cpi->mb);
   }
 
   if (cm->frame_type == KEY_FRAME) cm->refresh_last_frame = 1;
@@ -4493,7 +4477,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
        * size within range) then use the last frame value - 1. The -1
        * is designed to stop Q and hence the data rate, from
        * progressively falling away during difficult sections, but at
-       * the same time reduce the number of itterations around the
+       * the same time reduce the number of iterations around the
        * recode loop.
        */
       if (Q > cpi->ni_av_qi) cpi->ni_av_qi = Q - 1;
@@ -4514,10 +4498,10 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
     cpi->bits_off_target = cpi->oxcf.maximum_buffer_size;
   }
 
-  // If the frame dropper is not enabled, don't let the buffer level go below
-  // some threshold, given here by -|maximum_buffer_size|. For now we only do
-  // this for screen content input.
-  if (cpi->drop_frames_allowed == 0 && cpi->oxcf.screen_content_mode &&
+  // Don't let the buffer level go below some threshold, given here
+  // by -|maximum_buffer_size|. For now we only do this for
+  // screen content input.
+  if (cpi->oxcf.screen_content_mode &&
       cpi->bits_off_target < -cpi->oxcf.maximum_buffer_size) {
     cpi->bits_off_target = -cpi->oxcf.maximum_buffer_size;
   }
@@ -4552,8 +4536,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
 
     for (i = cpi->current_layer + 1; i < cpi->oxcf.number_of_layers; ++i) {
       LAYER_CONTEXT *lc = &cpi->layer_context[i];
-      int bits_off_for_this_layer = (int)(lc->target_bandwidth / lc->framerate -
-                                          cpi->projected_frame_size);
+      int bits_off_for_this_layer = (int)round(
+          lc->target_bandwidth / lc->framerate - cpi->projected_frame_size);
 
       lc->bits_off_target += bits_off_for_this_layer;
 
@@ -4740,7 +4724,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
   cpi->mb.e_mbd.update_mb_segmentation_data = 0;
   cpi->mb.e_mbd.mode_ref_lf_delta_update = 0;
 
-  /* Dont increment frame counters if this was an altref buffer update
+  /* Don't increment frame counters if this was an altref buffer update
    * not a real frame
    */
   if (cm->show_frame) {
@@ -4919,6 +4903,8 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags,
 
       this_duration = cpi->source->ts_end - cpi->last_end_time_stamp_seen;
       last_duration = cpi->last_end_time_stamp_seen - cpi->last_time_stamp_seen;
+      // Cap this to avoid overflow of (this_duration - last_duration) * 10
+      this_duration = VPXMIN(this_duration, INT64_MAX / 10);
       /* do a step update if the duration changes by 10% */
       if (last_duration) {
         step = (int)(((this_duration - last_duration) * 10 / last_duration));
@@ -4988,7 +4974,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags,
   if (cpi->oxcf.number_of_layers > 1) {
     int layer;
 
-    update_layer_contexts(cpi);
+    vp8_update_layer_contexts(cpi);
 
     /* Restore layer specific context & set frame rate */
     if (cpi->temporal_layer_id >= 0) {
@@ -4998,7 +4984,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags,
           cpi->oxcf
               .layer_id[cpi->temporal_pattern_counter % cpi->oxcf.periodicity];
     }
-    restore_layer_context(cpi, layer);
+    vp8_restore_layer_context(cpi, layer);
     vp8_new_framerate(cpi, cpi->layer_context[layer].framerate);
   }
 
@@ -5116,7 +5102,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags,
 
   if (cm->refresh_last_frame) memcpy(&cpi->lfc_n, &cm->fc, sizeof(cm->fc));
 
-  /* if its a dropped frame honor the requests on subsequent frames */
+  /* if it's a dropped frame honor the requests on subsequent frames */
   if (*size > 0) {
     cpi->droppable = !frame_is_reference(cpi);
 
@@ -5129,7 +5115,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags,
   }
 
   /* Save layer specific state */
-  if (cpi->oxcf.number_of_layers > 1) save_layer_context(cpi);
+  if (cpi->oxcf.number_of_layers > 1) vp8_save_layer_context(cpi);
 
   vpx_usec_timer_mark(&cmptimer);
   cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
@@ -5316,17 +5302,13 @@ int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows,
     return -1;
   }
 
-  // Range check the delta Q values and convert the external Q range values
-  // to internal ones.
-  if ((abs(delta_q[0]) > range) || (abs(delta_q[1]) > range) ||
-      (abs(delta_q[2]) > range) || (abs(delta_q[3]) > range)) {
-    return -1;
-  }
-
-  // Range check the delta lf values
-  if ((abs(delta_lf[0]) > range) || (abs(delta_lf[1]) > range) ||
-      (abs(delta_lf[2]) > range) || (abs(delta_lf[3]) > range)) {
-    return -1;
+  for (i = 0; i < MAX_MB_SEGMENTS; ++i) {
+    // Note abs() alone can't be used as the behavior of abs(INT_MIN) is
+    // undefined.
+    if (delta_q[i] > range || delta_q[i] < -range || delta_lf[i] > range ||
+        delta_lf[i] < -range) {
+      return -1;
+    }
   }
 
   // Also disable segmentation if no deltas are specified.
@@ -5394,15 +5376,15 @@ int vp8_set_active_map(VP8_COMP *cpi, unsigned char *map, unsigned int rows,
   }
 }
 
-int vp8_set_internal_size(VP8_COMP *cpi, VPX_SCALING horiz_mode,
-                          VPX_SCALING vert_mode) {
-  if (horiz_mode <= ONETWO) {
+int vp8_set_internal_size(VP8_COMP *cpi, VPX_SCALING_MODE horiz_mode,
+                          VPX_SCALING_MODE vert_mode) {
+  if (horiz_mode <= VP8E_ONETWO) {
     cpi->common.horiz_scale = horiz_mode;
   } else {
     return -1;
   }
 
-  if (vert_mode <= ONETWO) {
+  if (vert_mode <= VP8E_ONETWO) {
     cpi->common.vert_scale = vert_mode;
   } else {
     return -1;
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index b96f9b1dc5..1451a27812 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -11,7 +11,9 @@
 #ifndef VPX_VP8_ENCODER_ONYX_INT_H_
 #define VPX_VP8_ENCODER_ONYX_INT_H_
 
+#include <assert.h>
 #include <stdio.h>
+
 #include "vpx_config.h"
 #include "vp8/common/onyx.h"
 #include "treewriter.h"
@@ -213,7 +215,7 @@ enum {
 typedef struct {
   /* Layer configuration */
   double framerate;
-  int target_bandwidth;
+  int target_bandwidth; /* bits per second */
 
   /* Layer specific coding parameters */
   int64_t starting_buffer_level;
@@ -358,7 +360,7 @@ typedef struct VP8_COMP {
   /* GF interval chosen when we coded the last GF */
   int current_gf_interval;
 
-  /* Total bits overspent becasue of GF boost (cumulative) */
+  /* Total bits overspent because of GF boost (cumulative) */
   int gf_overspend_bits;
 
   /* Used in the few frames following a GF to recover the extra bits
@@ -436,7 +438,7 @@ typedef struct VP8_COMP {
   int kf_boost;
   int last_boost;
 
-  int target_bandwidth;
+  int target_bandwidth; /* bits per second */
   struct vpx_codec_pkt_list *output_pkt_list;
 
 #if 0
@@ -483,7 +485,7 @@ typedef struct VP8_COMP {
 
   unsigned char *segmentation_map;
   signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];
-  int segment_encode_breakout[MAX_MB_SEGMENTS];
+  unsigned int segment_encode_breakout[MAX_MB_SEGMENTS];
 
   unsigned char *active_map;
   unsigned int active_map_enabled;
@@ -524,6 +526,7 @@ typedef struct VP8_COMP {
 #if CONFIG_MULTITHREAD
   /* multithread data */
   vpx_atomic_int *mt_current_mb_col;
+  int mt_current_mb_col_size;
   int mt_sync_range;
   vpx_atomic_int b_multi_threaded;
   int encoding_thread_count;
@@ -549,7 +552,6 @@ typedef struct VP8_COMP {
   unsigned char *partition_d_end[MAX_PARTITIONS];
 
   fractional_mv_step_fp *find_fractional_mv_step;
-  vp8_full_search_fn_t full_search_sad;
   vp8_refining_search_fn_t refining_search_sad;
   vp8_diamond_search_fn_t diamond_search_sad;
   vp8_variance_fn_ptr_t fn_ptr[BLOCK_MAX_SEGMENTS];
@@ -702,12 +704,28 @@ typedef struct VP8_COMP {
   int use_roi_static_threshold;
 
   int ext_refresh_frame_flags_pending;
+
+  // Always update correction factor used for rate control after each frame for
+  // realtime encoding.
+  int rt_always_update_correction_factor;
+
+  // Flag to indicate frame may be dropped due to large expected overshoot,
+  // and re-encoded on next frame at max_qp.
+  int rt_drop_recode_on_overshoot;
 } VP8_COMP;
 
 void vp8_initialize_enc(void);
 
 void vp8_alloc_compressor_data(VP8_COMP *cpi);
 int vp8_reverse_trans(int x);
+void vp8_reset_temporal_layer_change(VP8_COMP *cpi, const VP8_CONFIG *oxcf,
+                                     const int prev_num_layers);
+void vp8_init_temporal_layer_context(VP8_COMP *cpi, const VP8_CONFIG *oxcf,
+                                     const int layer,
+                                     double prev_layer_framerate);
+void vp8_update_layer_contexts(VP8_COMP *cpi);
+void vp8_save_layer_context(VP8_COMP *cpi);
+void vp8_restore_layer_context(VP8_COMP *cpi, const int layer);
 void vp8_new_framerate(VP8_COMP *cpi, double framerate);
 void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm);
 
@@ -718,24 +736,8 @@ void vp8_tokenize_mb(VP8_COMP *, MACROBLOCK *, TOKENEXTRA **);
 
 void vp8_set_speed_features(VP8_COMP *cpi);
 
-#if CONFIG_DEBUG
-#define CHECK_MEM_ERROR(lval, expr)                                         \
-  do {                                                                      \
-    (lval) = (expr);                                                        \
-    if (!(lval))                                                            \
-      vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,           \
-                         "Failed to allocate " #lval " at %s:%d", __FILE__, \
-                         __LINE__);                                         \
-  } while (0)
-#else
-#define CHECK_MEM_ERROR(lval, expr)                               \
-  do {                                                            \
-    (lval) = (expr);                                              \
-    if (!(lval))                                                  \
-      vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, \
-                         "Failed to allocate " #lval);            \
-  } while (0)
-#endif
+int vp8_check_drop_buffer(VP8_COMP *cpi);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index 04f68c3245..1af8a2f9b2 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -1103,7 +1103,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
 #if CONFIG_TEMPORAL_DENOISING
     if (cpi->oxcf.noise_sensitivity) {
       /* Store for later use by denoiser. */
-      // Dont' denoise with GOLDEN OR ALTREF is they are old reference
+      // Don't denoise with GOLDEN OR ALTREF is they are old reference
       // frames (greater than MAX_GF_ARF_DENOISE_RANGE frames in past).
       int skip_old_reference = ((this_ref_frame != LAST_FRAME) &&
                                 (cpi->common.current_video_frame -
diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c
index ba124c359e..fcd4eb04eb 100644
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -314,7 +314,7 @@ static void calc_iframe_target_size(VP8_COMP *cpi) {
      * bandwidth per second * fraction of the initial buffer
      * level
      */
-    target = cpi->oxcf.starting_buffer_level / 2;
+    target = (uint64_t)cpi->oxcf.starting_buffer_level / 2;
 
     if (target > cpi->oxcf.target_bandwidth * 3 / 2) {
       target = cpi->oxcf.target_bandwidth * 3 / 2;
@@ -327,7 +327,8 @@ static void calc_iframe_target_size(VP8_COMP *cpi) {
     int initial_boost = 32; /* |3.0 * per_frame_bandwidth| */
     /* Boost depends somewhat on frame rate: only used for 1 layer case. */
     if (cpi->oxcf.number_of_layers == 1) {
-      kf_boost = VPXMAX(initial_boost, (int)(2 * cpi->output_framerate - 16));
+      kf_boost =
+          VPXMAX(initial_boost, (int)round(2 * cpi->output_framerate - 16));
     } else {
       /* Initial factor: set target size to: |3.0 * per_frame_bandwidth|. */
       kf_boost = initial_boost;
@@ -345,12 +346,17 @@ static void calc_iframe_target_size(VP8_COMP *cpi) {
     /* Minimal target size is |2* per_frame_bandwidth|. */
     if (kf_boost < 16) kf_boost = 16;
 
-    target = ((16 + kf_boost) * cpi->per_frame_bandwidth) >> 4;
+    target = ((uint64_t)(16 + kf_boost) * cpi->per_frame_bandwidth) >> 4;
+    target = VPXMIN(INT_MAX, target);
   }
 
   if (cpi->oxcf.rc_max_intra_bitrate_pct) {
-    unsigned int max_rate =
-        cpi->per_frame_bandwidth * cpi->oxcf.rc_max_intra_bitrate_pct / 100;
+    unsigned int max_rate;
+    // This product may overflow unsigned int
+    uint64_t product = cpi->per_frame_bandwidth;
+    product *= cpi->oxcf.rc_max_intra_bitrate_pct;
+    product /= 100;
+    max_rate = (unsigned int)VPXMIN(INT_MAX, product);
 
     if (target > max_rate) target = max_rate;
   }
@@ -383,7 +389,7 @@ static void calc_gf_params(VP8_COMP *cpi) {
       (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
   int Boost = 0;
 
-  int gf_frame_useage = 0; /* Golden frame useage since last GF */
+  int gf_frame_usage = 0; /* Golden frame usage since last GF */
   int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME] +
                 cpi->recent_ref_frame_usage[LAST_FRAME] +
                 cpi->recent_ref_frame_usage[GOLDEN_FRAME] +
@@ -393,12 +399,12 @@ static void calc_gf_params(VP8_COMP *cpi) {
                       (cpi->common.mb_rows * cpi->common.mb_cols);
 
   if (tot_mbs) {
-    gf_frame_useage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] +
-                       cpi->recent_ref_frame_usage[ALTREF_FRAME]) *
-                      100 / tot_mbs;
+    gf_frame_usage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] +
+                      cpi->recent_ref_frame_usage[ALTREF_FRAME]) *
+                     100 / tot_mbs;
   }
 
-  if (pct_gf_active > gf_frame_useage) gf_frame_useage = pct_gf_active;
+  if (pct_gf_active > gf_frame_usage) gf_frame_usage = pct_gf_active;
 
   /* Not two pass */
   if (cpi->pass != 2) {
@@ -462,7 +468,7 @@ static void calc_gf_params(VP8_COMP *cpi) {
       /* Adjust boost based upon ambient Q */
       Boost = GFQ_ADJUSTMENT;
 
-      /* Adjust based upon most recently measure intra useage */
+      /* Adjust based upon most recently measure intra usage */
       Boost = Boost *
               gf_intra_usage_adjustment[(cpi->this_frame_percent_intra < 15)
                                             ? cpi->this_frame_percent_intra
@@ -470,7 +476,7 @@ static void calc_gf_params(VP8_COMP *cpi) {
               100;
 
       /* Adjust gf boost based upon GF usage since last GF */
-      Boost = Boost * gf_adjust_table[gf_frame_useage] / 100;
+      Boost = Boost * gf_adjust_table[gf_frame_usage] / 100;
 #endif
     }
 
@@ -511,8 +517,8 @@ static void calc_gf_params(VP8_COMP *cpi) {
 
       if (cpi->last_boost >= 1500) cpi->frames_till_gf_update_due++;
 
-      if (gf_interval_table[gf_frame_useage] > cpi->frames_till_gf_update_due) {
-        cpi->frames_till_gf_update_due = gf_interval_table[gf_frame_useage];
+      if (gf_interval_table[gf_frame_usage] > cpi->frames_till_gf_update_due) {
+        cpi->frames_till_gf_update_due = gf_interval_table[gf_frame_usage];
       }
 
       if (cpi->frames_till_gf_update_due > cpi->max_gf_interval) {
@@ -713,7 +719,8 @@ static void calc_pframe_target_size(VP8_COMP *cpi) {
         }
 
         /* lower the target bandwidth for this frame. */
-        cpi->this_frame_target -= (cpi->this_frame_target * percent_low) / 200;
+        cpi->this_frame_target -=
+            (int)(((int64_t)cpi->this_frame_target * percent_low) / 200);
 
         /* Are we using allowing control of active_worst_allowed_q
          * according to buffer level.
@@ -776,6 +783,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi) {
         }
       } else {
         int percent_high = 0;
+        int64_t target = cpi->this_frame_target;
 
         if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) &&
             (cpi->buffer_level > cpi->oxcf.optimal_buffer_level)) {
@@ -793,7 +801,9 @@ static void calc_pframe_target_size(VP8_COMP *cpi) {
           percent_high = 0;
         }
 
-        cpi->this_frame_target += (cpi->this_frame_target * percent_high) / 200;
+        target += (target * percent_high) / 200;
+        target = VPXMIN(target, INT_MAX);
+        cpi->this_frame_target = (int)target;
 
         /* Are we allowing control of active_worst_allowed_q according
          * to buffer level.
@@ -887,7 +897,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi) {
       int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME]
                                       : cpi->oxcf.fixed_q;
 
-      int gf_frame_useage = 0; /* Golden frame useage since last GF */
+      int gf_frame_usage = 0; /* Golden frame usage since last GF */
       int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME] +
                     cpi->recent_ref_frame_usage[LAST_FRAME] +
                     cpi->recent_ref_frame_usage[GOLDEN_FRAME] +
@@ -897,20 +907,20 @@ static void calc_pframe_target_size(VP8_COMP *cpi) {
                           (cpi->common.mb_rows * cpi->common.mb_cols);
 
       if (tot_mbs) {
-        gf_frame_useage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] +
-                           cpi->recent_ref_frame_usage[ALTREF_FRAME]) *
-                          100 / tot_mbs;
+        gf_frame_usage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] +
+                          cpi->recent_ref_frame_usage[ALTREF_FRAME]) *
+                         100 / tot_mbs;
       }
 
-      if (pct_gf_active > gf_frame_useage) gf_frame_useage = pct_gf_active;
+      if (pct_gf_active > gf_frame_usage) gf_frame_usage = pct_gf_active;
 
       /* Is a fixed manual GF frequency being used */
       if (cpi->auto_gold) {
-        /* For one pass throw a GF if recent frame intra useage is
-         * low or the GF useage is high
+        /* For one pass throw a GF if recent frame intra usage is
+         * low or the GF usage is high
          */
         if ((cpi->pass == 0) &&
-            (cpi->this_frame_percent_intra < 15 || gf_frame_useage >= 5)) {
+            (cpi->this_frame_percent_intra < 15 || gf_frame_usage >= 5)) {
           cpi->common.refresh_golden_frame = 1;
 
           /* Two pass GF descision */
@@ -925,10 +935,10 @@ static void calc_pframe_target_size(VP8_COMP *cpi) {
           if (0) {
               FILE *f;
 
-              f = fopen("gf_useaget.stt", "a");
+              f = fopen("gf_usaget.stt", "a");
               fprintf(f, " %8ld %10ld %10ld %10ld %10ld\n",
                       cpi->common.current_video_frame,  cpi->gfu_boost,
-                      GFQ_ADJUSTMENT, cpi->gfu_boost, gf_frame_useage);
+                      GFQ_ADJUSTMENT, cpi->gfu_boost, gf_frame_usage);
               fclose(f);
           }
 
@@ -1074,8 +1084,8 @@ void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var) {
 
   /* Work out a size correction factor. */
   if (projected_size_based_on_q > 0) {
-    correction_factor =
-        (100 * cpi->projected_frame_size) / projected_size_based_on_q;
+    correction_factor = (int)((100 * (int64_t)cpi->projected_frame_size) /
+                              projected_size_based_on_q);
   }
 
   /* More heavily damped adjustment used if we have been oscillating
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 79a858e437..5d539ef30c 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -1021,7 +1021,7 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi,
         BLOCK *c;
         BLOCKD *e;
 
-        /* Is the best so far sufficiently good that we cant justify
+        /* Is the best so far sufficiently good that we can't justify
          * doing a new motion search.
          */
         if (best_label_rd < label_mv_thresh) break;
@@ -1097,8 +1097,8 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi,
             vp8_clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max, x->mv_row_min,
                          x->mv_row_max);
 
-            thissme = cpi->full_search_sad(x, c, e, &mvp_full, sadpb, 16,
-                                           v_fn_ptr, x->mvcost, bsi->ref_mv);
+            thissme = vp8_full_search_sad(x, c, e, &mvp_full, sadpb, 16,
+                                          v_fn_ptr, x->mvcost, bsi->ref_mv);
 
             if (thissme < bestsme) {
               bestsme = thissme;
@@ -1608,7 +1608,7 @@ static int evaluate_inter_mode_rd(int mdcounts[4], RATE_DISTORTION *rd,
       unsigned int q2dc = xd->block[24].dequant[0];
       /* If theres is no codeable 2nd order dc
          or a very small uniform pixel change change */
-      if ((sse - var<q2dc * q2dc>> 4) || (sse / 2 > var && sse - var < 64)) {
+      if ((sse - var < q2dc * q2dc >> 4) || (sse / 2 > var && sse - var < 64)) {
         /* Check u and v to make sure skip is ok */
         unsigned int sse2 = VP8_UVSSE(x);
         if (sse2 * 2 < threshold) {
@@ -1979,7 +1979,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
         rd.distortion2 += distortion;
 
         /* If even the 'Y' rd value of split is higher than best so far
-         * then dont bother looking at UV
+         * then don't bother looking at UV
          */
         if (tmp_rd < best_mode.yrd) {
           /* Now work out UV cost and add it in */
diff --git a/vp8/encoder/segmentation.c b/vp8/encoder/segmentation.c
index dcb68119e1..2127258111 100644
--- a/vp8/encoder/segmentation.c
+++ b/vp8/encoder/segmentation.c
@@ -11,7 +11,7 @@
 #include "segmentation.h"
 #include "vpx_mem/vpx_mem.h"
 
-void vp8_update_gf_useage_maps(VP8_COMP *cpi, VP8_COMMON *cm, MACROBLOCK *x) {
+void vp8_update_gf_usage_maps(VP8_COMP *cpi, VP8_COMMON *cm, MACROBLOCK *x) {
   int mb_row, mb_col;
 
   MODE_INFO *this_mb_mode_info = cm->mi;
@@ -19,7 +19,7 @@ void vp8_update_gf_useage_maps(VP8_COMP *cpi, VP8_COMMON *cm, MACROBLOCK *x) {
   x->gf_active_ptr = (signed char *)cpi->gf_active_flags;
 
   if ((cm->frame_type == KEY_FRAME) || (cm->refresh_golden_frame)) {
-    /* Reset Gf useage monitors */
+    /* Reset Gf usage monitors */
     memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
     cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
   } else {
diff --git a/vp8/encoder/segmentation.h b/vp8/encoder/segmentation.h
index 4ddbdbbd26..0fecfc2212 100644
--- a/vp8/encoder/segmentation.h
+++ b/vp8/encoder/segmentation.h
@@ -19,8 +19,8 @@
 extern "C" {
 #endif
 
-extern void vp8_update_gf_useage_maps(VP8_COMP *cpi, VP8_COMMON *cm,
-                                      MACROBLOCK *x);
+extern void vp8_update_gf_usage_maps(VP8_COMP *cpi, VP8_COMMON *cm,
+                                     MACROBLOCK *x);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp8/encoder/vp8_quantize.c b/vp8/encoder/vp8_quantize.c
index 5b89555108..8e5e318241 100644
--- a/vp8/encoder/vp8_quantize.c
+++ b/vp8/encoder/vp8_quantize.c
@@ -294,7 +294,7 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip) {
   /* Select the baseline MB Q index. */
   if (xd->segmentation_enabled) {
     /* Abs Value */
-    if (xd->mb_segement_abs_delta == SEGMENT_ABSDATA) {
+    if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) {
       QIndex = xd->segment_feature_data[MB_LVL_ALT_Q]
                                        [xd->mode_info_context->mbmi.segment_id];
       /* Delta Value */
diff --git a/vp8/encoder/x86/denoising_sse2.c b/vp8/encoder/x86/denoising_sse2.c
index 89cad53356..f35b930169 100644
--- a/vp8/encoder/x86/denoising_sse2.c
+++ b/vp8/encoder/x86/denoising_sse2.c
@@ -30,7 +30,7 @@ static INLINE unsigned int abs_sum_diff_16x1(__m128i acc_diff) {
       _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8));
   const __m128i hgfedcba =
       _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4));
-  unsigned int sum_diff = abs(_mm_cvtsi128_si32(hgfedcba));
+  unsigned int sum_diff = (unsigned int)abs(_mm_cvtsi128_si32(hgfedcba));
 
   return sum_diff;
 }
diff --git a/vp8/encoder/x86/quantize_sse4.c b/vp8/encoder/x86/quantize_sse4.c
index 389c16705d..4c2d24cc27 100644
--- a/vp8/encoder/x86/quantize_sse4.c
+++ b/vp8/encoder/x86/quantize_sse4.c
@@ -11,28 +11,17 @@
 #include <smmintrin.h> /* SSE4.1 */
 
 #include "./vp8_rtcd.h"
-#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */
 #include "vp8/encoder/block.h"
+#include "vpx_ports/bitops.h" /* get_lsb */
+#include "vpx_ports/compiler_attributes.h"
 
-#define SELECT_EOB(i, z, x, y, q)                         \
-  do {                                                    \
-    short boost = *zbin_boost_ptr;                        \
-    /* Technically _mm_extract_epi16() returns an int: */ \
-    /* https://bugs.llvm.org/show_bug.cgi?id=41657 */     \
-    short x_z = (short)_mm_extract_epi16(x, z);           \
-    short y_z = (short)_mm_extract_epi16(y, z);           \
-    int cmp = (x_z < boost) | (y_z == 0);                 \
-    zbin_boost_ptr++;                                     \
-    if (cmp) break;                                       \
-    q = _mm_insert_epi16(q, y_z, z);                      \
-    eob = i;                                              \
-    zbin_boost_ptr = b->zrun_zbin_boost;                  \
-  } while (0)
-
-void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) {
-  char eob = 0;
+// Unsigned shift overflow is disabled for the use of ~1U << eob with ymask.
+VPX_NO_UNSIGNED_SHIFT_CHECK void vp8_regular_quantize_b_sse4_1(BLOCK *b,
+                                                               BLOCKD *d) {
+  int eob = -1;
   short *zbin_boost_ptr = b->zrun_zbin_boost;
-
+  __m128i zbin_boost0 = _mm_load_si128((__m128i *)(zbin_boost_ptr));
+  __m128i zbin_boost1 = _mm_load_si128((__m128i *)(zbin_boost_ptr + 8));
   __m128i x0, x1, y0, y1, x_minus_zbin0, x_minus_zbin1, dqcoeff0, dqcoeff1;
   __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));
   __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));
@@ -47,8 +36,12 @@ void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) {
   __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8));
   __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
   __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
-  __m128i qcoeff0 = _mm_setzero_si128();
-  __m128i qcoeff1 = _mm_setzero_si128();
+  __m128i qcoeff0, qcoeff1, t0, t1, x_shuf0, x_shuf1;
+  uint32_t mask, ymask;
+  DECLARE_ALIGNED(16, static const uint8_t,
+                  zig_zag_mask[16]) = { 0, 1,  4,  8,  5, 2,  3,  6,
+                                        9, 12, 13, 10, 7, 11, 14, 15 };
+  DECLARE_ALIGNED(16, uint16_t, qcoeff[16]) = { 0 };
 
   /* Duplicate to all lanes. */
   zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);
@@ -88,23 +81,52 @@ void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) {
   y0 = _mm_sign_epi16(y0, z0);
   y1 = _mm_sign_epi16(y1, z1);
 
-  /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */
-  SELECT_EOB(1, 0, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(2, 1, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(3, 4, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(4, 0, x_minus_zbin1, y1, qcoeff1);
-  SELECT_EOB(5, 5, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(6, 2, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(7, 3, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(8, 6, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(9, 1, x_minus_zbin1, y1, qcoeff1);
-  SELECT_EOB(10, 4, x_minus_zbin1, y1, qcoeff1);
-  SELECT_EOB(11, 5, x_minus_zbin1, y1, qcoeff1);
-  SELECT_EOB(12, 2, x_minus_zbin1, y1, qcoeff1);
-  SELECT_EOB(13, 7, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(14, 3, x_minus_zbin1, y1, qcoeff1);
-  SELECT_EOB(15, 6, x_minus_zbin1, y1, qcoeff1);
-  SELECT_EOB(16, 7, x_minus_zbin1, y1, qcoeff1);
+  {
+    const __m128i zig_zag_i16_0 =
+        _mm_setr_epi8(0, 1, 2, 3, 8, 9, 14, 15, 10, 11, 4, 5, 6, 7, 12, 13);
+    const __m128i zig_zag_i16_1 =
+        _mm_setr_epi8(0, 1, 6, 7, 8, 9, 2, 3, 14, 15, 4, 5, 10, 11, 12, 13);
+
+    /* The first part of the zig zag needs a value
+     * from x_minus_zbin1 and vice versa. */
+    t1 = _mm_alignr_epi8(x_minus_zbin1, x_minus_zbin1, 2);
+    t0 = _mm_blend_epi16(x_minus_zbin0, t1, 0x80);
+    t1 = _mm_blend_epi16(t1, x_minus_zbin0, 0x80);
+    x_shuf0 = _mm_shuffle_epi8(t0, zig_zag_i16_0);
+    x_shuf1 = _mm_shuffle_epi8(t1, zig_zag_i16_1);
+  }
+
+  /* Check if y is nonzero and put it in zig zag order. */
+  t0 = _mm_packs_epi16(y0, y1);
+  t0 = _mm_cmpeq_epi8(t0, _mm_setzero_si128());
+  t0 = _mm_shuffle_epi8(t0, _mm_load_si128((const __m128i *)zig_zag_mask));
+  ymask = _mm_movemask_epi8(t0) ^ 0xffff;
+
+  for (;;) {
+    t0 = _mm_cmpgt_epi16(zbin_boost0, x_shuf0);
+    t1 = _mm_cmpgt_epi16(zbin_boost1, x_shuf1);
+    t0 = _mm_packs_epi16(t0, t1);
+    mask = _mm_movemask_epi8(t0);
+    mask = ~mask & ymask;
+    if (!mask) break;
+    /* |eob| will contain the index of the next found element where:
+     * boost[i - old_eob - 1] <= x[zigzag[i]] && y[zigzag[i]] != 0 */
+    eob = get_lsb(mask);
+    /* Need to clear the mask from processed elements so that
+     * they are no longer counted in the next iteration. */
+    ymask &= ~1U << eob;
+    /* It's safe to read ahead of this buffer if struct VP8_COMP has at
+     * least 32 bytes before the zrun_zbin_boost_* fields (it has 384).
+     * Any data read outside of the buffer is masked by the updated |ymask|. */
+    zbin_boost0 = _mm_loadu_si128((__m128i *)(zbin_boost_ptr - eob - 1));
+    zbin_boost1 = _mm_loadu_si128((__m128i *)(zbin_boost_ptr - eob + 7));
+    qcoeff[zig_zag_mask[eob]] = 0xffff;
+  }
+
+  qcoeff0 = _mm_load_si128((__m128i *)(qcoeff));
+  qcoeff1 = _mm_load_si128((__m128i *)(qcoeff + 8));
+  qcoeff0 = _mm_and_si128(qcoeff0, y0);
+  qcoeff1 = _mm_and_si128(qcoeff1, y1);
 
   _mm_store_si128((__m128i *)(d->qcoeff), qcoeff0);
   _mm_store_si128((__m128i *)(d->qcoeff + 8), qcoeff1);
@@ -115,5 +137,5 @@ void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) {
   _mm_store_si128((__m128i *)(d->dqcoeff), dqcoeff0);
   _mm_store_si128((__m128i *)(d->dqcoeff + 8), dqcoeff1);
 
-  *d->eob = eob;
+  *d->eob = eob + 1;
 }
diff --git a/vp8/encoder/x86/vp8_quantize_ssse3.c b/vp8/encoder/x86/vp8_quantize_ssse3.c
index 147c30cc35..f6df146f08 100644
--- a/vp8/encoder/x86/vp8_quantize_ssse3.c
+++ b/vp8/encoder/x86/vp8_quantize_ssse3.c
@@ -12,31 +12,7 @@
 
 #include "./vp8_rtcd.h"
 #include "vp8/encoder/block.h"
-
-/* bitscan reverse (bsr) */
-#if defined(_MSC_VER)
-#include <intrin.h>
-#pragma intrinsic(_BitScanReverse)
-static int bsr(int mask) {
-  unsigned long eob;
-  _BitScanReverse(&eob, mask);
-  eob++;
-  if (mask == 0) eob = 0;
-  return eob;
-}
-#else
-static int bsr(int mask) {
-  int eob;
-#if defined(__GNUC__) && __GNUC__
-  __asm__ __volatile__("bsr %1, %0" : "=r"(eob) : "r"(mask) : "flags");
-#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
-  asm volatile("bsr %1, %0" : "=r"(eob) : "r"(mask) : "flags");
-#endif
-  eob++;
-  if (mask == 0) eob = 0;
-  return eob;
-}
-#endif
+#include "vpx_ports/bitops.h" /* get_msb */
 
 void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d) {
   int eob, mask;
@@ -108,7 +84,10 @@ void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d) {
 
   mask = _mm_movemask_epi8(x);
 
-  eob = bsr(mask);
+  /* x2 is needed to increase the result from non-zero masks by 1,
+   * +1 is needed to mask undefined behavior for a null argument,
+   * the result of get_msb(1) is 0 */
+  eob = get_msb(mask * 2 + 1);
 
-  *d->eob = 0xFF & eob;
+  *d->eob = eob;
 }
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index 286a93a056..d485965d3d 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -124,6 +124,11 @@ ifeq ($(CONFIG_POSTPROC),yes)
 VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/mfqe_msa.c
 endif
 
+# common (loongarch LSX intrinsics)
+VP8_COMMON_SRCS-$(HAVE_LSX) += common/loongarch/loopfilter_filters_lsx.c
+VP8_COMMON_SRCS-$(HAVE_LSX) += common/loongarch/sixtap_filter_lsx.c
+VP8_COMMON_SRCS-$(HAVE_LSX) += common/loongarch/idct_lsx.c
+
 # common (neon intrinsics)
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/loopfilter_arm.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/loopfilter_arm.h
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index 1160f51d64..a10e08975c 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -12,14 +12,16 @@
 #include "./vp8_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
 #include "./vpx_scale_rtcd.h"
-#include "vpx/vpx_codec.h"
+#include "vpx/vpx_encoder.h"
 #include "vpx/internal/vpx_codec_internal.h"
 #include "vpx_version.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/static_assert.h"
 #include "vpx_ports/system_state.h"
-#include "vpx_ports/vpx_once.h"
 #include "vpx_util/vpx_timestamp.h"
+#if CONFIG_MULTITHREAD
+#include "vp8/encoder/ethreading.h"
+#endif
 #include "vp8/encoder/onyx_int.h"
 #include "vpx/vp8cx.h"
 #include "vp8/encoder/firstpass.h"
@@ -93,13 +95,16 @@ struct vpx_codec_alg_priv {
   vpx_enc_frame_flags_t control_frame_flags;
 };
 
+// Called by vp8e_set_config() and vp8e_encode() only. Must not be called
+// by vp8e_init() because the `error` paramerer (cpi->common.error) will be
+// destroyed by vpx_codec_enc_init_ver() after vp8e_init() returns an error.
+// See the "IMPORTANT" comment in vpx_codec_enc_init_ver().
 static vpx_codec_err_t update_error_state(
     vpx_codec_alg_priv_t *ctx, const struct vpx_internal_error_info *error) {
-  vpx_codec_err_t res;
+  const vpx_codec_err_t res = error->error_code;
 
-  if ((res = error->error_code)) {
+  if (res != VPX_CODEC_OK)
     ctx->base.err_detail = error->has_detail ? error->detail : NULL;
-  }
 
   return res;
 }
@@ -152,8 +157,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
   RANGE_CHECK_HI(cfg, g_lag_in_frames, 25);
 #endif
   RANGE_CHECK(cfg, rc_end_usage, VPX_VBR, VPX_Q);
-  RANGE_CHECK_HI(cfg, rc_undershoot_pct, 1000);
-  RANGE_CHECK_HI(cfg, rc_overshoot_pct, 1000);
+  RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100);
+  RANGE_CHECK_HI(cfg, rc_overshoot_pct, 100);
   RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
   RANGE_CHECK(cfg, kf_mode, VPX_KF_DISABLED, VPX_KF_AUTO);
 
@@ -257,6 +262,23 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
     ERROR("g_threads cannot be bigger than number of token partitions");
 #endif
 
+  // The range below shall be further tuned.
+  RANGE_CHECK(cfg, use_vizier_rc_params, 0, 1);
+  RANGE_CHECK(cfg, active_wq_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, err_per_mb_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, sr_default_decay_limit.den, 1, 1000);
+  RANGE_CHECK(cfg, sr_diff_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, kf_err_per_mb_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, kf_frame_min_boost_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, kf_frame_max_boost_subs_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, kf_max_total_boost_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, gf_max_total_boost_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, gf_frame_max_boost_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, zm_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, rd_mult_inter_qp_fac.den, 1, 1000);
+  RANGE_CHECK(cfg, rd_mult_arf_qp_fac.den, 1, 1000);
+  RANGE_CHECK(cfg, rd_mult_key_qp_fac.den, 1, 1000);
+
   return VPX_CODEC_OK;
 }
 
@@ -322,7 +344,9 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf,
     oxcf->end_usage = USAGE_CONSTANT_QUALITY;
   }
 
-  oxcf->target_bandwidth = cfg.rc_target_bitrate;
+  // Cap the target rate to 1000 Mbps to avoid some integer overflows in
+  // target bandwidth calculations.
+  oxcf->target_bandwidth = VPXMIN(cfg.rc_target_bitrate, 1000000);
   oxcf->rc_max_intra_bitrate_pct = vp8_cfg.rc_max_intra_bitrate_pct;
   oxcf->gf_cbr_boost_pct = vp8_cfg.gf_cbr_boost_pct;
 
@@ -378,6 +402,9 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf,
 #endif
 
   oxcf->cpu_used = vp8_cfg.cpu_used;
+  if (cfg.g_pass == VPX_RC_FIRST_PASS) {
+    oxcf->cpu_used = VPXMAX(4, oxcf->cpu_used);
+  }
   oxcf->encode_breakout = vp8_cfg.static_thresh;
   oxcf->play_alternate = vp8_cfg.enable_auto_alt_ref;
   oxcf->noise_sensitivity = vp8_cfg.noise_sensitivity;
@@ -452,14 +479,26 @@ static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t *ctx,
     ERROR("Cannot increase lag_in_frames");
 
   res = validate_config(ctx, cfg, &ctx->vp8_cfg, 0);
+  if (res != VPX_CODEC_OK) return res;
 
-  if (!res) {
-    ctx->cfg = *cfg;
-    set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL);
-    vp8_change_config(ctx->cpi, &ctx->oxcf);
+  if (setjmp(ctx->cpi->common.error.jmp)) {
+    const vpx_codec_err_t codec_err =
+        update_error_state(ctx, &ctx->cpi->common.error);
+    ctx->cpi->common.error.setjmp = 0;
+    vpx_clear_system_state();
+    assert(codec_err != VPX_CODEC_OK);
+    return codec_err;
   }
 
-  return res;
+  ctx->cpi->common.error.setjmp = 1;
+  ctx->cfg = *cfg;
+  set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL);
+  vp8_change_config(ctx->cpi, &ctx->oxcf);
+#if CONFIG_MULTITHREAD
+  if (vp8cx_create_encoder_threads(ctx->cpi)) return VPX_CODEC_ERROR;
+#endif
+  ctx->cpi->common.error.setjmp = 0;
+  return VPX_CODEC_OK;
 }
 
 static vpx_codec_err_t get_quantizer(vpx_codec_alg_priv_t *ctx, va_list args) {
@@ -585,6 +624,18 @@ static vpx_codec_err_t set_screen_content_mode(vpx_codec_alg_priv_t *ctx,
   return update_extracfg(ctx, &extra_cfg);
 }
 
+static vpx_codec_err_t ctrl_set_rtc_external_ratectrl(vpx_codec_alg_priv_t *ctx,
+                                                      va_list args) {
+  VP8_COMP *cpi = ctx->cpi;
+  const unsigned int data = CAST(VP8E_SET_RTC_EXTERNAL_RATECTRL, args);
+  if (data) {
+    cpi->cyclic_refresh_mode_enabled = 0;
+    cpi->rt_always_update_correction_factor = 1;
+    cpi->rt_drop_recode_on_overshoot = 0;
+  }
+  return VPX_CODEC_OK;
+}
+
 static vpx_codec_err_t vp8e_mr_alloc_mem(const vpx_codec_enc_cfg_t *cfg,
                                          void **mem_loc) {
   vpx_codec_err_t res = VPX_CODEC_OK;
@@ -661,7 +712,7 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx,
       ctx->priv->enc.total_encoders = 1;
     }
 
-    once(vp8_initialize_enc);
+    vp8_initialize_enc();
 
     res = validate_config(priv, &priv->cfg, &priv->vp8_cfg, 0);
 
@@ -728,7 +779,7 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
 
 static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
                                     unsigned long duration,
-                                    unsigned long deadline) {
+                                    vpx_enc_deadline_t deadline) {
   int new_qc;
 
 #if !(CONFIG_REALTIME_ONLY)
@@ -818,7 +869,7 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
                                    const vpx_image_t *img, vpx_codec_pts_t pts,
                                    unsigned long duration,
                                    vpx_enc_frame_flags_t enc_flags,
-                                   unsigned long deadline) {
+                                   vpx_enc_deadline_t deadline) {
   volatile vpx_codec_err_t res = VPX_CODEC_OK;
   // Make a copy as volatile to avoid -Wclobbered with longjmp.
   volatile vpx_enc_frame_flags_t flags = enc_flags;
@@ -870,12 +921,6 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
     }
   }
 
-  if (setjmp(ctx->cpi->common.error.jmp)) {
-    ctx->cpi->common.error.setjmp = 0;
-    vpx_clear_system_state();
-    return VPX_CODEC_CORRUPT_FRAME;
-  }
-
   /* Initialize the encoder instance on the first frame*/
   if (!res && ctx->cpi) {
     unsigned int lib_flags;
@@ -886,6 +931,14 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
     unsigned char *cx_data_end;
     int comp_data_state = 0;
 
+    if (setjmp(ctx->cpi->common.error.jmp)) {
+      ctx->cpi->common.error.setjmp = 0;
+      res = update_error_state(ctx, &ctx->cpi->common.error);
+      vpx_clear_system_state();
+      return res;
+    }
+    ctx->cpi->common.error.setjmp = 1;
+
     /* Set up internal flags */
     if (ctx->base.init_flags & VPX_CODEC_USE_PSNR) {
       ((VP8_COMP *)ctx->cpi)->b_calculate_psnr = 1;
@@ -921,8 +974,6 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
     cx_data_end = ctx->cx_data + cx_data_sz;
     lib_flags = 0;
 
-    ctx->cpi->common.error.setjmp = 1;
-
     while (cx_data_sz >= ctx->cx_data_sz / 2) {
       comp_data_state = vp8_get_compressed_data(
           ctx->cpi, &lib_flags, &size, cx_data, cx_data_end, &dst_time_stamp,
@@ -1018,6 +1069,7 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
         }
       }
     }
+    ctx->cpi->common.error.setjmp = 0;
   }
 
   return res;
@@ -1183,8 +1235,8 @@ static vpx_codec_err_t vp8e_set_scalemode(vpx_codec_alg_priv_t *ctx,
   if (data) {
     int res;
     vpx_scaling_mode_t scalemode = *(vpx_scaling_mode_t *)data;
-    res = vp8_set_internal_size(ctx->cpi, (VPX_SCALING)scalemode.h_scaling_mode,
-                                (VPX_SCALING)scalemode.v_scaling_mode);
+    res = vp8_set_internal_size(ctx->cpi, scalemode.h_scaling_mode,
+                                scalemode.v_scaling_mode);
 
     if (!res) {
       /*force next frame a key frame to effect scaling mode */
@@ -1223,6 +1275,7 @@ static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] = {
   { VP8E_SET_MAX_INTRA_BITRATE_PCT, set_rc_max_intra_bitrate_pct },
   { VP8E_SET_SCREEN_CONTENT_MODE, set_screen_content_mode },
   { VP8E_SET_GF_CBR_BOOST_PCT, ctrl_set_rc_gf_cbr_boost_pct },
+  { VP8E_SET_RTC_EXTERNAL_RATECTRL, ctrl_set_rtc_external_ratectrl },
   { -1, NULL },
 };
 
@@ -1250,13 +1303,13 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = {
         0,  /* rc_resize_allowed */
         1,  /* rc_scaled_width */
         1,  /* rc_scaled_height */
-        60, /* rc_resize_down_thresold */
-        30, /* rc_resize_up_thresold */
+        60, /* rc_resize_down_thresh */
+        30, /* rc_resize_up_thresh */
 
         VPX_VBR,     /* rc_end_usage */
         { NULL, 0 }, /* rc_twopass_stats_in */
         { NULL, 0 }, /* rc_firstpass_mb_stats_in */
-        256,         /* rc_target_bandwidth */
+        256,         /* rc_target_bitrate */
         4,           /* rc_min_quantizer */
         63,          /* rc_max_quantizer */
         100,         /* rc_undershoot_pct */
@@ -1278,14 +1331,30 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = {
 
         VPX_SS_DEFAULT_LAYERS, /* ss_number_layers */
         { 0 },
-        { 0 }, /* ss_target_bitrate */
-        1,     /* ts_number_layers */
-        { 0 }, /* ts_target_bitrate */
-        { 0 }, /* ts_rate_decimator */
-        0,     /* ts_periodicity */
-        { 0 }, /* ts_layer_id */
-        { 0 }, /* layer_target_bitrate */
-        0      /* temporal_layering_mode */
+        { 0 },    /* ss_target_bitrate */
+        1,        /* ts_number_layers */
+        { 0 },    /* ts_target_bitrate */
+        { 0 },    /* ts_rate_decimator */
+        0,        /* ts_periodicity */
+        { 0 },    /* ts_layer_id */
+        { 0 },    /* layer_target_bitrate */
+        0,        /* temporal_layering_mode */
+        0,        /* use_vizier_rc_params */
+        { 1, 1 }, /* active_wq_factor */
+        { 1, 1 }, /* err_per_mb_factor */
+        { 1, 1 }, /* sr_default_decay_limit */
+        { 1, 1 }, /* sr_diff_factor */
+        { 1, 1 }, /* kf_err_per_mb_factor */
+        { 1, 1 }, /* kf_frame_min_boost_factor */
+        { 1, 1 }, /* kf_frame_max_boost_first_factor */
+        { 1, 1 }, /* kf_frame_max_boost_subs_factor */
+        { 1, 1 }, /* kf_max_total_boost_factor */
+        { 1, 1 }, /* gf_max_total_boost_factor */
+        { 1, 1 }, /* gf_frame_max_boost_factor */
+        { 1, 1 }, /* zm_factor */
+        { 1, 1 }, /* rd_mult_inter_qp_fac */
+        { 1, 1 }, /* rd_mult_arf_qp_fac */
+        { 1, 1 }, /* rd_mult_key_qp_fac */
     } },
 };
 
diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c
index ba0714abe5..e81deaf4ea 100644
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -20,6 +20,7 @@
 #include "vpx_version.h"
 #include "common/alloccommon.h"
 #include "common/common.h"
+#include "common/onyxc_int.h"
 #include "common/onyxd.h"
 #include "decoder/onyxd_int.h"
 #include "vpx_dsp/vpx_dsp_common.h"
@@ -162,7 +163,10 @@ static vpx_codec_err_t vp8_peek_si_internal(const uint8_t *data,
       si->h = (clear[8] | (clear[9] << 8)) & 0x3fff;
 
       /*printf("w=%d, h=%d\n", si->w, si->h);*/
-      if (!(si->h && si->w)) res = VPX_CODEC_CORRUPT_FRAME;
+      if (!(si->h && si->w)) {
+        si->w = si->h = 0;
+        res = VPX_CODEC_CORRUPT_FRAME;
+      }
     } else {
       res = VPX_CODEC_UNSUP_BITSTREAM;
     }
@@ -246,14 +250,14 @@ static int update_fragments(vpx_codec_alg_priv_t *ctx, const uint8_t *data,
     /* Store a pointer to this fragment and return. We haven't
      * received the complete frame yet, so we will wait with decoding.
      */
-    ctx->fragments.ptrs[ctx->fragments.count] = data;
-    ctx->fragments.sizes[ctx->fragments.count] = data_sz;
-    ctx->fragments.count++;
-    if (ctx->fragments.count > (1 << EIGHT_PARTITION) + 1) {
+    if (ctx->fragments.count >= MAX_PARTITIONS) {
       ctx->fragments.count = 0;
       *res = VPX_CODEC_INVALID_PARAM;
       return -1;
     }
+    ctx->fragments.ptrs[ctx->fragments.count] = data;
+    ctx->fragments.sizes[ctx->fragments.count] = data_sz;
+    ctx->fragments.count++;
     return 0;
   }
 
@@ -272,10 +276,10 @@ static int update_fragments(vpx_codec_alg_priv_t *ctx, const uint8_t *data,
 
 static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
                                   const uint8_t *data, unsigned int data_sz,
-                                  void *user_priv, long deadline) {
+                                  void *user_priv) {
   volatile vpx_codec_err_t res;
   volatile unsigned int resolution_change = 0;
-  unsigned int w, h;
+  volatile unsigned int w, h;
 
   if (!ctx->fragments.enabled && (data == NULL && data_sz == 0)) {
     return 0;
@@ -301,17 +305,26 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
   }
 
   if (!ctx->decoder_init && !ctx->si.is_kf) res = VPX_CODEC_UNSUP_BITSTREAM;
+  if (!res && ctx->decoder_init && w == 0 && h == 0 && ctx->si.h == 0 &&
+      ctx->si.w == 0) {
+    VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0];
+    assert(pbi != NULL);
+    assert(!pbi->common.error.setjmp);
+    res = VPX_CODEC_CORRUPT_FRAME;
+    vpx_internal_error(&pbi->common.error, res,
+                       "Keyframe / intra-only frame required to reset decoder"
+                       " state");
+  }
 
   if ((ctx->si.h != h) || (ctx->si.w != w)) resolution_change = 1;
 
 #if CONFIG_MULTITHREAD
   if (!res && ctx->restart_threads) {
-    struct frame_buffers *fb = &ctx->yv12_frame_buffers;
     VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0];
     VP8_COMMON *const pc = &pbi->common;
     if (setjmp(pbi->common.error.jmp)) {
-      vp8_remove_decoder_instances(fb);
-      vp8_zero(fb->pbi);
+      pbi->common.error.setjmp = 0;
+      vp8_decoder_remove_threads(pbi);
       vpx_clear_system_state();
       return VPX_CODEC_ERROR;
     }
@@ -349,7 +362,14 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
     }
 
     res = vp8_create_decoder_instances(&ctx->yv12_frame_buffers, &oxcf);
-    if (res == VPX_CODEC_OK) ctx->decoder_init = 1;
+    if (res == VPX_CODEC_OK) {
+      ctx->decoder_init = 1;
+    } else {
+      /* on failure clear the cached resolution to ensure a full
+       * reallocation is attempted on resync. */
+      ctx->si.w = 0;
+      ctx->si.h = 0;
+    }
   }
 
   /* Set these even if already initialized.  The caller may have changed the
@@ -371,8 +391,6 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
       pc->Width = ctx->si.w;
       pc->Height = ctx->si.h;
       {
-        int prev_mb_rows = pc->mb_rows;
-
         if (setjmp(pbi->common.error.jmp)) {
           pbi->common.error.setjmp = 0;
           /* on failure clear the cached resolution to ensure a full
@@ -398,6 +416,12 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
                              "Invalid frame height");
         }
 
+#if CONFIG_MULTITHREAD
+        if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) {
+          vp8mt_de_alloc_temp_buffers(pbi, pc->mb_rows);
+        }
+#endif
+
         if (vp8_alloc_frame_buffers(pc, pc->Width, pc->Height)) {
           vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
                              "Failed to allocate frame buffers");
@@ -442,10 +466,8 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
 
 #if CONFIG_MULTITHREAD
         if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) {
-          vp8mt_alloc_temp_buffers(pbi, pc->Width, prev_mb_rows);
+          vp8mt_alloc_temp_buffers(pbi, pc->Width, 0);
         }
-#else
-        (void)prev_mb_rows;
 #endif
       }
 
@@ -486,12 +508,13 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
     pbi->restart_threads = 0;
 #endif
     ctx->user_priv = user_priv;
-    if (vp8dx_receive_compressed_data(pbi, deadline)) {
+    if (vp8dx_receive_compressed_data(pbi)) {
       res = update_error_state(ctx, &pbi->common.error);
     }
 
     /* get ready for the next series of fragments */
     ctx->fragments.count = 0;
+    pbi->common.error.setjmp = 0;
   }
 
   return res;
@@ -506,7 +529,6 @@ static vpx_image_t *vp8_get_frame(vpx_codec_alg_priv_t *ctx,
    */
   if (!(*iter) && ctx->yv12_frame_buffers.pbi[0]) {
     YV12_BUFFER_CONFIG sd;
-    int64_t time_stamp = 0, time_end_stamp = 0;
     vp8_ppflags_t flags;
     vp8_zero(flags);
 
@@ -516,8 +538,7 @@ static vpx_image_t *vp8_get_frame(vpx_codec_alg_priv_t *ctx,
       flags.noise_level = ctx->postproc_cfg.noise_level;
     }
 
-    if (0 == vp8dx_get_raw_frame(ctx->yv12_frame_buffers.pbi[0], &sd,
-                                 &time_stamp, &time_end_stamp, &flags)) {
+    if (0 == vp8dx_get_raw_frame(ctx->yv12_frame_buffers.pbi[0], &sd, &flags)) {
       yuvconfig2image(&ctx->img, &sd, ctx->user_priv);
 
       img = &ctx->img;
diff --git a/vp8/vp8_ratectrl_rtc.cc b/vp8/vp8_ratectrl_rtc.cc
new file mode 100644
index 0000000000..261c316fd1
--- /dev/null
+++ b/vp8/vp8_ratectrl_rtc.cc
@@ -0,0 +1,429 @@
+/*
+ *  Copyright (c) 2021 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <new>
+#include "vp8/common/common.h"
+#include "vp8/vp8_ratectrl_rtc.h"
+#include "vp8/encoder/onyx_int.h"
+#include "vp8/encoder/ratectrl.h"
+#include "vpx_ports/system_state.h"
+
+namespace libvpx {
+/* Quant MOD */
+static const int kQTrans[] = {
+  0,  1,  2,  3,  4,  5,  7,   8,   9,   10,  12,  13,  15,  17,  18,  19,
+  20, 21, 23, 24, 25, 26, 27,  28,  29,  30,  31,  33,  35,  37,  39,  41,
+  43, 45, 47, 49, 51, 53, 55,  57,  59,  61,  64,  67,  70,  73,  76,  79,
+  82, 85, 88, 91, 94, 97, 100, 103, 106, 109, 112, 115, 118, 121, 124, 127,
+};
+
+static const unsigned char kf_high_motion_minq[QINDEX_RANGE] = {
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,
+  1,  1,  2,  2,  2,  2,  3,  3,  3,  3,  3,  3,  3,  3,  4,  4,  4,  4,  5,
+  5,  5,  5,  5,  5,  6,  6,  6,  6,  7,  7,  8,  8,  8,  8,  9,  9,  10, 10,
+  10, 10, 11, 11, 11, 11, 12, 12, 13, 13, 13, 13, 14, 14, 15, 15, 15, 15, 16,
+  16, 16, 16, 17, 17, 18, 18, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21,
+  22, 22, 23, 23, 24, 25, 25, 26, 26, 27, 28, 28, 29, 30
+};
+
+static const unsigned char inter_minq[QINDEX_RANGE] = {
+  0,  0,  1,  1,  2,  3,  3,  4,  4,  5,  6,  6,  7,  8,  8,  9,  9,  10, 11,
+  11, 12, 13, 13, 14, 15, 15, 16, 17, 17, 18, 19, 20, 20, 21, 22, 22, 23, 24,
+  24, 25, 26, 27, 27, 28, 29, 30, 30, 31, 32, 33, 33, 34, 35, 36, 36, 37, 38,
+  39, 39, 40, 41, 42, 42, 43, 44, 45, 46, 46, 47, 48, 49, 50, 50, 51, 52, 53,
+  54, 55, 55, 56, 57, 58, 59, 60, 60, 61, 62, 63, 64, 65, 66, 67, 67, 68, 69,
+  70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 86,
+  87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100
+};
+
+static int rescale(int val, int num, int denom) {
+  int64_t llnum = num;
+  int64_t llden = denom;
+  int64_t llval = val;
+
+  return (int)(llval * llnum / llden);
+}
+
+std::unique_ptr<VP8RateControlRTC> VP8RateControlRTC::Create(
+    const VP8RateControlRtcConfig &cfg) {
+  std::unique_ptr<VP8RateControlRTC> rc_api(new (std::nothrow)
+                                                VP8RateControlRTC());
+  if (!rc_api) return nullptr;
+  rc_api->cpi_ = static_cast<VP8_COMP *>(vpx_memalign(32, sizeof(*cpi_)));
+  if (!rc_api->cpi_) return nullptr;
+  vp8_zero(*rc_api->cpi_);
+
+  if (!rc_api->InitRateControl(cfg)) return nullptr;
+
+  return rc_api;
+}
+
+VP8RateControlRTC::~VP8RateControlRTC() {
+  if (cpi_) {
+    vpx_free(cpi_->gf_active_flags);
+    vpx_free(cpi_);
+  }
+}
+
+bool VP8RateControlRTC::InitRateControl(const VP8RateControlRtcConfig &rc_cfg) {
+  VP8_COMMON *cm = &cpi_->common;
+  VP8_CONFIG *oxcf = &cpi_->oxcf;
+  oxcf->end_usage = USAGE_STREAM_FROM_SERVER;
+  cpi_->pass = 0;
+  cm->show_frame = 1;
+  oxcf->drop_frames_water_mark = 0;
+  cm->current_video_frame = 0;
+  cpi_->auto_gold = 1;
+  cpi_->key_frame_count = 1;
+  cpi_->rate_correction_factor = 1.0;
+  cpi_->key_frame_rate_correction_factor = 1.0;
+  cpi_->cyclic_refresh_mode_enabled = 0;
+  cpi_->auto_worst_q = 1;
+  cpi_->kf_overspend_bits = 0;
+  cpi_->kf_bitrate_adjustment = 0;
+  cpi_->gf_overspend_bits = 0;
+  cpi_->non_gf_bitrate_adjustment = 0;
+  if (!UpdateRateControl(rc_cfg)) return false;
+  cpi_->buffer_level = oxcf->starting_buffer_level;
+  cpi_->bits_off_target = oxcf->starting_buffer_level;
+  return true;
+}
+
+bool VP8RateControlRTC::UpdateRateControl(
+    const VP8RateControlRtcConfig &rc_cfg) {
+  if (rc_cfg.ts_number_layers < 1 ||
+      rc_cfg.ts_number_layers > VPX_TS_MAX_LAYERS) {
+    return false;
+  }
+
+  VP8_COMMON *cm = &cpi_->common;
+  VP8_CONFIG *oxcf = &cpi_->oxcf;
+  const unsigned int prev_number_of_layers = oxcf->number_of_layers;
+  vpx_clear_system_state();
+  cm->Width = rc_cfg.width;
+  cm->Height = rc_cfg.height;
+  oxcf->Width = rc_cfg.width;
+  oxcf->Height = rc_cfg.height;
+  oxcf->worst_allowed_q = kQTrans[rc_cfg.max_quantizer];
+  oxcf->best_allowed_q = kQTrans[rc_cfg.min_quantizer];
+  cpi_->worst_quality = oxcf->worst_allowed_q;
+  cpi_->best_quality = oxcf->best_allowed_q;
+  cpi_->output_framerate = rc_cfg.framerate;
+  oxcf->target_bandwidth =
+      static_cast<unsigned int>(1000 * rc_cfg.target_bandwidth);
+  cpi_->ref_framerate = cpi_->output_framerate;
+  oxcf->fixed_q = -1;
+  oxcf->error_resilient_mode = 1;
+  oxcf->starting_buffer_level_in_ms = rc_cfg.buf_initial_sz;
+  oxcf->optimal_buffer_level_in_ms = rc_cfg.buf_optimal_sz;
+  oxcf->maximum_buffer_size_in_ms = rc_cfg.buf_sz;
+  oxcf->starting_buffer_level = rc_cfg.buf_initial_sz;
+  oxcf->optimal_buffer_level = rc_cfg.buf_optimal_sz;
+  oxcf->maximum_buffer_size = rc_cfg.buf_sz;
+  oxcf->number_of_layers = rc_cfg.ts_number_layers;
+  cpi_->buffered_mode = oxcf->optimal_buffer_level > 0;
+  oxcf->under_shoot_pct = rc_cfg.undershoot_pct;
+  oxcf->over_shoot_pct = rc_cfg.overshoot_pct;
+  oxcf->drop_frames_water_mark = rc_cfg.frame_drop_thresh;
+  if (oxcf->drop_frames_water_mark > 0) cpi_->drop_frames_allowed = 1;
+  cpi_->oxcf.rc_max_intra_bitrate_pct = rc_cfg.max_intra_bitrate_pct;
+  cpi_->framerate = rc_cfg.framerate;
+  for (int i = 0; i < KEY_FRAME_CONTEXT; ++i) {
+    cpi_->prior_key_frame_distance[i] =
+        static_cast<int>(cpi_->output_framerate);
+  }
+  oxcf->screen_content_mode = rc_cfg.is_screen;
+  if (oxcf->number_of_layers > 1 || prev_number_of_layers > 1) {
+    memcpy(oxcf->target_bitrate, rc_cfg.layer_target_bitrate,
+           sizeof(rc_cfg.layer_target_bitrate));
+    memcpy(oxcf->rate_decimator, rc_cfg.ts_rate_decimator,
+           sizeof(rc_cfg.ts_rate_decimator));
+    if (cm->current_video_frame == 0) {
+      double prev_layer_framerate = 0;
+      for (unsigned int i = 0; i < oxcf->number_of_layers; ++i) {
+        vp8_init_temporal_layer_context(cpi_, oxcf, i, prev_layer_framerate);
+        prev_layer_framerate = cpi_->output_framerate / oxcf->rate_decimator[i];
+      }
+    } else if (oxcf->number_of_layers != prev_number_of_layers) {
+      // The number of temporal layers has changed, so reset/initialize the
+      // temporal layer context for the new layer configuration: this means
+      // calling vp8_reset_temporal_layer_change() below.
+
+      // Start at the base of the pattern cycle, so set the layer id to 0 and
+      // reset the temporal pattern counter.
+      // TODO(marpan/jianj): don't think lines 148-151 are needed (user controls
+      // the layer_id) so remove.
+      if (cpi_->temporal_layer_id > 0) {
+        cpi_->temporal_layer_id = 0;
+      }
+      cpi_->temporal_pattern_counter = 0;
+
+      vp8_reset_temporal_layer_change(cpi_, oxcf,
+                                      static_cast<int>(prev_number_of_layers));
+    }
+  }
+
+  cpi_->total_actual_bits = 0;
+  cpi_->total_target_vs_actual = 0;
+
+  cm->mb_rows = cm->Height >> 4;
+  cm->mb_cols = cm->Width >> 4;
+  cm->MBs = cm->mb_rows * cm->mb_cols;
+  cm->mode_info_stride = cm->mb_cols + 1;
+
+  // For temporal layers: starting/maximum/optimal_buffer_level is already set
+  // via vp8_init_temporal_layer_context() or vp8_reset_temporal_layer_change().
+  if (oxcf->number_of_layers <= 1 && prev_number_of_layers <= 1) {
+    oxcf->starting_buffer_level =
+        rescale((int)oxcf->starting_buffer_level, oxcf->target_bandwidth, 1000);
+    /* Set or reset optimal and maximum buffer levels. */
+    if (oxcf->optimal_buffer_level == 0) {
+      oxcf->optimal_buffer_level = oxcf->target_bandwidth / 8;
+    } else {
+      oxcf->optimal_buffer_level = rescale((int)oxcf->optimal_buffer_level,
+                                           oxcf->target_bandwidth, 1000);
+    }
+    if (oxcf->maximum_buffer_size == 0) {
+      oxcf->maximum_buffer_size = oxcf->target_bandwidth / 8;
+    } else {
+      oxcf->maximum_buffer_size =
+          rescale((int)oxcf->maximum_buffer_size, oxcf->target_bandwidth, 1000);
+    }
+  }
+
+  if (cpi_->bits_off_target > oxcf->maximum_buffer_size) {
+    cpi_->bits_off_target = oxcf->maximum_buffer_size;
+    cpi_->buffer_level = cpi_->bits_off_target;
+  }
+
+  vp8_new_framerate(cpi_, cpi_->framerate);
+  vpx_clear_system_state();
+  return true;
+}
+
+FrameDropDecision VP8RateControlRTC::ComputeQP(
+    const VP8FrameParamsQpRTC &frame_params) {
+  VP8_COMMON *const cm = &cpi_->common;
+  vpx_clear_system_state();
+  if (cpi_->oxcf.number_of_layers > 1) {
+    cpi_->temporal_layer_id = frame_params.temporal_layer_id;
+    const int layer = frame_params.temporal_layer_id;
+    vp8_update_layer_contexts(cpi_);
+    /* Restore layer specific context & set frame rate */
+    vp8_restore_layer_context(cpi_, layer);
+    vp8_new_framerate(cpi_, cpi_->layer_context[layer].framerate);
+  }
+  cm->frame_type = static_cast<FRAME_TYPE>(frame_params.frame_type);
+  cm->refresh_golden_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0;
+  cm->refresh_alt_ref_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0;
+  if (cm->frame_type == KEY_FRAME && cpi_->common.current_video_frame > 0) {
+    cpi_->common.frame_flags |= FRAMEFLAGS_KEY;
+  }
+
+  cpi_->per_frame_bandwidth = static_cast<int>(
+      round(cpi_->oxcf.target_bandwidth / cpi_->output_framerate));
+  if (vp8_check_drop_buffer(cpi_)) {
+    if (cpi_->oxcf.number_of_layers > 1) vp8_save_layer_context(cpi_);
+    return FrameDropDecision::kDrop;
+  }
+
+  if (!vp8_pick_frame_size(cpi_)) {
+    cm->current_video_frame++;
+    cpi_->frames_since_key++;
+    cpi_->ext_refresh_frame_flags_pending = 0;
+    if (cpi_->oxcf.number_of_layers > 1) vp8_save_layer_context(cpi_);
+    return FrameDropDecision::kDrop;
+  }
+
+  if (cpi_->buffer_level >= cpi_->oxcf.optimal_buffer_level &&
+      cpi_->buffered_mode) {
+    /* Max adjustment is 1/4 */
+    int Adjustment = cpi_->active_worst_quality / 4;
+    if (Adjustment) {
+      int buff_lvl_step;
+      if (cpi_->buffer_level < cpi_->oxcf.maximum_buffer_size) {
+        buff_lvl_step = (int)((cpi_->oxcf.maximum_buffer_size -
+                               cpi_->oxcf.optimal_buffer_level) /
+                              Adjustment);
+        if (buff_lvl_step) {
+          Adjustment =
+              (int)((cpi_->buffer_level - cpi_->oxcf.optimal_buffer_level) /
+                    buff_lvl_step);
+        } else {
+          Adjustment = 0;
+        }
+      }
+      cpi_->active_worst_quality -= Adjustment;
+      if (cpi_->active_worst_quality < cpi_->active_best_quality) {
+        cpi_->active_worst_quality = cpi_->active_best_quality;
+      }
+    }
+  }
+
+  if (cpi_->ni_frames > 150) {
+    int q = cpi_->active_worst_quality;
+    if (cm->frame_type == KEY_FRAME) {
+      cpi_->active_best_quality = kf_high_motion_minq[q];
+    } else {
+      cpi_->active_best_quality = inter_minq[q];
+    }
+
+    if (cpi_->buffer_level >= cpi_->oxcf.maximum_buffer_size) {
+      cpi_->active_best_quality = cpi_->best_quality;
+
+    } else if (cpi_->buffer_level > cpi_->oxcf.optimal_buffer_level) {
+      int Fraction =
+          (int)(((cpi_->buffer_level - cpi_->oxcf.optimal_buffer_level) * 128) /
+                (cpi_->oxcf.maximum_buffer_size -
+                 cpi_->oxcf.optimal_buffer_level));
+      int min_qadjustment =
+          ((cpi_->active_best_quality - cpi_->best_quality) * Fraction) / 128;
+
+      cpi_->active_best_quality -= min_qadjustment;
+    }
+  }
+
+  /* Clip the active best and worst quality values to limits */
+  if (cpi_->active_worst_quality > cpi_->worst_quality) {
+    cpi_->active_worst_quality = cpi_->worst_quality;
+  }
+  if (cpi_->active_best_quality < cpi_->best_quality) {
+    cpi_->active_best_quality = cpi_->best_quality;
+  }
+  if (cpi_->active_worst_quality < cpi_->active_best_quality) {
+    cpi_->active_worst_quality = cpi_->active_best_quality;
+  }
+
+  q_ = vp8_regulate_q(cpi_, cpi_->this_frame_target);
+  vp8_set_quantizer(cpi_, q_);
+  vpx_clear_system_state();
+  return FrameDropDecision::kOk;
+}
+
+int VP8RateControlRTC::GetQP() const { return q_; }
+
+int VP8RateControlRTC::GetLoopfilterLevel() const {
+  VP8_COMMON *cm = &cpi_->common;
+  const double qp = q_;
+
+  // This model is from linear regression
+  if (cm->Width * cm->Height <= 320 * 240) {
+    cm->filter_level = static_cast<int>(0.352685 * qp + 2.957774);
+  } else if (cm->Width * cm->Height <= 640 * 480) {
+    cm->filter_level = static_cast<int>(0.485069 * qp - 0.534462);
+  } else {
+    cm->filter_level = static_cast<int>(0.314875 * qp + 7.959003);
+  }
+
+  int min_filter_level = 0;
+  // This logic is from get_min_filter_level() in picklpf.c
+  if (q_ > 6 && q_ <= 16) {
+    min_filter_level = 1;
+  } else {
+    min_filter_level = (q_ / 8);
+  }
+
+  const int max_filter_level = 63;
+  if (cm->filter_level < min_filter_level) cm->filter_level = min_filter_level;
+  if (cm->filter_level > max_filter_level) cm->filter_level = max_filter_level;
+
+  return cm->filter_level;
+}
+
+void VP8RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) {
+  VP8_COMMON *const cm = &cpi_->common;
+  vpx_clear_system_state();
+  cpi_->total_byte_count += encoded_frame_size;
+  cpi_->projected_frame_size = static_cast<int>(encoded_frame_size << 3);
+  if (cpi_->oxcf.number_of_layers > 1) {
+    for (unsigned int i = cpi_->current_layer + 1;
+         i < cpi_->oxcf.number_of_layers; ++i) {
+      cpi_->layer_context[i].total_byte_count += encoded_frame_size;
+    }
+  }
+
+  vp8_update_rate_correction_factors(cpi_, 2);
+
+  cpi_->last_q[cm->frame_type] = cm->base_qindex;
+
+  if (cm->frame_type == KEY_FRAME) {
+    vp8_adjust_key_frame_context(cpi_);
+  }
+
+  /* Keep a record of ambient average Q. */
+  if (cm->frame_type != KEY_FRAME) {
+    cpi_->avg_frame_qindex =
+        (2 + 3 * cpi_->avg_frame_qindex + cm->base_qindex) >> 2;
+  }
+  /* Keep a record from which we can calculate the average Q excluding
+   * key frames.
+   */
+  if (cm->frame_type != KEY_FRAME) {
+    cpi_->ni_frames++;
+    /* Damp value for first few frames */
+    if (cpi_->ni_frames > 150) {
+      cpi_->ni_tot_qi += q_;
+      cpi_->ni_av_qi = (cpi_->ni_tot_qi / cpi_->ni_frames);
+    } else {
+      cpi_->ni_tot_qi += q_;
+      cpi_->ni_av_qi =
+          ((cpi_->ni_tot_qi / cpi_->ni_frames) + cpi_->worst_quality + 1) / 2;
+    }
+
+    /* If the average Q is higher than what was used in the last
+     * frame (after going through the recode loop to keep the frame
+     * size within range) then use the last frame value - 1. The -1
+     * is designed to stop Q and hence the data rate, from
+     * progressively falling away during difficult sections, but at
+     * the same time reduce the number of itterations around the
+     * recode loop.
+     */
+    if (q_ > cpi_->ni_av_qi) cpi_->ni_av_qi = q_ - 1;
+  }
+
+  cpi_->bits_off_target +=
+      cpi_->av_per_frame_bandwidth - cpi_->projected_frame_size;
+  if (cpi_->bits_off_target > cpi_->oxcf.maximum_buffer_size) {
+    cpi_->bits_off_target = cpi_->oxcf.maximum_buffer_size;
+  }
+
+  cpi_->total_actual_bits += cpi_->projected_frame_size;
+  cpi_->buffer_level = cpi_->bits_off_target;
+
+  /* Propagate values to higher temporal layers */
+  if (cpi_->oxcf.number_of_layers > 1) {
+    for (unsigned int i = cpi_->current_layer + 1;
+         i < cpi_->oxcf.number_of_layers; ++i) {
+      LAYER_CONTEXT *lc = &cpi_->layer_context[i];
+      int bits_off_for_this_layer = (int)round(
+          lc->target_bandwidth / lc->framerate - cpi_->projected_frame_size);
+
+      lc->bits_off_target += bits_off_for_this_layer;
+
+      /* Clip buffer level to maximum buffer size for the layer */
+      if (lc->bits_off_target > lc->maximum_buffer_size) {
+        lc->bits_off_target = lc->maximum_buffer_size;
+      }
+
+      lc->total_actual_bits += cpi_->projected_frame_size;
+      lc->total_target_vs_actual += bits_off_for_this_layer;
+      lc->buffer_level = lc->bits_off_target;
+    }
+  }
+
+  cpi_->common.current_video_frame++;
+  cpi_->frames_since_key++;
+
+  if (cpi_->oxcf.number_of_layers > 1) vp8_save_layer_context(cpi_);
+  vpx_clear_system_state();
+}
+}  // namespace libvpx
diff --git a/vp8/vp8_ratectrl_rtc.h b/vp8/vp8_ratectrl_rtc.h
new file mode 100644
index 0000000000..59fb607526
--- /dev/null
+++ b/vp8/vp8_ratectrl_rtc.h
@@ -0,0 +1,64 @@
+/*
+ *  Copyright (c) 2021 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_RATECTRL_RTC_H_
+#define VPX_VP8_RATECTRL_RTC_H_
+
+#include <cstdint>
+#include <cstring>
+#include <memory>
+
+#include "vpx/internal/vpx_ratectrl_rtc.h"
+
+struct VP8_COMP;
+
+namespace libvpx {
+struct VP8RateControlRtcConfig : public VpxRateControlRtcConfig {
+ public:
+  VP8RateControlRtcConfig() {
+    memset(&layer_target_bitrate, 0, sizeof(layer_target_bitrate));
+    memset(&ts_rate_decimator, 0, sizeof(ts_rate_decimator));
+  }
+};
+
+struct VP8FrameParamsQpRTC {
+  RcFrameType frame_type;
+  int temporal_layer_id;
+};
+
+class VP8RateControlRTC {
+ public:
+  static std::unique_ptr<VP8RateControlRTC> Create(
+      const VP8RateControlRtcConfig &cfg);
+  ~VP8RateControlRTC();
+
+  bool UpdateRateControl(const VP8RateControlRtcConfig &rc_cfg);
+  // GetQP() needs to be called after ComputeQP() to get the latest QP
+  int GetQP() const;
+  // GetLoopfilterLevel() needs to be called after ComputeQP() since loopfilter
+  // level is calculated from frame qp.
+  int GetLoopfilterLevel() const;
+  // ComputeQP computes the QP if the frame is not dropped (kOk return),
+  // otherwise it returns kDrop and subsequent GetQP and PostEncodeUpdate
+  // are not to be called.
+  FrameDropDecision ComputeQP(const VP8FrameParamsQpRTC &frame_params);
+  // Feedback to rate control with the size of current encoded frame
+  void PostEncodeUpdate(uint64_t encoded_frame_size);
+
+ private:
+  VP8RateControlRTC() {}
+  bool InitRateControl(const VP8RateControlRtcConfig &cfg);
+  struct VP8_COMP *cpi_;
+  int q_;
+};
+
+}  // namespace libvpx
+
+#endif  // VPX_VP8_RATECTRL_RTC_H_
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index 3a8f8ea45a..b4b3fda9ea 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -124,4 +124,9 @@ ifeq ($(CONFIG_REALTIME_ONLY),yes)
 VP8_CX_SRCS_REMOVE-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c
 endif
 
+# common (loongarch LSX intrinsics)
+VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/dct_lsx.c
+VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/encodeopt_lsx.c
+VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/vp8_quantize_lsx.c
+
 VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes))
diff --git a/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c b/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c
index 219ff63cb8..b43d7fa4f9 100644
--- a/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c
+++ b/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c
@@ -18,7 +18,7 @@
 #include "vpx_dsp/arm/transpose_neon.h"
 #include "vpx_dsp/inv_txfm.h"
 
-// Use macros to make sure argument lane is passed in as an constant integer.
+// Use macros to make sure argument lane is passed in as a constant integer.
 
 #define vmull_lane_s32_dual(in, c, lane, out)                          \
   do {                                                                 \
@@ -64,9 +64,9 @@ highbd_dct_const_round_shift_low_8(const int64x2x2_t *const in) {
 
 #define highbd_iadst_half_butterfly(in, c, lane, out) \
   do {                                                \
-    int64x2x2_t t[2];                                 \
-    vmull_lane_s32_dual(in, c, lane, t);              \
-    out = highbd_dct_const_round_shift_low_8(t);      \
+    int64x2x2_t _t[2];                                \
+    vmull_lane_s32_dual(in, c, lane, _t);             \
+    out = highbd_dct_const_round_shift_low_8(_t);     \
   } while (0)
 
 #define highbd_iadst_butterfly(in0, in1, c, lane0, lane1, s0, s1) \
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index 5702dca718..9e73e40ea0 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -65,6 +65,7 @@ static void free_seg_map(VP9_COMMON *cm) {
     vpx_free(cm->seg_map_array[i]);
     cm->seg_map_array[i] = NULL;
   }
+  cm->seg_map_alloc_size = 0;
 
   cm->current_frame_seg_map = NULL;
   cm->last_frame_seg_map = NULL;
@@ -73,6 +74,8 @@ static void free_seg_map(VP9_COMMON *cm) {
 void vp9_free_ref_frame_buffers(BufferPool *pool) {
   int i;
 
+  if (!pool) return;
+
   for (i = 0; i < FRAME_BUFFERS; ++i) {
     if (!pool->frame_bufs[i].released &&
         pool->frame_bufs[i].raw_frame_buffer.data != NULL) {
@@ -100,12 +103,13 @@ void vp9_free_postproc_buffers(VP9_COMMON *cm) {
 }
 
 void vp9_free_context_buffers(VP9_COMMON *cm) {
-  cm->free_mi(cm);
+  if (cm->free_mi) cm->free_mi(cm);
   free_seg_map(cm);
   vpx_free(cm->above_context);
   cm->above_context = NULL;
   vpx_free(cm->above_seg_context);
   cm->above_seg_context = NULL;
+  cm->above_context_alloc_cols = 0;
   vpx_free(cm->lf.lfm);
   cm->lf.lfm = NULL;
 }
@@ -131,13 +135,6 @@ int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) {
     cm->free_mi(cm);
     if (cm->alloc_mi(cm, new_mi_size)) goto fail;
   }
-
-  if (cm->seg_map_alloc_size < cm->mi_rows * cm->mi_cols) {
-    // Create the segmentation map structure and set to 0.
-    free_seg_map(cm);
-    if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols)) goto fail;
-  }
-
   if (cm->above_context_alloc_cols < cm->mi_cols) {
     vpx_free(cm->above_context);
     cm->above_context = (ENTROPY_CONTEXT *)vpx_calloc(
@@ -152,6 +149,12 @@ int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) {
     cm->above_context_alloc_cols = cm->mi_cols;
   }
 
+  if (cm->seg_map_alloc_size < cm->mi_rows * cm->mi_cols) {
+    // Create the segmentation map structure and set to 0.
+    free_seg_map(cm);
+    if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols)) goto fail;
+  }
+
   if (vp9_alloc_loop_filter(cm)) goto fail;
 
   return 0;
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index d7de46cf4f..aa13d8a0d5 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -54,7 +54,7 @@ typedef struct {
 // decoder implementation modules critically rely on the defined entry values
 // specified herein. They should be refactored concurrently.
 
-#define NONE (-1)
+#define NO_REF_FRAME (-1)
 #define INTRA_FRAME 0
 #define LAST_FRAME 1
 #define GOLDEN_FRAME 2
diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h
index e3c5535ddb..d63bad93d1 100644
--- a/vp9/common/vp9_common.h
+++ b/vp9/common/vp9_common.h
@@ -27,10 +27,10 @@ extern "C" {
 
 // Only need this for fixed-size arrays, for structs just assign.
 #define vp9_copy(dest, src)              \
-  {                                      \
+  do {                                   \
     assert(sizeof(dest) == sizeof(src)); \
     memcpy(dest, src, sizeof(src));      \
-  }
+  } while (0)
 
 // Use this for variably-sized arrays.
 #define vp9_copy_array(dest, src, n)           \
@@ -46,25 +46,6 @@ static INLINE int get_unsigned_bits(unsigned int num_values) {
   return num_values > 0 ? get_msb(num_values) + 1 : 0;
 }
 
-#if CONFIG_DEBUG
-#define CHECK_MEM_ERROR(cm, lval, expr)                                     \
-  do {                                                                      \
-    (lval) = (expr);                                                        \
-    if (!(lval))                                                            \
-      vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR,                 \
-                         "Failed to allocate " #lval " at %s:%d", __FILE__, \
-                         __LINE__);                                         \
-  } while (0)
-#else
-#define CHECK_MEM_ERROR(cm, lval, expr)                     \
-  do {                                                      \
-    (lval) = (expr);                                        \
-    if (!(lval))                                            \
-      vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, \
-                         "Failed to allocate " #lval);      \
-  } while (0)
-#endif
-
 #define VP9_SYNC_CODE_0 0x49
 #define VP9_SYNC_CODE_1 0x83
 #define VP9_SYNC_CODE_2 0x42
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index bda824de3c..9289fc9e1f 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -381,7 +381,6 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
   }
 
   if (cm->tx_mode == TX_MODE_SELECT) {
-    int j;
     unsigned int branch_ct_8x8p[TX_SIZES - 3][2];
     unsigned int branch_ct_16x16p[TX_SIZES - 2][2];
     unsigned int branch_ct_32x32p[TX_SIZES - 1][2];
diff --git a/vp9/common/vp9_frame_buffers.c b/vp9/common/vp9_frame_buffers.c
index a254e79d20..889b809e50 100644
--- a/vp9/common/vp9_frame_buffers.c
+++ b/vp9/common/vp9_frame_buffers.c
@@ -14,14 +14,17 @@
 #include "vpx_mem/vpx_mem.h"
 
 int vp9_alloc_internal_frame_buffers(InternalFrameBufferList *list) {
+  const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS;
   assert(list != NULL);
   vp9_free_internal_frame_buffers(list);
 
-  list->num_internal_frame_buffers =
-      VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS;
-  list->int_fb = (InternalFrameBuffer *)vpx_calloc(
-      list->num_internal_frame_buffers, sizeof(*list->int_fb));
-  return (list->int_fb == NULL);
+  list->int_fb =
+      (InternalFrameBuffer *)vpx_calloc(num_buffers, sizeof(*list->int_fb));
+  if (list->int_fb) {
+    list->num_internal_frame_buffers = num_buffers;
+    return 0;
+  }
+  return -1;
 }
 
 void vp9_free_internal_frame_buffers(InternalFrameBufferList *list) {
@@ -35,6 +38,7 @@ void vp9_free_internal_frame_buffers(InternalFrameBufferList *list) {
   }
   vpx_free(list->int_fb);
   list->int_fb = NULL;
+  list->num_internal_frame_buffers = 0;
 }
 
 int vp9_get_frame_buffer(void *cb_priv, size_t min_size,
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index 69069042cc..71be0f310d 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -150,6 +150,7 @@ void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
 
 void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
                        int eob) {
+  assert(((intptr_t)input) % 32 == 0);
   /* The calculation can be simplified if there are not many non-zero dct
    * coefficients. Use eobs to separate different cases. */
   if (eob == 1) /* DC only DCT coefficient. */
@@ -164,6 +165,7 @@ void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
 
 void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
                        int eob) {
+  assert(((intptr_t)input) % 32 == 0);
   if (eob == 1)
     vpx_idct32x32_1_add(input, dest, stride);
   else if (eob <= 34)
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 95d6029f3b..1a9d45ae77 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -932,32 +932,32 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
       break;
     default:
       for (idx_32 = 0; idx_32 < 4; mip += offset_32[idx_32], ++idx_32) {
-        const int shift_y = shift_32_y[idx_32];
-        const int shift_uv = shift_32_uv[idx_32];
+        const int shift_y_32 = shift_32_y[idx_32];
+        const int shift_uv_32 = shift_32_uv[idx_32];
         const int mi_32_col_offset = ((idx_32 & 1) << 2);
         const int mi_32_row_offset = ((idx_32 >> 1) << 2);
         if (mi_32_col_offset >= max_cols || mi_32_row_offset >= max_rows)
           continue;
         switch (mip[0]->sb_type) {
           case BLOCK_32X32:
-            build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+            build_masks(lfi_n, mip[0], shift_y_32, shift_uv_32, lfm);
             break;
           case BLOCK_32X16:
-            build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+            build_masks(lfi_n, mip[0], shift_y_32, shift_uv_32, lfm);
             if (mi_32_row_offset + 2 >= max_rows) continue;
             mip2 = mip + mode_info_stride * 2;
-            build_masks(lfi_n, mip2[0], shift_y + 16, shift_uv + 4, lfm);
+            build_masks(lfi_n, mip2[0], shift_y_32 + 16, shift_uv_32 + 4, lfm);
             break;
           case BLOCK_16X32:
-            build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+            build_masks(lfi_n, mip[0], shift_y_32, shift_uv_32, lfm);
             if (mi_32_col_offset + 2 >= max_cols) continue;
             mip2 = mip + 2;
-            build_masks(lfi_n, mip2[0], shift_y + 2, shift_uv + 1, lfm);
+            build_masks(lfi_n, mip2[0], shift_y_32 + 2, shift_uv_32 + 1, lfm);
             break;
           default:
             for (idx_16 = 0; idx_16 < 4; mip += offset_16[idx_16], ++idx_16) {
-              const int shift_y = shift_32_y[idx_32] + shift_16_y[idx_16];
-              const int shift_uv = shift_32_uv[idx_32] + shift_16_uv[idx_16];
+              const int shift_y_16 = shift_y_32 + shift_16_y[idx_16];
+              const int shift_uv_16 = shift_uv_32 + shift_16_uv[idx_16];
               const int mi_16_col_offset =
                   mi_32_col_offset + ((idx_16 & 1) << 1);
               const int mi_16_row_offset =
@@ -968,28 +968,26 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
 
               switch (mip[0]->sb_type) {
                 case BLOCK_16X16:
-                  build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+                  build_masks(lfi_n, mip[0], shift_y_16, shift_uv_16, lfm);
                   break;
                 case BLOCK_16X8:
-                  build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+                  build_masks(lfi_n, mip[0], shift_y_16, shift_uv_16, lfm);
                   if (mi_16_row_offset + 1 >= max_rows) continue;
                   mip2 = mip + mode_info_stride;
-                  build_y_mask(lfi_n, mip2[0], shift_y + 8, lfm);
+                  build_y_mask(lfi_n, mip2[0], shift_y_16 + 8, lfm);
                   break;
                 case BLOCK_8X16:
-                  build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+                  build_masks(lfi_n, mip[0], shift_y_16, shift_uv_16, lfm);
                   if (mi_16_col_offset + 1 >= max_cols) continue;
                   mip2 = mip + 1;
-                  build_y_mask(lfi_n, mip2[0], shift_y + 1, lfm);
+                  build_y_mask(lfi_n, mip2[0], shift_y_16 + 1, lfm);
                   break;
                 default: {
-                  const int shift_y =
-                      shift_32_y[idx_32] + shift_16_y[idx_16] + shift_8_y[0];
-                  build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+                  const int shift_y_8_0 = shift_y_16 + shift_8_y[0];
+                  build_masks(lfi_n, mip[0], shift_y_8_0, shift_uv_16, lfm);
                   mip += offset[0];
                   for (idx_8 = 1; idx_8 < 4; mip += offset[idx_8], ++idx_8) {
-                    const int shift_y = shift_32_y[idx_32] +
-                                        shift_16_y[idx_16] + shift_8_y[idx_8];
+                    const int shift_y_8 = shift_y_16 + shift_8_y[idx_8];
                     const int mi_8_col_offset =
                         mi_16_col_offset + ((idx_8 & 1));
                     const int mi_8_row_offset =
@@ -998,7 +996,7 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
                     if (mi_8_col_offset >= max_cols ||
                         mi_8_row_offset >= max_rows)
                       continue;
-                    build_y_mask(lfi_n, mip[0], shift_y, lfm);
+                    build_y_mask(lfi_n, mip[0], shift_y_8, lfm);
                   }
                   break;
                 }
@@ -1180,7 +1178,7 @@ void vp9_filter_block_plane_non420(VP9_COMMON *cm,
     }
 
     // Disable filtering on the leftmost column
-    border_mask = ~(mi_col == 0 ? 1 : 0);
+    border_mask = ~(mi_col == 0 ? 1u : 0u);
 #if CONFIG_VP9_HIGHBITDEPTH
     if (cm->use_highbitdepth) {
       highbd_filter_selectively_vert(
diff --git a/vp9/common/vp9_mfqe.c b/vp9/common/vp9_mfqe.c
index e76d771b8d..cf60fa40fd 100644
--- a/vp9/common/vp9_mfqe.c
+++ b/vp9/common/vp9_mfqe.c
@@ -217,6 +217,7 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs,
   const int bsl = b_width_log2_lookup[bs];
   PARTITION_TYPE partition = partition_lookup[bsl][cur_bs];
   const BLOCK_SIZE subsize = get_subsize(bs, partition);
+  BLOCK_SIZE mfqe_bs, bs_tmp;
 
   if (cur_bs < BLOCK_8X8) {
     // If there are blocks smaller than 8x8, it must be on the boundary.
@@ -236,7 +237,6 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs,
     uv_offset = 8;
   }
   switch (partition) {
-    BLOCK_SIZE mfqe_bs, bs_tmp;
     case PARTITION_HORZ:
       if (bs == BLOCK_64X64) {
         mfqe_bs = BLOCK_64X32;
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index 6f9c6985f0..1cfc12f6fa 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -75,12 +75,10 @@ typedef struct {
 
   // TODO(angiebird): Set frame_index/frame_coding_index on the decoder side
   // properly.
-  int frame_index;  // Display order in the video, it's equivalent to the
-                    // show_idx defined in EncodeFrameInfo.
-#if CONFIG_RATE_CTRL
+  int frame_index;         // Display order in the video, it's equivalent to the
+                           // show_idx defined in EncodeFrameInfo.
   int frame_coding_index;  // The coding order (starting from zero) of this
                            // frame.
-#endif                     // CONFIG_RATE_CTRL
   vpx_codec_frame_buffer_t raw_frame_buffer;
   YV12_BUFFER_CONFIG buf;
 } RefCntBuffer;
@@ -240,13 +238,11 @@ typedef struct VP9Common {
   // TODO(angiebird): current_video_frame/current_frame_coding_index into a
   // structure
   unsigned int current_video_frame;
-#if CONFIG_RATE_CTRL
   // Each show or no show frame is assigned with a coding index based on its
   // coding order (starting from zero).
 
   // Current frame's coding index.
   int current_frame_coding_index;
-#endif
   BITSTREAM_PROFILE profile;
 
   // VPX_BITS_8 in profile 0 or 1, VPX_BITS_10 or VPX_BITS_12 in profile 2 or 3.
@@ -276,9 +272,7 @@ typedef struct VP9Common {
 
 static INLINE void init_frame_indexes(VP9_COMMON *cm) {
   cm->current_video_frame = 0;
-#if CONFIG_RATE_CTRL
   cm->current_frame_coding_index = 0;
-#endif  // CONFIG_RATE_CTRL
 }
 
 static INLINE void update_frame_indexes(VP9_COMMON *cm, int show_frame) {
@@ -287,9 +281,7 @@ static INLINE void update_frame_indexes(VP9_COMMON *cm, int show_frame) {
     // update not a real frame
     ++cm->current_video_frame;
   }
-#if CONFIG_RATE_CTRL
   ++cm->current_frame_coding_index;
-#endif  // CONFIG_RATE_CTRL
 }
 
 typedef struct {
diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c
index d2c8535b01..96519f0051 100644
--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c
@@ -360,6 +360,7 @@ int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest,
     if (!cm->postproc_state.limits) {
       cm->postproc_state.limits =
           vpx_calloc(unscaled_width, sizeof(*cm->postproc_state.limits));
+      if (!cm->postproc_state.limits) return 1;
     }
   }
 
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index ff59ff5042..0a60b853d8 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -13,12 +13,13 @@
 #include "./vpx_scale_rtcd.h"
 #include "./vpx_config.h"
 
-#include "vpx/vpx_integer.h"
-
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
 
+#include "vpx/vpx_integer.h"
+#include "vpx_scale/yv12config.h"
+
 #if CONFIG_VP9_HIGHBITDEPTH
 void vp9_highbd_build_inter_predictor(
     const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
@@ -158,18 +159,19 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
       // Co-ordinate of containing block to pixel precision.
       const int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
       const int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
+      const YV12_BUFFER_CONFIG *ref_buf = xd->block_refs[ref]->buf;
+      uint8_t *buf_array[] = { ref_buf->y_buffer, ref_buf->u_buffer,
+                               ref_buf->v_buffer };
+      const int stride_array[] = { ref_buf->y_stride, ref_buf->uv_stride,
+                                   ref_buf->uv_stride };
 #if 0  // CONFIG_BETTER_HW_COMPATIBILITY
       assert(xd->mi[0]->sb_type != BLOCK_4X8 &&
              xd->mi[0]->sb_type != BLOCK_8X4);
       assert(mv_q4.row == mv.row * (1 << (1 - pd->subsampling_y)) &&
              mv_q4.col == mv.col * (1 << (1 - pd->subsampling_x)));
 #endif
-      if (plane == 0)
-        pre_buf->buf = xd->block_refs[ref]->buf->y_buffer;
-      else if (plane == 1)
-        pre_buf->buf = xd->block_refs[ref]->buf->u_buffer;
-      else
-        pre_buf->buf = xd->block_refs[ref]->buf->v_buffer;
+      pre_buf->buf = buf_array[plane];
+      pre_buf->stride = stride_array[plane];
 
       pre_buf->buf +=
           scaled_buffer_offset(x_start + x, y_start + y, pre_buf->stride, sf);
diff --git a/vp9/common/vp9_rtcd.c b/vp9/common/vp9_rtcd.c
index d8c870aa3f..37762ca15a 100644
--- a/vp9/common/vp9_rtcd.c
+++ b/vp9/common/vp9_rtcd.c
@@ -12,8 +12,4 @@
 #include "./vp9_rtcd.h"
 #include "vpx_ports/vpx_once.h"
 
-void vp9_rtcd() {
-  // TODO(JBB): Remove this once, by insuring that both the encoder and
-  // decoder setup functions are protected by once();
-  once(setup_rtcd_internal);
-}
+void vp9_rtcd() { once(setup_rtcd_internal); }
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 6980b9b7fb..3ecbd5417f 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -23,7 +23,9 @@ ()
 
 /* Encoder forward decls */
 struct macroblock;
-struct vp9_variance_vtable;
+struct macroblock_plane;
+struct vp9_sad_table;
+struct ScanOrder;
 struct search_site_config;
 struct mv;
 union int_mv;
@@ -127,24 +129,21 @@ ()
 add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
 
 add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size";
+specialize qw/vp9_block_error_fp neon avx2 sse2/;
 
-add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-specialize qw/vp9_quantize_fp neon sse2 avx2 vsx/, "$ssse3_x86_64";
+add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order";
+specialize qw/vp9_quantize_fp neon sse2 ssse3 avx2 vsx/;
 
-add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-specialize qw/vp9_quantize_fp_32x32 neon vsx/, "$ssse3_x86_64";
+add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order";
+specialize qw/vp9_quantize_fp_32x32 neon ssse3 avx2 vsx/;
 
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-  specialize qw/vp9_block_error avx2 sse2/;
-
-  specialize qw/vp9_block_error_fp avx2 sse2/;
+  specialize qw/vp9_block_error neon avx2 sse2/;
 
   add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
-  specialize qw/vp9_highbd_block_error sse2/;
+  specialize qw/vp9_highbd_block_error neon sse2/;
 } else {
-  specialize qw/vp9_block_error avx2 msa sse2/;
-
-  specialize qw/vp9_block_error_fp neon avx2 sse2/;
+  specialize qw/vp9_block_error neon avx2 msa sse2/;
 }
 
 # fdct functions
@@ -159,9 +158,9 @@ ()
 
 # Note that there are more specializations appended when CONFIG_VP9_HIGHBITDEPTH
 # is off.
-specialize qw/vp9_fht4x4 sse2/;
-specialize qw/vp9_fht8x8 sse2/;
-specialize qw/vp9_fht16x16 sse2/;
+specialize qw/vp9_fht4x4 sse2 neon/;
+specialize qw/vp9_fht8x8 sse2 neon/;
+specialize qw/vp9_fht16x16 sse2 neon/;
 specialize qw/vp9_fwht4x4 sse2/;
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") {
   # Note that these specializations are appended to the above ones.
@@ -174,19 +173,19 @@ ()
 #
 # Motion search
 #
-add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
-specialize qw/vp9_diamond_search_sad avx/;
+add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv";
+specialize qw/vp9_diamond_search_sad neon/;
 
 #
 # Apply temporal filter
 #
 if (vpx_config("CONFIG_REALTIME_ONLY") ne "yes") {
 add_proto qw/void vp9_apply_temporal_filter/, "const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count";
-specialize qw/vp9_apply_temporal_filter sse4_1/;
+specialize qw/vp9_apply_temporal_filter sse4_1 neon/;
 
   if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
     add_proto qw/void vp9_highbd_apply_temporal_filter/, "const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count";
-    specialize qw/vp9_highbd_apply_temporal_filter sse4_1/;
+    specialize qw/vp9_highbd_apply_temporal_filter sse4_1 neon/;
   }
 }
 
@@ -195,16 +194,21 @@ ()
 
   # ENCODEMB INVOKE
 
-  add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order";
+  specialize qw/vp9_highbd_quantize_fp avx2 neon/;
 
-  add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" ;
+  add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order";
+  specialize qw/vp9_highbd_quantize_fp_32x32 avx2 neon/;
 
   # fdct functions
   add_proto qw/void vp9_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_highbd_fht4x4 neon/;
 
   add_proto qw/void vp9_highbd_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_highbd_fht8x8 neon/;
 
   add_proto qw/void vp9_highbd_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_highbd_fht16x16 neon/;
 
   add_proto qw/void vp9_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
 
diff --git a/vp9/common/vp9_scan.c b/vp9/common/vp9_scan.c
index 0fef263510..adacb7ef96 100644
--- a/vp9/common/vp9_scan.c
+++ b/vp9/common/vp9_scan.c
@@ -511,190 +511,191 @@ DECLARE_ALIGNED(16, static const int16_t,
   959, 990,  991, 1022, 0,   0,
 };
 
+// Add 1 to iscan values. This represents the EOB position instead of the index.
 DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_4x4[16]) = {
-  0, 2, 5, 8, 1, 3, 9, 12, 4, 7, 11, 14, 6, 10, 13, 15,
+  1, 3, 6, 9, 2, 4, 10, 13, 5, 8, 12, 15, 7, 11, 14, 16,
 };
 
 DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_4x4[16]) = {
-  0, 3, 7, 11, 1, 5, 9, 12, 2, 6, 10, 14, 4, 8, 13, 15,
+  1, 4, 8, 12, 2, 6, 10, 13, 3, 7, 11, 15, 5, 9, 14, 16,
 };
 
 DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_4x4[16]) = {
-  0, 1, 3, 5, 2, 4, 6, 9, 7, 8, 11, 13, 10, 12, 14, 15,
+  1, 2, 4, 6, 3, 5, 7, 10, 8, 9, 12, 14, 11, 13, 15, 16,
 };
 
 DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_8x8[64]) = {
-  0,  3,  8,  15, 22, 32, 40, 47, 1,  5,  11, 18, 26, 34, 44, 51,
-  2,  7,  13, 20, 28, 38, 46, 54, 4,  10, 16, 24, 31, 41, 50, 56,
-  6,  12, 21, 27, 35, 43, 52, 58, 9,  17, 25, 33, 39, 48, 55, 60,
-  14, 23, 30, 37, 45, 53, 59, 62, 19, 29, 36, 42, 49, 57, 61, 63,
+  1,  4,  9,  16, 23, 33, 41, 48, 2,  6,  12, 19, 27, 35, 45, 52,
+  3,  8,  14, 21, 29, 39, 47, 55, 5,  11, 17, 25, 32, 42, 51, 57,
+  7,  13, 22, 28, 36, 44, 53, 59, 10, 18, 26, 34, 40, 49, 56, 61,
+  15, 24, 31, 38, 46, 54, 60, 63, 20, 30, 37, 43, 50, 58, 62, 64,
 };
 
 DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_8x8[64]) = {
-  0,  1,  2,  5,  8,  12, 19, 24, 3,  4,  7,  10, 15, 20, 30, 39,
-  6,  9,  13, 16, 21, 27, 37, 46, 11, 14, 17, 23, 28, 34, 44, 52,
-  18, 22, 25, 31, 35, 41, 50, 57, 26, 29, 33, 38, 43, 49, 55, 59,
-  32, 36, 42, 47, 51, 54, 60, 61, 40, 45, 48, 53, 56, 58, 62, 63,
+  1,  2,  3,  6,  9,  13, 20, 25, 4,  5,  8,  11, 16, 21, 31, 40,
+  7,  10, 14, 17, 22, 28, 38, 47, 12, 15, 18, 24, 29, 35, 45, 53,
+  19, 23, 26, 32, 36, 42, 51, 58, 27, 30, 34, 39, 44, 50, 56, 60,
+  33, 37, 43, 48, 52, 55, 61, 62, 41, 46, 49, 54, 57, 59, 63, 64,
 };
 
 DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_8x8[64]) = {
-  0,  2,  5,  9,  14, 22, 31, 37, 1,  4,  8,  13, 19, 26, 38, 44,
-  3,  6,  10, 17, 24, 30, 42, 49, 7,  11, 15, 21, 29, 36, 47, 53,
-  12, 16, 20, 27, 34, 43, 52, 57, 18, 23, 28, 35, 41, 48, 56, 60,
-  25, 32, 39, 45, 50, 55, 59, 62, 33, 40, 46, 51, 54, 58, 61, 63,
+  1,  3,  6,  10, 15, 23, 32, 38, 2,  5,  9,  14, 20, 27, 39, 45,
+  4,  7,  11, 18, 25, 31, 43, 50, 8,  12, 16, 22, 30, 37, 48, 54,
+  13, 17, 21, 28, 35, 44, 53, 58, 19, 24, 29, 36, 42, 49, 57, 61,
+  26, 33, 40, 46, 51, 56, 60, 63, 34, 41, 47, 52, 55, 59, 62, 64,
 };
 
 DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_16x16[256]) = {
-  0,  4,  11,  20,  31,  43,  59,  75,  85,  109, 130, 150, 165, 181, 195, 198,
-  1,  6,  14,  23,  34,  47,  64,  81,  95,  114, 135, 153, 171, 188, 201, 212,
-  2,  8,  16,  25,  38,  52,  67,  83,  101, 116, 136, 157, 172, 190, 205, 216,
-  3,  10, 18,  29,  41,  55,  71,  89,  103, 119, 141, 159, 176, 194, 208, 218,
-  5,  12, 21,  32,  45,  58,  74,  93,  104, 123, 144, 164, 179, 196, 210, 223,
-  7,  15, 26,  37,  49,  63,  78,  96,  112, 129, 146, 166, 182, 200, 215, 228,
-  9,  19, 28,  39,  54,  69,  86,  102, 117, 132, 151, 170, 187, 206, 220, 230,
-  13, 24, 35,  46,  60,  73,  91,  108, 122, 137, 154, 174, 189, 207, 224, 235,
-  17, 30, 40,  53,  66,  82,  98,  115, 126, 142, 161, 180, 197, 213, 227, 237,
-  22, 36, 48,  62,  76,  92,  105, 120, 133, 147, 167, 186, 203, 219, 232, 240,
-  27, 44, 56,  70,  84,  99,  113, 127, 140, 156, 175, 193, 209, 226, 236, 244,
-  33, 51, 68,  79,  94,  110, 125, 138, 149, 162, 184, 202, 217, 229, 241, 247,
-  42, 61, 77,  90,  106, 121, 134, 148, 160, 173, 191, 211, 225, 238, 245, 251,
-  50, 72, 87,  100, 118, 128, 145, 158, 168, 183, 204, 222, 233, 242, 249, 253,
-  57, 80, 97,  111, 131, 143, 155, 169, 178, 192, 214, 231, 239, 246, 250, 254,
-  65, 88, 107, 124, 139, 152, 163, 177, 185, 199, 221, 234, 243, 248, 252, 255,
+  1,  5,  12,  21,  32,  44,  60,  76,  86,  110, 131, 151, 166, 182, 196, 199,
+  2,  7,  15,  24,  35,  48,  65,  82,  96,  115, 136, 154, 172, 189, 202, 213,
+  3,  9,  17,  26,  39,  53,  68,  84,  102, 117, 137, 158, 173, 191, 206, 217,
+  4,  11, 19,  30,  42,  56,  72,  90,  104, 120, 142, 160, 177, 195, 209, 219,
+  6,  13, 22,  33,  46,  59,  75,  94,  105, 124, 145, 165, 180, 197, 211, 224,
+  8,  16, 27,  38,  50,  64,  79,  97,  113, 130, 147, 167, 183, 201, 216, 229,
+  10, 20, 29,  40,  55,  70,  87,  103, 118, 133, 152, 171, 188, 207, 221, 231,
+  14, 25, 36,  47,  61,  74,  92,  109, 123, 138, 155, 175, 190, 208, 225, 236,
+  18, 31, 41,  54,  67,  83,  99,  116, 127, 143, 162, 181, 198, 214, 228, 238,
+  23, 37, 49,  63,  77,  93,  106, 121, 134, 148, 168, 187, 204, 220, 233, 241,
+  28, 45, 57,  71,  85,  100, 114, 128, 141, 157, 176, 194, 210, 227, 237, 245,
+  34, 52, 69,  80,  95,  111, 126, 139, 150, 163, 185, 203, 218, 230, 242, 248,
+  43, 62, 78,  91,  107, 122, 135, 149, 161, 174, 192, 212, 226, 239, 246, 252,
+  51, 73, 88,  101, 119, 129, 146, 159, 169, 184, 205, 223, 234, 243, 250, 254,
+  58, 81, 98,  112, 132, 144, 156, 170, 179, 193, 215, 232, 240, 247, 251, 255,
+  66, 89, 108, 125, 140, 153, 164, 178, 186, 200, 222, 235, 244, 249, 253, 256,
 };
 
 DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_16x16[256]) = {
-  0,   1,   2,   4,   6,   9,   12,  17,  22,  29,  36,  43,  54,  64,  76,
-  86,  3,   5,   7,   11,  15,  19,  25,  32,  38,  48,  59,  68,  84,  99,
-  115, 130, 8,   10,  13,  18,  23,  27,  33,  42,  51,  60,  72,  88,  103,
-  119, 142, 167, 14,  16,  20,  26,  31,  37,  44,  53,  61,  73,  85,  100,
-  116, 135, 161, 185, 21,  24,  30,  35,  40,  47,  55,  65,  74,  81,  94,
-  112, 133, 154, 179, 205, 28,  34,  39,  45,  50,  58,  67,  77,  87,  96,
-  106, 121, 146, 169, 196, 212, 41,  46,  49,  56,  63,  70,  79,  90,  98,
-  107, 122, 138, 159, 182, 207, 222, 52,  57,  62,  69,  75,  83,  93,  102,
-  110, 120, 134, 150, 176, 195, 215, 226, 66,  71,  78,  82,  91,  97,  108,
-  113, 127, 136, 148, 168, 188, 202, 221, 232, 80,  89,  92,  101, 105, 114,
-  125, 131, 139, 151, 162, 177, 192, 208, 223, 234, 95,  104, 109, 117, 123,
-  128, 143, 144, 155, 165, 175, 190, 206, 219, 233, 239, 111, 118, 124, 129,
-  140, 147, 157, 164, 170, 181, 191, 203, 224, 230, 240, 243, 126, 132, 137,
-  145, 153, 160, 174, 178, 184, 197, 204, 216, 231, 237, 244, 246, 141, 149,
-  156, 166, 172, 180, 189, 199, 200, 210, 220, 228, 238, 242, 249, 251, 152,
-  163, 171, 183, 186, 193, 201, 211, 214, 218, 227, 236, 245, 247, 252, 253,
-  158, 173, 187, 194, 198, 209, 213, 217, 225, 229, 235, 241, 248, 250, 254,
-  255,
+  1,   2,   3,   5,   7,   10,  13,  18,  23,  30,  37,  44,  55,  65,  77,
+  87,  4,   6,   8,   12,  16,  20,  26,  33,  39,  49,  60,  69,  85,  100,
+  116, 131, 9,   11,  14,  19,  24,  28,  34,  43,  52,  61,  73,  89,  104,
+  120, 143, 168, 15,  17,  21,  27,  32,  38,  45,  54,  62,  74,  86,  101,
+  117, 136, 162, 186, 22,  25,  31,  36,  41,  48,  56,  66,  75,  82,  95,
+  113, 134, 155, 180, 206, 29,  35,  40,  46,  51,  59,  68,  78,  88,  97,
+  107, 122, 147, 170, 197, 213, 42,  47,  50,  57,  64,  71,  80,  91,  99,
+  108, 123, 139, 160, 183, 208, 223, 53,  58,  63,  70,  76,  84,  94,  103,
+  111, 121, 135, 151, 177, 196, 216, 227, 67,  72,  79,  83,  92,  98,  109,
+  114, 128, 137, 149, 169, 189, 203, 222, 233, 81,  90,  93,  102, 106, 115,
+  126, 132, 140, 152, 163, 178, 193, 209, 224, 235, 96,  105, 110, 118, 124,
+  129, 144, 145, 156, 166, 176, 191, 207, 220, 234, 240, 112, 119, 125, 130,
+  141, 148, 158, 165, 171, 182, 192, 204, 225, 231, 241, 244, 127, 133, 138,
+  146, 154, 161, 175, 179, 185, 198, 205, 217, 232, 238, 245, 247, 142, 150,
+  157, 167, 173, 181, 190, 200, 201, 211, 221, 229, 239, 243, 250, 252, 153,
+  164, 172, 184, 187, 194, 202, 212, 215, 219, 228, 237, 246, 248, 253, 254,
+  159, 174, 188, 195, 199, 210, 214, 218, 226, 230, 236, 242, 249, 251, 255,
+  256,
 };
 
 DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_16x16[256]) = {
-  0,   2,   5,   9,   17,  24,  36,  44,  55,  72,  88,  104, 128, 143, 166,
-  179, 1,   4,   8,   13,  20,  30,  40,  54,  66,  79,  96,  113, 141, 154,
-  178, 196, 3,   7,   11,  18,  25,  33,  46,  57,  71,  86,  101, 119, 148,
-  164, 186, 201, 6,   12,  16,  23,  31,  39,  53,  64,  78,  92,  110, 127,
-  153, 169, 193, 208, 10,  14,  19,  28,  37,  47,  58,  67,  84,  98,  114,
-  133, 161, 176, 198, 214, 15,  21,  26,  34,  43,  52,  65,  77,  91,  106,
-  120, 140, 165, 185, 205, 221, 22,  27,  32,  41,  48,  60,  73,  85,  99,
-  116, 130, 151, 175, 190, 211, 225, 29,  35,  42,  49,  59,  69,  81,  95,
-  108, 125, 139, 155, 182, 197, 217, 229, 38,  45,  51,  61,  68,  80,  93,
-  105, 118, 134, 150, 168, 191, 207, 223, 234, 50,  56,  63,  74,  83,  94,
-  109, 117, 129, 147, 163, 177, 199, 213, 228, 238, 62,  70,  76,  87,  97,
-  107, 122, 131, 145, 159, 172, 188, 210, 222, 235, 242, 75,  82,  90,  102,
-  112, 124, 138, 146, 157, 173, 187, 202, 219, 230, 240, 245, 89,  100, 111,
-  123, 132, 142, 156, 167, 180, 189, 203, 216, 231, 237, 246, 250, 103, 115,
-  126, 136, 149, 162, 171, 183, 194, 204, 215, 224, 236, 241, 248, 252, 121,
-  135, 144, 158, 170, 181, 192, 200, 209, 218, 227, 233, 243, 244, 251, 254,
-  137, 152, 160, 174, 184, 195, 206, 212, 220, 226, 232, 239, 247, 249, 253,
-  255,
+  1,   3,   6,   10,  18,  25,  37,  45,  56,  73,  89,  105, 129, 144, 167,
+  180, 2,   5,   9,   14,  21,  31,  41,  55,  67,  80,  97,  114, 142, 155,
+  179, 197, 4,   8,   12,  19,  26,  34,  47,  58,  72,  87,  102, 120, 149,
+  165, 187, 202, 7,   13,  17,  24,  32,  40,  54,  65,  79,  93,  111, 128,
+  154, 170, 194, 209, 11,  15,  20,  29,  38,  48,  59,  68,  85,  99,  115,
+  134, 162, 177, 199, 215, 16,  22,  27,  35,  44,  53,  66,  78,  92,  107,
+  121, 141, 166, 186, 206, 222, 23,  28,  33,  42,  49,  61,  74,  86,  100,
+  117, 131, 152, 176, 191, 212, 226, 30,  36,  43,  50,  60,  70,  82,  96,
+  109, 126, 140, 156, 183, 198, 218, 230, 39,  46,  52,  62,  69,  81,  94,
+  106, 119, 135, 151, 169, 192, 208, 224, 235, 51,  57,  64,  75,  84,  95,
+  110, 118, 130, 148, 164, 178, 200, 214, 229, 239, 63,  71,  77,  88,  98,
+  108, 123, 132, 146, 160, 173, 189, 211, 223, 236, 243, 76,  83,  91,  103,
+  113, 125, 139, 147, 158, 174, 188, 203, 220, 231, 241, 246, 90,  101, 112,
+  124, 133, 143, 157, 168, 181, 190, 204, 217, 232, 238, 247, 251, 104, 116,
+  127, 137, 150, 163, 172, 184, 195, 205, 216, 225, 237, 242, 249, 253, 122,
+  136, 145, 159, 171, 182, 193, 201, 210, 219, 228, 234, 244, 245, 252, 255,
+  138, 153, 161, 175, 185, 196, 207, 213, 221, 227, 233, 240, 248, 250, 254,
+  256,
 };
 
 DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_32x32[1024]) = {
-  0,    2,    5,    10,   17,   25,   38,   47,   62,   83,   101,  121,  145,
-  170,  193,  204,  210,  219,  229,  233,  245,  257,  275,  299,  342,  356,
-  377,  405,  455,  471,  495,  527,  1,    4,    8,    15,   22,   30,   45,
-  58,   74,   92,   112,  133,  158,  184,  203,  215,  222,  228,  234,  237,
-  256,  274,  298,  317,  355,  376,  404,  426,  470,  494,  526,  551,  3,
-  7,    12,   18,   28,   36,   52,   64,   82,   102,  118,  142,  164,  189,
-  208,  217,  224,  231,  235,  238,  273,  297,  316,  329,  375,  403,  425,
-  440,  493,  525,  550,  567,  6,    11,   16,   23,   31,   43,   60,   73,
-  90,   109,  126,  150,  173,  196,  211,  220,  226,  232,  236,  239,  296,
-  315,  328,  335,  402,  424,  439,  447,  524,  549,  566,  575,  9,    14,
-  19,   29,   37,   50,   65,   78,   95,   116,  134,  157,  179,  201,  214,
-  223,  244,  255,  272,  295,  341,  354,  374,  401,  454,  469,  492,  523,
-  582,  596,  617,  645,  13,   20,   26,   35,   44,   54,   72,   85,   105,
-  123,  140,  163,  182,  205,  216,  225,  254,  271,  294,  314,  353,  373,
-  400,  423,  468,  491,  522,  548,  595,  616,  644,  666,  21,   27,   33,
-  42,   53,   63,   80,   94,   113,  132,  151,  172,  190,  209,  218,  227,
-  270,  293,  313,  327,  372,  399,  422,  438,  490,  521,  547,  565,  615,
-  643,  665,  680,  24,   32,   39,   48,   57,   71,   88,   104,  120,  139,
-  159,  178,  197,  212,  221,  230,  292,  312,  326,  334,  398,  421,  437,
-  446,  520,  546,  564,  574,  642,  664,  679,  687,  34,   40,   46,   56,
-  68,   81,   96,   111,  130,  147,  167,  186,  243,  253,  269,  291,  340,
-  352,  371,  397,  453,  467,  489,  519,  581,  594,  614,  641,  693,  705,
-  723,  747,  41,   49,   55,   67,   77,   91,   107,  124,  138,  161,  177,
-  194,  252,  268,  290,  311,  351,  370,  396,  420,  466,  488,  518,  545,
-  593,  613,  640,  663,  704,  722,  746,  765,  51,   59,   66,   76,   89,
-  99,   119,  131,  149,  168,  181,  200,  267,  289,  310,  325,  369,  395,
-  419,  436,  487,  517,  544,  563,  612,  639,  662,  678,  721,  745,  764,
-  777,  61,   69,   75,   87,   100,  114,  129,  144,  162,  180,  191,  207,
-  288,  309,  324,  333,  394,  418,  435,  445,  516,  543,  562,  573,  638,
-  661,  677,  686,  744,  763,  776,  783,  70,   79,   86,   97,   108,  122,
-  137,  155,  242,  251,  266,  287,  339,  350,  368,  393,  452,  465,  486,
-  515,  580,  592,  611,  637,  692,  703,  720,  743,  788,  798,  813,  833,
-  84,   93,   103,  110,  125,  141,  154,  171,  250,  265,  286,  308,  349,
-  367,  392,  417,  464,  485,  514,  542,  591,  610,  636,  660,  702,  719,
-  742,  762,  797,  812,  832,  848,  98,   106,  115,  127,  143,  156,  169,
-  185,  264,  285,  307,  323,  366,  391,  416,  434,  484,  513,  541,  561,
-  609,  635,  659,  676,  718,  741,  761,  775,  811,  831,  847,  858,  117,
-  128,  136,  148,  160,  175,  188,  198,  284,  306,  322,  332,  390,  415,
-  433,  444,  512,  540,  560,  572,  634,  658,  675,  685,  740,  760,  774,
-  782,  830,  846,  857,  863,  135,  146,  152,  165,  241,  249,  263,  283,
-  338,  348,  365,  389,  451,  463,  483,  511,  579,  590,  608,  633,  691,
-  701,  717,  739,  787,  796,  810,  829,  867,  875,  887,  903,  153,  166,
-  174,  183,  248,  262,  282,  305,  347,  364,  388,  414,  462,  482,  510,
-  539,  589,  607,  632,  657,  700,  716,  738,  759,  795,  809,  828,  845,
-  874,  886,  902,  915,  176,  187,  195,  202,  261,  281,  304,  321,  363,
-  387,  413,  432,  481,  509,  538,  559,  606,  631,  656,  674,  715,  737,
-  758,  773,  808,  827,  844,  856,  885,  901,  914,  923,  192,  199,  206,
-  213,  280,  303,  320,  331,  386,  412,  431,  443,  508,  537,  558,  571,
-  630,  655,  673,  684,  736,  757,  772,  781,  826,  843,  855,  862,  900,
-  913,  922,  927,  240,  247,  260,  279,  337,  346,  362,  385,  450,  461,
-  480,  507,  578,  588,  605,  629,  690,  699,  714,  735,  786,  794,  807,
-  825,  866,  873,  884,  899,  930,  936,  945,  957,  246,  259,  278,  302,
-  345,  361,  384,  411,  460,  479,  506,  536,  587,  604,  628,  654,  698,
-  713,  734,  756,  793,  806,  824,  842,  872,  883,  898,  912,  935,  944,
-  956,  966,  258,  277,  301,  319,  360,  383,  410,  430,  478,  505,  535,
-  557,  603,  627,  653,  672,  712,  733,  755,  771,  805,  823,  841,  854,
-  882,  897,  911,  921,  943,  955,  965,  972,  276,  300,  318,  330,  382,
-  409,  429,  442,  504,  534,  556,  570,  626,  652,  671,  683,  732,  754,
-  770,  780,  822,  840,  853,  861,  896,  910,  920,  926,  954,  964,  971,
-  975,  336,  344,  359,  381,  449,  459,  477,  503,  577,  586,  602,  625,
-  689,  697,  711,  731,  785,  792,  804,  821,  865,  871,  881,  895,  929,
-  934,  942,  953,  977,  981,  987,  995,  343,  358,  380,  408,  458,  476,
-  502,  533,  585,  601,  624,  651,  696,  710,  730,  753,  791,  803,  820,
-  839,  870,  880,  894,  909,  933,  941,  952,  963,  980,  986,  994,  1001,
-  357,  379,  407,  428,  475,  501,  532,  555,  600,  623,  650,  670,  709,
-  729,  752,  769,  802,  819,  838,  852,  879,  893,  908,  919,  940,  951,
-  962,  970,  985,  993,  1000, 1005, 378,  406,  427,  441,  500,  531,  554,
-  569,  622,  649,  669,  682,  728,  751,  768,  779,  818,  837,  851,  860,
-  892,  907,  918,  925,  950,  961,  969,  974,  992,  999,  1004, 1007, 448,
-  457,  474,  499,  576,  584,  599,  621,  688,  695,  708,  727,  784,  790,
-  801,  817,  864,  869,  878,  891,  928,  932,  939,  949,  976,  979,  984,
-  991,  1008, 1010, 1013, 1017, 456,  473,  498,  530,  583,  598,  620,  648,
-  694,  707,  726,  750,  789,  800,  816,  836,  868,  877,  890,  906,  931,
-  938,  948,  960,  978,  983,  990,  998,  1009, 1012, 1016, 1020, 472,  497,
-  529,  553,  597,  619,  647,  668,  706,  725,  749,  767,  799,  815,  835,
-  850,  876,  889,  905,  917,  937,  947,  959,  968,  982,  989,  997,  1003,
-  1011, 1015, 1019, 1022, 496,  528,  552,  568,  618,  646,  667,  681,  724,
-  748,  766,  778,  814,  834,  849,  859,  888,  904,  916,  924,  946,  958,
-  967,  973,  988,  996,  1002, 1006, 1014, 1018, 1021, 1023,
-};
-
-const scan_order vp9_default_scan_orders[TX_SIZES] = {
+  1,    3,    6,    11,   18,   26,   39,   48,   63,   84,   102,  122,  146,
+  171,  194,  205,  211,  220,  230,  234,  246,  258,  276,  300,  343,  357,
+  378,  406,  456,  472,  496,  528,  2,    5,    9,    16,   23,   31,   46,
+  59,   75,   93,   113,  134,  159,  185,  204,  216,  223,  229,  235,  238,
+  257,  275,  299,  318,  356,  377,  405,  427,  471,  495,  527,  552,  4,
+  8,    13,   19,   29,   37,   53,   65,   83,   103,  119,  143,  165,  190,
+  209,  218,  225,  232,  236,  239,  274,  298,  317,  330,  376,  404,  426,
+  441,  494,  526,  551,  568,  7,    12,   17,   24,   32,   44,   61,   74,
+  91,   110,  127,  151,  174,  197,  212,  221,  227,  233,  237,  240,  297,
+  316,  329,  336,  403,  425,  440,  448,  525,  550,  567,  576,  10,   15,
+  20,   30,   38,   51,   66,   79,   96,   117,  135,  158,  180,  202,  215,
+  224,  245,  256,  273,  296,  342,  355,  375,  402,  455,  470,  493,  524,
+  583,  597,  618,  646,  14,   21,   27,   36,   45,   55,   73,   86,   106,
+  124,  141,  164,  183,  206,  217,  226,  255,  272,  295,  315,  354,  374,
+  401,  424,  469,  492,  523,  549,  596,  617,  645,  667,  22,   28,   34,
+  43,   54,   64,   81,   95,   114,  133,  152,  173,  191,  210,  219,  228,
+  271,  294,  314,  328,  373,  400,  423,  439,  491,  522,  548,  566,  616,
+  644,  666,  681,  25,   33,   40,   49,   58,   72,   89,   105,  121,  140,
+  160,  179,  198,  213,  222,  231,  293,  313,  327,  335,  399,  422,  438,
+  447,  521,  547,  565,  575,  643,  665,  680,  688,  35,   41,   47,   57,
+  69,   82,   97,   112,  131,  148,  168,  187,  244,  254,  270,  292,  341,
+  353,  372,  398,  454,  468,  490,  520,  582,  595,  615,  642,  694,  706,
+  724,  748,  42,   50,   56,   68,   78,   92,   108,  125,  139,  162,  178,
+  195,  253,  269,  291,  312,  352,  371,  397,  421,  467,  489,  519,  546,
+  594,  614,  641,  664,  705,  723,  747,  766,  52,   60,   67,   77,   90,
+  100,  120,  132,  150,  169,  182,  201,  268,  290,  311,  326,  370,  396,
+  420,  437,  488,  518,  545,  564,  613,  640,  663,  679,  722,  746,  765,
+  778,  62,   70,   76,   88,   101,  115,  130,  145,  163,  181,  192,  208,
+  289,  310,  325,  334,  395,  419,  436,  446,  517,  544,  563,  574,  639,
+  662,  678,  687,  745,  764,  777,  784,  71,   80,   87,   98,   109,  123,
+  138,  156,  243,  252,  267,  288,  340,  351,  369,  394,  453,  466,  487,
+  516,  581,  593,  612,  638,  693,  704,  721,  744,  789,  799,  814,  834,
+  85,   94,   104,  111,  126,  142,  155,  172,  251,  266,  287,  309,  350,
+  368,  393,  418,  465,  486,  515,  543,  592,  611,  637,  661,  703,  720,
+  743,  763,  798,  813,  833,  849,  99,   107,  116,  128,  144,  157,  170,
+  186,  265,  286,  308,  324,  367,  392,  417,  435,  485,  514,  542,  562,
+  610,  636,  660,  677,  719,  742,  762,  776,  812,  832,  848,  859,  118,
+  129,  137,  149,  161,  176,  189,  199,  285,  307,  323,  333,  391,  416,
+  434,  445,  513,  541,  561,  573,  635,  659,  676,  686,  741,  761,  775,
+  783,  831,  847,  858,  864,  136,  147,  153,  166,  242,  250,  264,  284,
+  339,  349,  366,  390,  452,  464,  484,  512,  580,  591,  609,  634,  692,
+  702,  718,  740,  788,  797,  811,  830,  868,  876,  888,  904,  154,  167,
+  175,  184,  249,  263,  283,  306,  348,  365,  389,  415,  463,  483,  511,
+  540,  590,  608,  633,  658,  701,  717,  739,  760,  796,  810,  829,  846,
+  875,  887,  903,  916,  177,  188,  196,  203,  262,  282,  305,  322,  364,
+  388,  414,  433,  482,  510,  539,  560,  607,  632,  657,  675,  716,  738,
+  759,  774,  809,  828,  845,  857,  886,  902,  915,  924,  193,  200,  207,
+  214,  281,  304,  321,  332,  387,  413,  432,  444,  509,  538,  559,  572,
+  631,  656,  674,  685,  737,  758,  773,  782,  827,  844,  856,  863,  901,
+  914,  923,  928,  241,  248,  261,  280,  338,  347,  363,  386,  451,  462,
+  481,  508,  579,  589,  606,  630,  691,  700,  715,  736,  787,  795,  808,
+  826,  867,  874,  885,  900,  931,  937,  946,  958,  247,  260,  279,  303,
+  346,  362,  385,  412,  461,  480,  507,  537,  588,  605,  629,  655,  699,
+  714,  735,  757,  794,  807,  825,  843,  873,  884,  899,  913,  936,  945,
+  957,  967,  259,  278,  302,  320,  361,  384,  411,  431,  479,  506,  536,
+  558,  604,  628,  654,  673,  713,  734,  756,  772,  806,  824,  842,  855,
+  883,  898,  912,  922,  944,  956,  966,  973,  277,  301,  319,  331,  383,
+  410,  430,  443,  505,  535,  557,  571,  627,  653,  672,  684,  733,  755,
+  771,  781,  823,  841,  854,  862,  897,  911,  921,  927,  955,  965,  972,
+  976,  337,  345,  360,  382,  450,  460,  478,  504,  578,  587,  603,  626,
+  690,  698,  712,  732,  786,  793,  805,  822,  866,  872,  882,  896,  930,
+  935,  943,  954,  978,  982,  988,  996,  344,  359,  381,  409,  459,  477,
+  503,  534,  586,  602,  625,  652,  697,  711,  731,  754,  792,  804,  821,
+  840,  871,  881,  895,  910,  934,  942,  953,  964,  981,  987,  995,  1002,
+  358,  380,  408,  429,  476,  502,  533,  556,  601,  624,  651,  671,  710,
+  730,  753,  770,  803,  820,  839,  853,  880,  894,  909,  920,  941,  952,
+  963,  971,  986,  994,  1001, 1006, 379,  407,  428,  442,  501,  532,  555,
+  570,  623,  650,  670,  683,  729,  752,  769,  780,  819,  838,  852,  861,
+  893,  908,  919,  926,  951,  962,  970,  975,  993,  1000, 1005, 1008, 449,
+  458,  475,  500,  577,  585,  600,  622,  689,  696,  709,  728,  785,  791,
+  802,  818,  865,  870,  879,  892,  929,  933,  940,  950,  977,  980,  985,
+  992,  1009, 1011, 1014, 1018, 457,  474,  499,  531,  584,  599,  621,  649,
+  695,  708,  727,  751,  790,  801,  817,  837,  869,  878,  891,  907,  932,
+  939,  949,  961,  979,  984,  991,  999,  1010, 1013, 1017, 1021, 473,  498,
+  530,  554,  598,  620,  648,  669,  707,  726,  750,  768,  800,  816,  836,
+  851,  877,  890,  906,  918,  938,  948,  960,  969,  983,  990,  998,  1004,
+  1012, 1016, 1020, 1023, 497,  529,  553,  569,  619,  647,  668,  682,  725,
+  749,  767,  779,  815,  835,  850,  860,  889,  905,  917,  925,  947,  959,
+  968,  974,  989,  997,  1003, 1007, 1015, 1019, 1022, 1024,
+};
+
+const ScanOrder vp9_default_scan_orders[TX_SIZES] = {
   { default_scan_4x4, vp9_default_iscan_4x4, default_scan_4x4_neighbors },
   { default_scan_8x8, vp9_default_iscan_8x8, default_scan_8x8_neighbors },
   { default_scan_16x16, vp9_default_iscan_16x16, default_scan_16x16_neighbors },
   { default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors },
 };
 
-const scan_order vp9_scan_orders[TX_SIZES][TX_TYPES] = {
+const ScanOrder vp9_scan_orders[TX_SIZES][TX_TYPES] = {
   { // TX_4X4
     { default_scan_4x4, vp9_default_iscan_4x4, default_scan_4x4_neighbors },
     { row_scan_4x4, vp9_row_iscan_4x4, row_scan_4x4_neighbors },
diff --git a/vp9/common/vp9_scan.h b/vp9/common/vp9_scan.h
index 72a9a5ec47..3d1dcc66da 100644
--- a/vp9/common/vp9_scan.h
+++ b/vp9/common/vp9_scan.h
@@ -23,14 +23,14 @@ extern "C" {
 
 #define MAX_NEIGHBORS 2
 
-typedef struct {
+typedef struct ScanOrder {
   const int16_t *scan;
   const int16_t *iscan;
   const int16_t *neighbors;
-} scan_order;
+} ScanOrder;
 
-extern const scan_order vp9_default_scan_orders[TX_SIZES];
-extern const scan_order vp9_scan_orders[TX_SIZES][TX_TYPES];
+extern const ScanOrder vp9_default_scan_orders[TX_SIZES];
+extern const ScanOrder vp9_scan_orders[TX_SIZES][TX_TYPES];
 
 static INLINE int get_coef_context(const int16_t *neighbors,
                                    const uint8_t *token_cache, int c) {
@@ -39,8 +39,8 @@ static INLINE int get_coef_context(const int16_t *neighbors,
          1;
 }
 
-static INLINE const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size,
-                                         PLANE_TYPE type, int block_idx) {
+static INLINE const ScanOrder *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size,
+                                        PLANE_TYPE type, int block_idx) {
   const MODE_INFO *const mi = xd->mi[0];
 
   if (is_inter_block(mi) || type != PLANE_TYPE_Y || xd->lossless) {
diff --git a/vp9/common/vp9_seg_common.h b/vp9/common/vp9_seg_common.h
index b63e4f4999..5e71c2fca5 100644
--- a/vp9/common/vp9_seg_common.h
+++ b/vp9/common/vp9_seg_common.h
@@ -25,6 +25,11 @@ extern "C" {
 
 #define PREDICTION_PROBS 3
 
+// Segment ID used to skip background encoding
+#define BACKGROUND_SEG_SKIP_ID 3
+// Number of frames that don't skip after a key frame
+#define FRAMES_NO_SKIPPING_AFTER_KEY 20
+
 // Segment level features.
 typedef enum {
   SEG_LVL_ALT_Q = 0,      // Use alternate Quantizer ....
diff --git a/vp9/common/vp9_thread_common.c b/vp9/common/vp9_thread_common.c
index b3d50162b2..8df18af3b8 100644
--- a/vp9/common/vp9_thread_common.c
+++ b/vp9/common/vp9_thread_common.c
@@ -283,7 +283,7 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
   {
     int i;
 
-    CHECK_MEM_ERROR(cm, lf_sync->mutex,
+    CHECK_MEM_ERROR(&cm->error, lf_sync->mutex,
                     vpx_malloc(sizeof(*lf_sync->mutex) * rows));
     if (lf_sync->mutex) {
       for (i = 0; i < rows; ++i) {
@@ -291,7 +291,7 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
       }
     }
 
-    CHECK_MEM_ERROR(cm, lf_sync->cond,
+    CHECK_MEM_ERROR(&cm->error, lf_sync->cond,
                     vpx_malloc(sizeof(*lf_sync->cond) * rows));
     if (lf_sync->cond) {
       for (i = 0; i < rows; ++i) {
@@ -299,23 +299,21 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
       }
     }
 
-    CHECK_MEM_ERROR(cm, lf_sync->lf_mutex,
+    CHECK_MEM_ERROR(&cm->error, lf_sync->lf_mutex,
                     vpx_malloc(sizeof(*lf_sync->lf_mutex)));
     pthread_mutex_init(lf_sync->lf_mutex, NULL);
 
-    CHECK_MEM_ERROR(cm, lf_sync->recon_done_mutex,
+    CHECK_MEM_ERROR(&cm->error, lf_sync->recon_done_mutex,
                     vpx_malloc(sizeof(*lf_sync->recon_done_mutex) * rows));
     if (lf_sync->recon_done_mutex) {
-      int i;
       for (i = 0; i < rows; ++i) {
         pthread_mutex_init(&lf_sync->recon_done_mutex[i], NULL);
       }
     }
 
-    CHECK_MEM_ERROR(cm, lf_sync->recon_done_cond,
+    CHECK_MEM_ERROR(&cm->error, lf_sync->recon_done_cond,
                     vpx_malloc(sizeof(*lf_sync->recon_done_cond) * rows));
     if (lf_sync->recon_done_cond) {
-      int i;
       for (i = 0; i < rows; ++i) {
         pthread_cond_init(&lf_sync->recon_done_cond[i], NULL);
       }
@@ -323,15 +321,15 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
   }
 #endif  // CONFIG_MULTITHREAD
 
-  CHECK_MEM_ERROR(cm, lf_sync->lfdata,
+  CHECK_MEM_ERROR(&cm->error, lf_sync->lfdata,
                   vpx_malloc(num_workers * sizeof(*lf_sync->lfdata)));
   lf_sync->num_workers = num_workers;
   lf_sync->num_active_workers = lf_sync->num_workers;
 
-  CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col,
+  CHECK_MEM_ERROR(&cm->error, lf_sync->cur_sb_col,
                   vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows));
 
-  CHECK_MEM_ERROR(cm, lf_sync->num_tiles_done,
+  CHECK_MEM_ERROR(&cm->error, lf_sync->num_tiles_done,
                   vpx_malloc(sizeof(*lf_sync->num_tiles_done) *
                                  mi_cols_aligned_to_sb(cm->mi_rows) >>
                              MI_BLOCK_SIZE_LOG2));
@@ -390,10 +388,10 @@ void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) {
 
 static int get_next_row(VP9_COMMON *cm, VP9LfSync *lf_sync) {
   int return_val = -1;
-  int cur_row;
   const int max_rows = cm->mi_rows;
 
 #if CONFIG_MULTITHREAD
+  int cur_row;
   const int tile_cols = 1 << cm->log2_tile_cols;
 
   pthread_mutex_lock(lf_sync->lf_mutex);
@@ -430,14 +428,8 @@ static int get_next_row(VP9_COMMON *cm, VP9LfSync *lf_sync) {
 #else
   (void)lf_sync;
   if (cm->lf_row < max_rows) {
-    cur_row = cm->lf_row >> MI_BLOCK_SIZE_LOG2;
     return_val = cm->lf_row;
     cm->lf_row += MI_BLOCK_SIZE;
-    if (cm->lf_row < max_rows) {
-      /* If this is not the last row, make sure the next row is also decoded.
-       * This is because the intra predict has to happen before loop filter */
-      cur_row += 1;
-    }
   }
 #endif  // CONFIG_MULTITHREAD
 
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 2a27e6fdb3..c5892156f4 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -323,9 +323,9 @@ static void predict_and_reconstruct_intra_block(TileWorkerData *twd,
   if (!mi->skip) {
     const TX_TYPE tx_type =
         (plane || xd->lossless) ? DCT_DCT : intra_mode_to_tx_type_lookup[mode];
-    const scan_order *sc = (plane || xd->lossless)
-                               ? &vp9_default_scan_orders[tx_size]
-                               : &vp9_scan_orders[tx_size][tx_type];
+    const ScanOrder *sc = (plane || xd->lossless)
+                              ? &vp9_default_scan_orders[tx_size]
+                              : &vp9_scan_orders[tx_size][tx_type];
     const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size,
                                             mi->segment_id);
     if (eob > 0) {
@@ -348,9 +348,9 @@ static void parse_intra_block_row_mt(TileWorkerData *twd, MODE_INFO *const mi,
     struct macroblockd_plane *const pd = &xd->plane[plane];
     const TX_TYPE tx_type =
         (plane || xd->lossless) ? DCT_DCT : intra_mode_to_tx_type_lookup[mode];
-    const scan_order *sc = (plane || xd->lossless)
-                               ? &vp9_default_scan_orders[tx_size]
-                               : &vp9_scan_orders[tx_size][tx_type];
+    const ScanOrder *sc = (plane || xd->lossless)
+                              ? &vp9_default_scan_orders[tx_size]
+                              : &vp9_scan_orders[tx_size][tx_type];
     *pd->eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size,
                                        mi->segment_id);
     /* Keep the alignment to 16 */
@@ -393,7 +393,7 @@ static int reconstruct_inter_block(TileWorkerData *twd, MODE_INFO *const mi,
                                    int mi_row, int mi_col) {
   MACROBLOCKD *const xd = &twd->xd;
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  const scan_order *sc = &vp9_default_scan_orders[tx_size];
+  const ScanOrder *sc = &vp9_default_scan_orders[tx_size];
   const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size,
                                           mi->segment_id);
   uint8_t *dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col];
@@ -423,7 +423,7 @@ static int parse_inter_block_row_mt(TileWorkerData *twd, MODE_INFO *const mi,
                                     TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &twd->xd;
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  const scan_order *sc = &vp9_default_scan_orders[tx_size];
+  const ScanOrder *sc = &vp9_default_scan_orders[tx_size];
   const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size,
                                           mi->segment_id);
 
@@ -1469,7 +1469,7 @@ static void resize_mv_buffer(VP9_COMMON *cm) {
   vpx_free(cm->cur_frame->mvs);
   cm->cur_frame->mi_rows = cm->mi_rows;
   cm->cur_frame->mi_cols = cm->mi_cols;
-  CHECK_MEM_ERROR(cm, cm->cur_frame->mvs,
+  CHECK_MEM_ERROR(&cm->error, cm->cur_frame->mvs,
                   (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
                                        sizeof(*cm->cur_frame->mvs)));
 }
@@ -1776,7 +1776,8 @@ static void vp9_jobq_alloc(VP9Decoder *pbi) {
 
   if (jobq_size > row_mt_worker_data->jobq_size) {
     vpx_free(row_mt_worker_data->jobq_buf);
-    CHECK_MEM_ERROR(cm, row_mt_worker_data->jobq_buf, vpx_calloc(1, jobq_size));
+    CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->jobq_buf,
+                    vpx_calloc(1, jobq_size));
     vp9_jobq_init(&row_mt_worker_data->jobq, row_mt_worker_data->jobq_buf,
                   jobq_size);
     row_mt_worker_data->jobq_size = jobq_size;
@@ -1923,7 +1924,7 @@ static int row_decode_worker_hook(void *arg1, void *arg2) {
       const int is_last_row = sb_rows - 1 == cur_sb_row;
       int mi_col_start, mi_col_end;
       if (!tile_data_recon)
-        CHECK_MEM_ERROR(cm, tile_data_recon,
+        CHECK_MEM_ERROR(&cm->error, tile_data_recon,
                         vpx_memalign(32, sizeof(TileWorkerData)));
 
       tile_data_recon->xd = pbi->mb;
@@ -2025,7 +2026,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data,
 
   if (cm->lf.filter_level && !cm->skip_loop_filter &&
       pbi->lf_worker.data1 == NULL) {
-    CHECK_MEM_ERROR(cm, pbi->lf_worker.data1,
+    CHECK_MEM_ERROR(&cm->error, pbi->lf_worker.data1,
                     vpx_memalign(32, sizeof(LFWorkerData)));
     pbi->lf_worker.hook = vp9_loop_filter_worker;
     if (pbi->max_threads > 1 && !winterface->reset(&pbi->lf_worker)) {
@@ -2192,8 +2193,6 @@ static int tile_worker_hook(void *arg1, void *arg2) {
 
   volatile int mi_row = 0;
   volatile int n = tile_data->buf_start;
-  tile_data->error_info.setjmp = 1;
-
   if (setjmp(tile_data->error_info.jmp)) {
     tile_data->error_info.setjmp = 0;
     tile_data->xd.corrupted = 1;
@@ -2206,6 +2205,7 @@ static int tile_worker_hook(void *arg1, void *arg2) {
     }
     return 0;
   }
+  tile_data->error_info.setjmp = 1;
 
   tile_data->xd.corrupted = 0;
 
@@ -2285,7 +2285,7 @@ static INLINE void init_mt(VP9Decoder *pbi) {
 
   if (pbi->num_tile_workers == 0) {
     const int num_threads = pbi->max_threads;
-    CHECK_MEM_ERROR(cm, pbi->tile_workers,
+    CHECK_MEM_ERROR(&cm->error, pbi->tile_workers,
                     vpx_malloc(num_threads * sizeof(*pbi->tile_workers)));
     for (n = 0; n < num_threads; ++n) {
       VPxWorker *const worker = &pbi->tile_workers[n];
@@ -2293,6 +2293,11 @@ static INLINE void init_mt(VP9Decoder *pbi) {
 
       winterface->init(worker);
       if (n < num_threads - 1 && !winterface->reset(worker)) {
+        do {
+          winterface->end(&pbi->tile_workers[pbi->num_tile_workers - 1]);
+        } while (--pbi->num_tile_workers != 0);
+        vpx_free(pbi->tile_workers);
+        pbi->tile_workers = NULL;
         vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
                            "Tile decoder thread creation failed");
       }
@@ -2824,7 +2829,7 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
     const int num_jobs = sb_rows << cm->log2_tile_cols;
 
     if (pbi->row_mt_worker_data == NULL) {
-      CHECK_MEM_ERROR(cm, pbi->row_mt_worker_data,
+      CHECK_MEM_ERROR(&cm->error, pbi->row_mt_worker_data,
                       vpx_calloc(1, sizeof(*pbi->row_mt_worker_data)));
 #if CONFIG_MULTITHREAD
       pthread_mutex_init(&pbi->row_mt_worker_data->recon_done_mutex, NULL);
@@ -3006,7 +3011,8 @@ void vp9_decode_frame(VP9Decoder *pbi, const uint8_t *data,
     // platforms without DECLARE_ALIGNED().
     assert((sizeof(*pbi->tile_worker_data) % 16) == 0);
     vpx_free(pbi->tile_worker_data);
-    CHECK_MEM_ERROR(cm, pbi->tile_worker_data, vpx_memalign(32, twd_size));
+    CHECK_MEM_ERROR(&cm->error, pbi->tile_worker_data,
+                    vpx_memalign(32, twd_size));
     pbi->total_tiles = tile_rows * tile_cols;
   }
 
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 8a8d2ad86e..0989cde58d 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -204,7 +204,7 @@ static void read_intra_frame_mode_info(VP9_COMMON *const cm,
   mi->skip = read_skip(cm, xd, mi->segment_id, r);
   mi->tx_size = read_tx_size(cm, xd, 1, r);
   mi->ref_frame[0] = INTRA_FRAME;
-  mi->ref_frame[1] = NONE;
+  mi->ref_frame[1] = NO_REF_FRAME;
 
   switch (bsize) {
     case BLOCK_4X4:
@@ -299,7 +299,7 @@ static REFERENCE_MODE read_block_reference_mode(VP9_COMMON *cm,
   }
 }
 
-// Read the referncence frame
+// Read the reference frame
 static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
                             vpx_reader *r, int segment_id,
                             MV_REFERENCE_FRAME ref_frame[2]) {
@@ -309,7 +309,7 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
   if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
     ref_frame[0] = (MV_REFERENCE_FRAME)get_segdata(&cm->seg, segment_id,
                                                    SEG_LVL_REF_FRAME);
-    ref_frame[1] = NONE;
+    ref_frame[1] = NO_REF_FRAME;
   } else {
     const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, r);
     // FIXME(rbultje) I'm pretty sure this breaks segmentation ref frame coding
@@ -333,7 +333,7 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
         ref_frame[0] = LAST_FRAME;
       }
 
-      ref_frame[1] = NONE;
+      ref_frame[1] = NO_REF_FRAME;
     } else {
       assert(0 && "Invalid prediction mode.");
     }
@@ -383,7 +383,7 @@ static void read_intra_block_mode_info(VP9_COMMON *const cm,
   mi->interp_filter = SWITCHABLE_FILTERS;
 
   mi->ref_frame[0] = INTRA_FRAME;
-  mi->ref_frame[1] = NONE;
+  mi->ref_frame[1] = NO_REF_FRAME;
 }
 
 static INLINE int is_mv_valid(const MV *mv) {
@@ -426,7 +426,9 @@ static INLINE int assign_mv(VP9_COMMON *cm, MACROBLOCKD *xd,
       zero_mv_pair(mv);
       break;
     }
-    default: { return 0; }
+    default: {
+      return 0;
+    }
   }
   return ret;
 }
@@ -706,7 +708,7 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi,
     mi->mode = ZEROMV;
     if (bsize < BLOCK_8X8) {
       vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM,
-                         "Invalid usage of segement feature on small blocks");
+                         "Invalid usage of segment feature on small blocks");
       return;
     }
   } else {
@@ -755,7 +757,7 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi,
         if (!assign_mv(cm, xd, b_mode, mi->bmi[j].as_mv, best_ref_mvs,
                        best_sub8x8, is_compound, allow_hp, r)) {
           xd->corrupted |= 1;
-          break;
+          return;
         }
 
         if (num_4x4_h == 2) mi->bmi[j + 2] = mi->bmi[j];
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index 7db8ed72d5..5a7e9f9ab3 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -66,7 +66,7 @@ void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data,
   {
     int i;
     CHECK_MEM_ERROR(
-        cm, row_mt_worker_data->recon_sync_mutex,
+        &cm->error, row_mt_worker_data->recon_sync_mutex,
         vpx_malloc(sizeof(*row_mt_worker_data->recon_sync_mutex) * num_jobs));
     if (row_mt_worker_data->recon_sync_mutex) {
       for (i = 0; i < num_jobs; ++i) {
@@ -75,7 +75,7 @@ void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data,
     }
 
     CHECK_MEM_ERROR(
-        cm, row_mt_worker_data->recon_sync_cond,
+        &cm->error, row_mt_worker_data->recon_sync_cond,
         vpx_malloc(sizeof(*row_mt_worker_data->recon_sync_cond) * num_jobs));
     if (row_mt_worker_data->recon_sync_cond) {
       for (i = 0; i < num_jobs; ++i) {
@@ -86,24 +86,24 @@ void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data,
 #endif
   row_mt_worker_data->num_sbs = num_sbs;
   for (plane = 0; plane < 3; ++plane) {
-    CHECK_MEM_ERROR(cm, row_mt_worker_data->dqcoeff[plane],
-                    vpx_memalign(16, dqcoeff_size));
+    CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->dqcoeff[plane],
+                    vpx_memalign(32, dqcoeff_size));
     memset(row_mt_worker_data->dqcoeff[plane], 0, dqcoeff_size);
-    CHECK_MEM_ERROR(cm, row_mt_worker_data->eob[plane],
+    CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->eob[plane],
                     vpx_calloc(num_sbs << EOBS_PER_SB_LOG2,
                                sizeof(*row_mt_worker_data->eob[plane])));
   }
-  CHECK_MEM_ERROR(cm, row_mt_worker_data->partition,
+  CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->partition,
                   vpx_calloc(num_sbs * PARTITIONS_PER_SB,
                              sizeof(*row_mt_worker_data->partition)));
-  CHECK_MEM_ERROR(cm, row_mt_worker_data->recon_map,
+  CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->recon_map,
                   vpx_calloc(num_sbs, sizeof(*row_mt_worker_data->recon_map)));
 
   // allocate memory for thread_data
   if (row_mt_worker_data->thread_data == NULL) {
     const size_t thread_size =
         max_threads * sizeof(*row_mt_worker_data->thread_data);
-    CHECK_MEM_ERROR(cm, row_mt_worker_data->thread_data,
+    CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->thread_data,
                     vpx_memalign(32, thread_size));
   }
 }
@@ -181,9 +181,10 @@ VP9Decoder *vp9_decoder_create(BufferPool *const pool) {
 
   cm->error.setjmp = 1;
 
-  CHECK_MEM_ERROR(cm, cm->fc, (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc)));
+  CHECK_MEM_ERROR(&cm->error, cm->fc,
+                  (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc)));
   CHECK_MEM_ERROR(
-      cm, cm->frame_contexts,
+      &cm->error, cm->frame_contexts,
       (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS, sizeof(*cm->frame_contexts)));
 
   pbi->need_resync = 1;
diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h
index b0ef83c73d..2e198d552e 100644
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h
@@ -54,7 +54,7 @@ typedef struct TileWorkerData {
   VP9LfSync *lf_sync;
   DECLARE_ALIGNED(16, MACROBLOCKD, xd);
   /* dqcoeff are shared by all the planes. So planes must be decoded serially */
-  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
+  DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]);
   DECLARE_ALIGNED(16, uint16_t, extend_and_predict_buf[80 * 2 * 80 * 2]);
   struct vpx_internal_error_info error_info;
 } TileWorkerData;
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index c2e6b3d545..d957dc34e3 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -133,17 +133,18 @@ static int decode_coefs(const MACROBLOCKD *xd, PLANE_TYPE type,
   int16_t dqv = dq[0];
   const uint8_t *const cat6_prob =
 #if CONFIG_VP9_HIGHBITDEPTH
-      (xd->bd == VPX_BITS_12)
-          ? vp9_cat6_prob_high12
-          : (xd->bd == VPX_BITS_10) ? vp9_cat6_prob_high12 + 2 :
+      (xd->bd == VPX_BITS_12)   ? vp9_cat6_prob_high12
+      : (xd->bd == VPX_BITS_10) ? vp9_cat6_prob_high12 + 2
+                                :
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-                                    vp9_cat6_prob;
+                                vp9_cat6_prob;
   const int cat6_bits =
 #if CONFIG_VP9_HIGHBITDEPTH
-      (xd->bd == VPX_BITS_12) ? 18
-                              : (xd->bd == VPX_BITS_10) ? 16 :
+      (xd->bd == VPX_BITS_12)   ? 18
+      : (xd->bd == VPX_BITS_10) ? 16
+                                :
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-                                                        14;
+                                14;
   // Keep value, range, and count as locals.  The compiler produces better
   // results with the locals than using r directly.
   BD_VALUE value = r->value;
@@ -271,9 +272,8 @@ static void get_ctx_shift(MACROBLOCKD *xd, int *ctx_shift_a, int *ctx_shift_l,
   }
 }
 
-int vp9_decode_block_tokens(TileWorkerData *twd, int plane,
-                            const scan_order *sc, int x, int y, TX_SIZE tx_size,
-                            int seg_id) {
+int vp9_decode_block_tokens(TileWorkerData *twd, int plane, const ScanOrder *sc,
+                            int x, int y, TX_SIZE tx_size, int seg_id) {
   vpx_reader *r = &twd->bit_reader;
   MACROBLOCKD *xd = &twd->xd;
   struct macroblockd_plane *const pd = &xd->plane[plane];
diff --git a/vp9/decoder/vp9_detokenize.h b/vp9/decoder/vp9_detokenize.h
index a32052ffff..a8e47021b8 100644
--- a/vp9/decoder/vp9_detokenize.h
+++ b/vp9/decoder/vp9_detokenize.h
@@ -19,9 +19,8 @@
 extern "C" {
 #endif
 
-int vp9_decode_block_tokens(TileWorkerData *twd, int plane,
-                            const scan_order *sc, int x, int y, TX_SIZE tx_size,
-                            int seg_id);
+int vp9_decode_block_tokens(TileWorkerData *twd, int plane, const ScanOrder *sc,
+                            int x, int y, TX_SIZE tx_size, int seg_id);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/encoder/arm/neon/vp9_dct_neon.c b/vp9/encoder/arm/neon/vp9_dct_neon.c
new file mode 100644
index 0000000000..997b5477e1
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_dct_neon.c
@@ -0,0 +1,2173 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
+#include "vpx_dsp/arm/fdct4x4_neon.h"
+#include "vpx_dsp/arm/fdct8x8_neon.h"
+#include "vpx_dsp/arm/fdct16x16_neon.h"
+
+static INLINE void load_buffer_4x4(const int16_t *input, int16x8_t *in,
+                                   int stride) {
+  // { 0, 1, 1, 1 };
+  const int16x4_t nonzero_bias_a = vext_s16(vdup_n_s16(0), vdup_n_s16(1), 3);
+  // { 1, 0, 0, 0 };
+  const int16x4_t nonzero_bias_b = vext_s16(vdup_n_s16(1), vdup_n_s16(0), 3);
+  int16x4_t mask;
+
+  int16x4_t input_0 = vshl_n_s16(vld1_s16(input + 0 * stride), 4);
+  int16x4_t input_1 = vshl_n_s16(vld1_s16(input + 1 * stride), 4);
+  int16x4_t input_2 = vshl_n_s16(vld1_s16(input + 2 * stride), 4);
+  int16x4_t input_3 = vshl_n_s16(vld1_s16(input + 3 * stride), 4);
+
+  // Copy the SSE method, use a mask to avoid an 'if' branch here to increase by
+  // one non-zero first elements
+  mask = vreinterpret_s16_u16(vceq_s16(input_0, nonzero_bias_a));
+  input_0 = vadd_s16(input_0, mask);
+  input_0 = vadd_s16(input_0, nonzero_bias_b);
+
+  in[0] = vcombine_s16(input_0, input_1);
+  in[1] = vcombine_s16(input_2, input_3);
+}
+
+static INLINE void write_buffer_4x4(tran_low_t *output, int16x8_t *res) {
+  const int16x8_t one_s16 = vdupq_n_s16(1);
+  res[0] = vaddq_s16(res[0], one_s16);
+  res[1] = vaddq_s16(res[1], one_s16);
+  res[0] = vshrq_n_s16(res[0], 2);
+  res[1] = vshrq_n_s16(res[1], 2);
+  store_s16q_to_tran_low(output + 0 * 8, res[0]);
+  store_s16q_to_tran_low(output + 1 * 8, res[1]);
+}
+
+static INLINE void fadst4x4_neon(int16x8_t *in) {
+  int32x4_t u[4], t[4];
+  int16x4_t s[4], out[4];
+
+  s[0] = vget_low_s16(in[0]);   // | x_00 | x_01 | x_02 | x_03 |
+  s[1] = vget_high_s16(in[0]);  // | x_10 | x_11 | x_12 | x_13 |
+  s[2] = vget_low_s16(in[1]);   // | x_20 | x_21 | x_22 | x_23 |
+  s[3] = vget_high_s16(in[1]);  // | x_30 | x_31 | x_32 | x_33 |
+
+  // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
+  // t0 = s0 * sinpi_1_9 + s1 * sinpi_2_9 + s3 * sinpi_4_9
+  t[0] = vmull_n_s16(s[0], sinpi_1_9);
+  t[0] = vmlal_n_s16(t[0], s[1], sinpi_2_9);
+  t[0] = vmlal_n_s16(t[0], s[3], sinpi_4_9);
+
+  // t1 = (s0 + s1) * sinpi_3_9 - s3 * sinpi_3_9
+  t[1] = vmull_n_s16(s[0], sinpi_3_9);
+  t[1] = vmlal_n_s16(t[1], s[1], sinpi_3_9);
+  t[1] = vmlsl_n_s16(t[1], s[3], sinpi_3_9);
+
+  // t2 = s0 * sinpi_4_9 - s1* sinpi_1_9 + s3 * sinpi_2_9
+  t[2] = vmull_n_s16(s[0], sinpi_4_9);
+  t[2] = vmlsl_n_s16(t[2], s[1], sinpi_1_9);
+  t[2] = vmlal_n_s16(t[2], s[3], sinpi_2_9);
+
+  // t3 = s2 * sinpi_3_9
+  t[3] = vmull_n_s16(s[2], sinpi_3_9);
+
+  /*
+   * u0 = t0 + t3
+   * u1 = t1
+   * u2 = t2 - t3
+   * u3 = t2 - t0 + t3
+   */
+  u[0] = vaddq_s32(t[0], t[3]);
+  u[1] = t[1];
+  u[2] = vsubq_s32(t[2], t[3]);
+  u[3] = vaddq_s32(vsubq_s32(t[2], t[0]), t[3]);
+
+  // fdct_round_shift
+  out[0] = vrshrn_n_s32(u[0], DCT_CONST_BITS);
+  out[1] = vrshrn_n_s32(u[1], DCT_CONST_BITS);
+  out[2] = vrshrn_n_s32(u[2], DCT_CONST_BITS);
+  out[3] = vrshrn_n_s32(u[3], DCT_CONST_BITS);
+
+  transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]);
+
+  in[0] = vcombine_s16(out[0], out[1]);
+  in[1] = vcombine_s16(out[2], out[3]);
+}
+
+void vp9_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride,
+                     int tx_type) {
+  int16x8_t in[2];
+
+  switch (tx_type) {
+    case DCT_DCT: vpx_fdct4x4_neon(input, output, stride); break;
+    case ADST_DCT:
+      load_buffer_4x4(input, in, stride);
+      fadst4x4_neon(in);
+      // pass1 variant is not accurate enough
+      vpx_fdct4x4_pass2_neon((int16x4_t *)in);
+      write_buffer_4x4(output, in);
+      break;
+    case DCT_ADST:
+      load_buffer_4x4(input, in, stride);
+      // pass1 variant is not accurate enough
+      vpx_fdct4x4_pass2_neon((int16x4_t *)in);
+      fadst4x4_neon(in);
+      write_buffer_4x4(output, in);
+      break;
+    default:
+      assert(tx_type == ADST_ADST);
+      load_buffer_4x4(input, in, stride);
+      fadst4x4_neon(in);
+      fadst4x4_neon(in);
+      write_buffer_4x4(output, in);
+      break;
+  }
+}
+
+static INLINE void load_buffer_8x8(const int16_t *input, int16x8_t *in,
+                                   int stride) {
+  in[0] = vshlq_n_s16(vld1q_s16(input + 0 * stride), 2);
+  in[1] = vshlq_n_s16(vld1q_s16(input + 1 * stride), 2);
+  in[2] = vshlq_n_s16(vld1q_s16(input + 2 * stride), 2);
+  in[3] = vshlq_n_s16(vld1q_s16(input + 3 * stride), 2);
+  in[4] = vshlq_n_s16(vld1q_s16(input + 4 * stride), 2);
+  in[5] = vshlq_n_s16(vld1q_s16(input + 5 * stride), 2);
+  in[6] = vshlq_n_s16(vld1q_s16(input + 6 * stride), 2);
+  in[7] = vshlq_n_s16(vld1q_s16(input + 7 * stride), 2);
+}
+
+/* right shift and rounding
+ * first get the sign bit (bit 15).
+ * If bit == 1, it's the simple case of shifting right by one bit.
+ * If bit == 2, it essentially computes the expression:
+ *
+ * out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+ *
+ * for each row.
+ */
+static INLINE void right_shift_8x8(int16x8_t *res, const int bit) {
+  int16x8_t sign0 = vshrq_n_s16(res[0], 15);
+  int16x8_t sign1 = vshrq_n_s16(res[1], 15);
+  int16x8_t sign2 = vshrq_n_s16(res[2], 15);
+  int16x8_t sign3 = vshrq_n_s16(res[3], 15);
+  int16x8_t sign4 = vshrq_n_s16(res[4], 15);
+  int16x8_t sign5 = vshrq_n_s16(res[5], 15);
+  int16x8_t sign6 = vshrq_n_s16(res[6], 15);
+  int16x8_t sign7 = vshrq_n_s16(res[7], 15);
+
+  if (bit == 2) {
+    const int16x8_t const_rounding = vdupq_n_s16(1);
+    res[0] = vaddq_s16(res[0], const_rounding);
+    res[1] = vaddq_s16(res[1], const_rounding);
+    res[2] = vaddq_s16(res[2], const_rounding);
+    res[3] = vaddq_s16(res[3], const_rounding);
+    res[4] = vaddq_s16(res[4], const_rounding);
+    res[5] = vaddq_s16(res[5], const_rounding);
+    res[6] = vaddq_s16(res[6], const_rounding);
+    res[7] = vaddq_s16(res[7], const_rounding);
+  }
+
+  res[0] = vsubq_s16(res[0], sign0);
+  res[1] = vsubq_s16(res[1], sign1);
+  res[2] = vsubq_s16(res[2], sign2);
+  res[3] = vsubq_s16(res[3], sign3);
+  res[4] = vsubq_s16(res[4], sign4);
+  res[5] = vsubq_s16(res[5], sign5);
+  res[6] = vsubq_s16(res[6], sign6);
+  res[7] = vsubq_s16(res[7], sign7);
+
+  if (bit == 1) {
+    res[0] = vshrq_n_s16(res[0], 1);
+    res[1] = vshrq_n_s16(res[1], 1);
+    res[2] = vshrq_n_s16(res[2], 1);
+    res[3] = vshrq_n_s16(res[3], 1);
+    res[4] = vshrq_n_s16(res[4], 1);
+    res[5] = vshrq_n_s16(res[5], 1);
+    res[6] = vshrq_n_s16(res[6], 1);
+    res[7] = vshrq_n_s16(res[7], 1);
+  } else {
+    res[0] = vshrq_n_s16(res[0], 2);
+    res[1] = vshrq_n_s16(res[1], 2);
+    res[2] = vshrq_n_s16(res[2], 2);
+    res[3] = vshrq_n_s16(res[3], 2);
+    res[4] = vshrq_n_s16(res[4], 2);
+    res[5] = vshrq_n_s16(res[5], 2);
+    res[6] = vshrq_n_s16(res[6], 2);
+    res[7] = vshrq_n_s16(res[7], 2);
+  }
+}
+
+static INLINE void write_buffer_8x8(tran_low_t *output, int16x8_t *res,
+                                    int stride) {
+  store_s16q_to_tran_low(output + 0 * stride, res[0]);
+  store_s16q_to_tran_low(output + 1 * stride, res[1]);
+  store_s16q_to_tran_low(output + 2 * stride, res[2]);
+  store_s16q_to_tran_low(output + 3 * stride, res[3]);
+  store_s16q_to_tran_low(output + 4 * stride, res[4]);
+  store_s16q_to_tran_low(output + 5 * stride, res[5]);
+  store_s16q_to_tran_low(output + 6 * stride, res[6]);
+  store_s16q_to_tran_low(output + 7 * stride, res[7]);
+}
+
+static INLINE void fadst8x8_neon(int16x8_t *in) {
+  int16x4_t x_lo[8], x_hi[8];
+  int32x4_t s_lo[8], s_hi[8];
+  int32x4_t t_lo[8], t_hi[8];
+
+  x_lo[0] = vget_low_s16(in[7]);
+  x_hi[0] = vget_high_s16(in[7]);
+  x_lo[1] = vget_low_s16(in[0]);
+  x_hi[1] = vget_high_s16(in[0]);
+  x_lo[2] = vget_low_s16(in[5]);
+  x_hi[2] = vget_high_s16(in[5]);
+  x_lo[3] = vget_low_s16(in[2]);
+  x_hi[3] = vget_high_s16(in[2]);
+  x_lo[4] = vget_low_s16(in[3]);
+  x_hi[4] = vget_high_s16(in[3]);
+  x_lo[5] = vget_low_s16(in[4]);
+  x_hi[5] = vget_high_s16(in[4]);
+  x_lo[6] = vget_low_s16(in[1]);
+  x_hi[6] = vget_high_s16(in[1]);
+  x_lo[7] = vget_low_s16(in[6]);
+  x_hi[7] = vget_high_s16(in[6]);
+
+  // stage 1
+  // s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
+  // s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
+  butterfly_two_coeff_s16_s32_noround(x_lo[0], x_hi[0], x_lo[1], x_hi[1],
+                                      cospi_2_64, cospi_30_64, &s_lo[0],
+                                      &s_hi[0], &s_lo[1], &s_hi[1]);
+
+  // s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+  // s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+  butterfly_two_coeff_s16_s32_noround(x_lo[2], x_hi[2], x_lo[3], x_hi[3],
+                                      cospi_10_64, cospi_22_64, &s_lo[2],
+                                      &s_hi[2], &s_lo[3], &s_hi[3]);
+
+  // s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+  // s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+  butterfly_two_coeff_s16_s32_noround(x_lo[4], x_hi[4], x_lo[5], x_hi[5],
+                                      cospi_18_64, cospi_14_64, &s_lo[4],
+                                      &s_hi[4], &s_lo[5], &s_hi[5]);
+
+  // s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
+  // s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
+  butterfly_two_coeff_s16_s32_noround(x_lo[6], x_hi[6], x_lo[7], x_hi[7],
+                                      cospi_26_64, cospi_6_64, &s_lo[6],
+                                      &s_hi[6], &s_lo[7], &s_hi[7]);
+
+  // fdct_round_shift
+  t_lo[0] = vrshrq_n_s32(vaddq_s32(s_lo[0], s_lo[4]), DCT_CONST_BITS);
+  t_hi[0] = vrshrq_n_s32(vaddq_s32(s_hi[0], s_hi[4]), DCT_CONST_BITS);
+  t_lo[1] = vrshrq_n_s32(vaddq_s32(s_lo[1], s_lo[5]), DCT_CONST_BITS);
+  t_hi[1] = vrshrq_n_s32(vaddq_s32(s_hi[1], s_hi[5]), DCT_CONST_BITS);
+  t_lo[2] = vrshrq_n_s32(vaddq_s32(s_lo[2], s_lo[6]), DCT_CONST_BITS);
+  t_hi[2] = vrshrq_n_s32(vaddq_s32(s_hi[2], s_hi[6]), DCT_CONST_BITS);
+  t_lo[3] = vrshrq_n_s32(vaddq_s32(s_lo[3], s_lo[7]), DCT_CONST_BITS);
+  t_hi[3] = vrshrq_n_s32(vaddq_s32(s_hi[3], s_hi[7]), DCT_CONST_BITS);
+  t_lo[4] = vrshrq_n_s32(vsubq_s32(s_lo[0], s_lo[4]), DCT_CONST_BITS);
+  t_hi[4] = vrshrq_n_s32(vsubq_s32(s_hi[0], s_hi[4]), DCT_CONST_BITS);
+  t_lo[5] = vrshrq_n_s32(vsubq_s32(s_lo[1], s_lo[5]), DCT_CONST_BITS);
+  t_hi[5] = vrshrq_n_s32(vsubq_s32(s_hi[1], s_hi[5]), DCT_CONST_BITS);
+  t_lo[6] = vrshrq_n_s32(vsubq_s32(s_lo[2], s_lo[6]), DCT_CONST_BITS);
+  t_hi[6] = vrshrq_n_s32(vsubq_s32(s_hi[2], s_hi[6]), DCT_CONST_BITS);
+  t_lo[7] = vrshrq_n_s32(vsubq_s32(s_lo[3], s_lo[7]), DCT_CONST_BITS);
+  t_hi[7] = vrshrq_n_s32(vsubq_s32(s_hi[3], s_hi[7]), DCT_CONST_BITS);
+
+  // stage 2
+  s_lo[0] = t_lo[0];
+  s_hi[0] = t_hi[0];
+  s_lo[1] = t_lo[1];
+  s_hi[1] = t_hi[1];
+  s_lo[2] = t_lo[2];
+  s_hi[2] = t_hi[2];
+  s_lo[3] = t_lo[3];
+  s_hi[3] = t_hi[3];
+  // s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
+  // s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
+  butterfly_two_coeff_s32_noround(t_lo[4], t_hi[4], t_lo[5], t_hi[5],
+                                  cospi_8_64, cospi_24_64, &s_lo[4], &s_hi[4],
+                                  &s_lo[5], &s_hi[5]);
+
+  // s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
+  // s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
+  butterfly_two_coeff_s32_noround(t_lo[6], t_hi[6], t_lo[7], t_hi[7],
+                                  -cospi_24_64, cospi_8_64, &s_lo[6], &s_hi[6],
+                                  &s_lo[7], &s_hi[7]);
+
+  // fdct_round_shift
+  // s0 + s2
+  t_lo[0] = vaddq_s32(s_lo[0], s_lo[2]);
+  t_hi[0] = vaddq_s32(s_hi[0], s_hi[2]);
+  // s1 + s3
+  t_lo[1] = vaddq_s32(s_lo[1], s_lo[3]);
+  t_hi[1] = vaddq_s32(s_hi[1], s_hi[3]);
+  // s0 - s2
+  t_lo[2] = vsubq_s32(s_lo[0], s_lo[2]);
+  t_hi[2] = vsubq_s32(s_hi[0], s_hi[2]);
+  // s1 - s3
+  t_lo[3] = vsubq_s32(s_lo[1], s_lo[3]);
+  t_hi[3] = vsubq_s32(s_hi[1], s_hi[3]);
+  // s4 + s6
+  t_lo[4] = vrshrq_n_s32(vaddq_s32(s_lo[4], s_lo[6]), DCT_CONST_BITS);
+  t_hi[4] = vrshrq_n_s32(vaddq_s32(s_hi[4], s_hi[6]), DCT_CONST_BITS);
+  // s5 + s7
+  t_lo[5] = vrshrq_n_s32(vaddq_s32(s_lo[5], s_lo[7]), DCT_CONST_BITS);
+  t_hi[5] = vrshrq_n_s32(vaddq_s32(s_hi[5], s_hi[7]), DCT_CONST_BITS);
+  // s4 - s6
+  t_lo[6] = vrshrq_n_s32(vsubq_s32(s_lo[4], s_lo[6]), DCT_CONST_BITS);
+  t_hi[6] = vrshrq_n_s32(vsubq_s32(s_hi[4], s_hi[6]), DCT_CONST_BITS);
+  // s5 - s7
+  t_lo[7] = vrshrq_n_s32(vsubq_s32(s_lo[5], s_lo[7]), DCT_CONST_BITS);
+  t_hi[7] = vrshrq_n_s32(vsubq_s32(s_hi[5], s_hi[7]), DCT_CONST_BITS);
+
+  // stage 3
+  // cospi_16_64 * (x2 + x3)
+  // cospi_16_64 * (x2 - x3)
+  butterfly_one_coeff_s32_noround(t_lo[2], t_hi[2], t_lo[3], t_hi[3],
+                                  cospi_16_64, &s_lo[2], &s_hi[2], &s_lo[3],
+                                  &s_hi[3]);
+
+  // cospi_16_64 * (x6 + x7)
+  // cospi_16_64 * (x2 - x3)
+  butterfly_one_coeff_s32_noround(t_lo[6], t_hi[6], t_lo[7], t_hi[7],
+                                  cospi_16_64, &s_lo[6], &s_hi[6], &s_lo[7],
+                                  &s_hi[7]);
+
+  // final fdct_round_shift
+  x_lo[2] = vrshrn_n_s32(s_lo[2], DCT_CONST_BITS);
+  x_hi[2] = vrshrn_n_s32(s_hi[2], DCT_CONST_BITS);
+  x_lo[3] = vrshrn_n_s32(s_lo[3], DCT_CONST_BITS);
+  x_hi[3] = vrshrn_n_s32(s_hi[3], DCT_CONST_BITS);
+  x_lo[6] = vrshrn_n_s32(s_lo[6], DCT_CONST_BITS);
+  x_hi[6] = vrshrn_n_s32(s_hi[6], DCT_CONST_BITS);
+  x_lo[7] = vrshrn_n_s32(s_lo[7], DCT_CONST_BITS);
+  x_hi[7] = vrshrn_n_s32(s_hi[7], DCT_CONST_BITS);
+
+  // x0, x1, x4, x5 narrow down to 16-bits directly
+  x_lo[0] = vmovn_s32(t_lo[0]);
+  x_hi[0] = vmovn_s32(t_hi[0]);
+  x_lo[1] = vmovn_s32(t_lo[1]);
+  x_hi[1] = vmovn_s32(t_hi[1]);
+  x_lo[4] = vmovn_s32(t_lo[4]);
+  x_hi[4] = vmovn_s32(t_hi[4]);
+  x_lo[5] = vmovn_s32(t_lo[5]);
+  x_hi[5] = vmovn_s32(t_hi[5]);
+
+  in[0] = vcombine_s16(x_lo[0], x_hi[0]);
+  in[1] = vnegq_s16(vcombine_s16(x_lo[4], x_hi[4]));
+  in[2] = vcombine_s16(x_lo[6], x_hi[6]);
+  in[3] = vnegq_s16(vcombine_s16(x_lo[2], x_hi[2]));
+  in[4] = vcombine_s16(x_lo[3], x_hi[3]);
+  in[5] = vnegq_s16(vcombine_s16(x_lo[7], x_hi[7]));
+  in[6] = vcombine_s16(x_lo[5], x_hi[5]);
+  in[7] = vnegq_s16(vcombine_s16(x_lo[1], x_hi[1]));
+
+  transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+                    &in[7]);
+}
+
+void vp9_fht8x8_neon(const int16_t *input, tran_low_t *output, int stride,
+                     int tx_type) {
+  int16x8_t in[8];
+
+  switch (tx_type) {
+    case DCT_DCT: vpx_fdct8x8_neon(input, output, stride); break;
+    case ADST_DCT:
+      load_buffer_8x8(input, in, stride);
+      fadst8x8_neon(in);
+      // pass1 variant is not accurate enough
+      vpx_fdct8x8_pass2_neon(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+    case DCT_ADST:
+      load_buffer_8x8(input, in, stride);
+      // pass1 variant is not accurate enough
+      vpx_fdct8x8_pass2_neon(in);
+      fadst8x8_neon(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+    default:
+      assert(tx_type == ADST_ADST);
+      load_buffer_8x8(input, in, stride);
+      fadst8x8_neon(in);
+      fadst8x8_neon(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+  }
+}
+
+static INLINE void load_buffer_16x16(const int16_t *input, int16x8_t *in0,
+                                     int16x8_t *in1, int stride) {
+  // load first 8 columns
+  load_buffer_8x8(input, in0, stride);
+  load_buffer_8x8(input + 8 * stride, in0 + 8, stride);
+
+  input += 8;
+  // load second 8 columns
+  load_buffer_8x8(input, in1, stride);
+  load_buffer_8x8(input + 8 * stride, in1 + 8, stride);
+}
+
+static INLINE void write_buffer_16x16(tran_low_t *output, int16x8_t *in0,
+                                      int16x8_t *in1, int stride) {
+  // write first 8 columns
+  write_buffer_8x8(output, in0, stride);
+  write_buffer_8x8(output + 8 * stride, in0 + 8, stride);
+
+  // write second 8 columns
+  output += 8;
+  write_buffer_8x8(output, in1, stride);
+  write_buffer_8x8(output + 8 * stride, in1 + 8, stride);
+}
+
+static INLINE void right_shift_16x16(int16x8_t *res0, int16x8_t *res1) {
+  // perform rounding operations
+  right_shift_8x8(res0, 2);
+  right_shift_8x8(res0 + 8, 2);
+  right_shift_8x8(res1, 2);
+  right_shift_8x8(res1 + 8, 2);
+}
+
+static void fdct16_8col(int16x8_t *in) {
+  // perform 16x16 1-D DCT for 8 columns
+  int16x8_t i[8], s1[8], s2[8], s3[8], t[8];
+  int16x4_t t_lo[8], t_hi[8];
+  int32x4_t u_lo[8], u_hi[8];
+
+  // stage 1
+  i[0] = vaddq_s16(in[0], in[15]);
+  i[1] = vaddq_s16(in[1], in[14]);
+  i[2] = vaddq_s16(in[2], in[13]);
+  i[3] = vaddq_s16(in[3], in[12]);
+  i[4] = vaddq_s16(in[4], in[11]);
+  i[5] = vaddq_s16(in[5], in[10]);
+  i[6] = vaddq_s16(in[6], in[9]);
+  i[7] = vaddq_s16(in[7], in[8]);
+
+  // pass1 variant is not accurate enough
+  vpx_fdct8x8_pass2_neon(i);
+  transpose_s16_8x8(&i[0], &i[1], &i[2], &i[3], &i[4], &i[5], &i[6], &i[7]);
+
+  // step 2
+  s1[0] = vsubq_s16(in[7], in[8]);
+  s1[1] = vsubq_s16(in[6], in[9]);
+  s1[2] = vsubq_s16(in[5], in[10]);
+  s1[3] = vsubq_s16(in[4], in[11]);
+  s1[4] = vsubq_s16(in[3], in[12]);
+  s1[5] = vsubq_s16(in[2], in[13]);
+  s1[6] = vsubq_s16(in[1], in[14]);
+  s1[7] = vsubq_s16(in[0], in[15]);
+
+  t[2] = vsubq_s16(s1[5], s1[2]);
+  t[3] = vsubq_s16(s1[4], s1[3]);
+  t[4] = vaddq_s16(s1[4], s1[3]);
+  t[5] = vaddq_s16(s1[5], s1[2]);
+
+  t_lo[2] = vget_low_s16(t[2]);
+  t_hi[2] = vget_high_s16(t[2]);
+  t_lo[3] = vget_low_s16(t[3]);
+  t_hi[3] = vget_high_s16(t[3]);
+  t_lo[4] = vget_low_s16(t[4]);
+  t_hi[4] = vget_high_s16(t[4]);
+  t_lo[5] = vget_low_s16(t[5]);
+  t_hi[5] = vget_high_s16(t[5]);
+
+  u_lo[2] = vmull_n_s16(t_lo[2], cospi_16_64);
+  u_hi[2] = vmull_n_s16(t_hi[2], cospi_16_64);
+  u_lo[3] = vmull_n_s16(t_lo[3], cospi_16_64);
+  u_hi[3] = vmull_n_s16(t_hi[3], cospi_16_64);
+  u_lo[4] = vmull_n_s16(t_lo[4], cospi_16_64);
+  u_hi[4] = vmull_n_s16(t_hi[4], cospi_16_64);
+  u_lo[5] = vmull_n_s16(t_lo[5], cospi_16_64);
+  u_hi[5] = vmull_n_s16(t_hi[5], cospi_16_64);
+
+  t_lo[2] = vrshrn_n_s32(u_lo[2], DCT_CONST_BITS);
+  t_hi[2] = vrshrn_n_s32(u_hi[2], DCT_CONST_BITS);
+  t_lo[3] = vrshrn_n_s32(u_lo[3], DCT_CONST_BITS);
+  t_hi[3] = vrshrn_n_s32(u_hi[3], DCT_CONST_BITS);
+  t_lo[4] = vrshrn_n_s32(u_lo[4], DCT_CONST_BITS);
+  t_hi[4] = vrshrn_n_s32(u_hi[4], DCT_CONST_BITS);
+  t_lo[5] = vrshrn_n_s32(u_lo[5], DCT_CONST_BITS);
+  t_hi[5] = vrshrn_n_s32(u_hi[5], DCT_CONST_BITS);
+
+  s2[2] = vcombine_s16(t_lo[2], t_hi[2]);
+  s2[3] = vcombine_s16(t_lo[3], t_hi[3]);
+  s2[4] = vcombine_s16(t_lo[4], t_hi[4]);
+  s2[5] = vcombine_s16(t_lo[5], t_hi[5]);
+
+  // step 3
+  s3[0] = vaddq_s16(s1[0], s2[3]);
+  s3[1] = vaddq_s16(s1[1], s2[2]);
+  s3[2] = vsubq_s16(s1[1], s2[2]);
+  s3[3] = vsubq_s16(s1[0], s2[3]);
+  s3[4] = vsubq_s16(s1[7], s2[4]);
+  s3[5] = vsubq_s16(s1[6], s2[5]);
+  s3[6] = vaddq_s16(s1[6], s2[5]);
+  s3[7] = vaddq_s16(s1[7], s2[4]);
+
+  // step 4
+  t_lo[0] = vget_low_s16(s3[0]);
+  t_hi[0] = vget_high_s16(s3[0]);
+  t_lo[1] = vget_low_s16(s3[1]);
+  t_hi[1] = vget_high_s16(s3[1]);
+  t_lo[2] = vget_low_s16(s3[2]);
+  t_hi[2] = vget_high_s16(s3[2]);
+  t_lo[3] = vget_low_s16(s3[3]);
+  t_hi[3] = vget_high_s16(s3[3]);
+  t_lo[4] = vget_low_s16(s3[4]);
+  t_hi[4] = vget_high_s16(s3[4]);
+  t_lo[5] = vget_low_s16(s3[5]);
+  t_hi[5] = vget_high_s16(s3[5]);
+  t_lo[6] = vget_low_s16(s3[6]);
+  t_hi[6] = vget_high_s16(s3[6]);
+  t_lo[7] = vget_low_s16(s3[7]);
+  t_hi[7] = vget_high_s16(s3[7]);
+
+  // u[1] = -cospi_8_64 * t[1] + cospi_24_64 * t[6]
+  // u[6] = cospi_24_64 * t[1] + cospi_8_64 * t[6]
+  butterfly_two_coeff_s16_s32_noround(t_lo[1], t_hi[1], t_lo[6], t_hi[6],
+                                      -cospi_8_64, cospi_24_64, &u_lo[1],
+                                      &u_hi[1], &u_lo[6], &u_hi[6]);
+
+  // u[5] = -cospi_24_64 * t[5] + cospi_8_64 * t[2]
+  // u[2] = cospi_8_64 * t[5]   + cospi_24_64 * t[2]
+  butterfly_two_coeff_s16_s32_noround(t_lo[5], t_hi[5], t_lo[2], t_hi[2],
+                                      -cospi_24_64, cospi_8_64, &u_lo[5],
+                                      &u_hi[5], &u_lo[2], &u_hi[2]);
+
+  t_lo[1] = vrshrn_n_s32(u_lo[1], DCT_CONST_BITS);
+  t_hi[1] = vrshrn_n_s32(u_hi[1], DCT_CONST_BITS);
+  t_lo[2] = vrshrn_n_s32(u_lo[2], DCT_CONST_BITS);
+  t_hi[2] = vrshrn_n_s32(u_hi[2], DCT_CONST_BITS);
+  t_lo[5] = vrshrn_n_s32(u_lo[5], DCT_CONST_BITS);
+  t_hi[5] = vrshrn_n_s32(u_hi[5], DCT_CONST_BITS);
+  t_lo[6] = vrshrn_n_s32(u_lo[6], DCT_CONST_BITS);
+  t_hi[6] = vrshrn_n_s32(u_hi[6], DCT_CONST_BITS);
+
+  s2[1] = vcombine_s16(t_lo[1], t_hi[1]);
+  s2[2] = vcombine_s16(t_lo[2], t_hi[2]);
+  s2[5] = vcombine_s16(t_lo[5], t_hi[5]);
+  s2[6] = vcombine_s16(t_lo[6], t_hi[6]);
+
+  // step 5
+  s1[0] = vaddq_s16(s3[0], s2[1]);
+  s1[1] = vsubq_s16(s3[0], s2[1]);
+  s1[2] = vaddq_s16(s3[3], s2[2]);
+  s1[3] = vsubq_s16(s3[3], s2[2]);
+  s1[4] = vsubq_s16(s3[4], s2[5]);
+  s1[5] = vaddq_s16(s3[4], s2[5]);
+  s1[6] = vsubq_s16(s3[7], s2[6]);
+  s1[7] = vaddq_s16(s3[7], s2[6]);
+
+  // step 6
+  t_lo[0] = vget_low_s16(s1[0]);
+  t_hi[0] = vget_high_s16(s1[0]);
+  t_lo[1] = vget_low_s16(s1[1]);
+  t_hi[1] = vget_high_s16(s1[1]);
+  t_lo[2] = vget_low_s16(s1[2]);
+  t_hi[2] = vget_high_s16(s1[2]);
+  t_lo[3] = vget_low_s16(s1[3]);
+  t_hi[3] = vget_high_s16(s1[3]);
+  t_lo[4] = vget_low_s16(s1[4]);
+  t_hi[4] = vget_high_s16(s1[4]);
+  t_lo[5] = vget_low_s16(s1[5]);
+  t_hi[5] = vget_high_s16(s1[5]);
+  t_lo[6] = vget_low_s16(s1[6]);
+  t_hi[6] = vget_high_s16(s1[6]);
+  t_lo[7] = vget_low_s16(s1[7]);
+  t_hi[7] = vget_high_s16(s1[7]);
+
+  // u[0] = step1[7] * cospi_2_64 + step1[0] * cospi_30_64
+  // u[7] = step1[7] * cospi_30_64 - step1[0] * cospi_2_64
+  butterfly_two_coeff_s16_s32_noround(t_lo[7], t_hi[7], t_lo[0], t_hi[0],
+                                      cospi_2_64, cospi_30_64, &u_lo[0],
+                                      &u_hi[0], &u_lo[7], &u_hi[7]);
+
+  // u[1] = step1[6] * cospi_18_64 + step1[1] * cospi_14_64
+  // u[6] = step1[6] * cospi_14_64 - step1[1] * cospi_18_64
+  butterfly_two_coeff_s16_s32_noround(t_lo[6], t_hi[6], t_lo[1], t_hi[1],
+                                      cospi_18_64, cospi_14_64, &u_lo[1],
+                                      &u_hi[1], &u_lo[6], &u_hi[6]);
+
+  // u[2] = step1[5] * cospi_10_64 + step1[2] * cospi_22_64
+  // u[5] = step1[5] * cospi_22_64 - step1[2] * cospi_10_64
+  butterfly_two_coeff_s16_s32_noround(t_lo[5], t_hi[5], t_lo[2], t_hi[2],
+                                      cospi_10_64, cospi_22_64, &u_lo[2],
+                                      &u_hi[2], &u_lo[5], &u_hi[5]);
+
+  // u[3] = step1[4] * cospi_26_64 + step1[3] * cospi_6_64
+  // u[4] = step1[4] * cospi_6_64  - step1[3] * cospi_26_64
+  butterfly_two_coeff_s16_s32_noround(t_lo[4], t_hi[4], t_lo[3], t_hi[3],
+                                      cospi_26_64, cospi_6_64, &u_lo[3],
+                                      &u_hi[3], &u_lo[4], &u_hi[4]);
+
+  // final fdct_round_shift
+  t_lo[0] = vrshrn_n_s32(u_lo[0], DCT_CONST_BITS);
+  t_hi[0] = vrshrn_n_s32(u_hi[0], DCT_CONST_BITS);
+  t_lo[1] = vrshrn_n_s32(u_lo[1], DCT_CONST_BITS);
+  t_hi[1] = vrshrn_n_s32(u_hi[1], DCT_CONST_BITS);
+  t_lo[2] = vrshrn_n_s32(u_lo[2], DCT_CONST_BITS);
+  t_hi[2] = vrshrn_n_s32(u_hi[2], DCT_CONST_BITS);
+  t_lo[3] = vrshrn_n_s32(u_lo[3], DCT_CONST_BITS);
+  t_hi[3] = vrshrn_n_s32(u_hi[3], DCT_CONST_BITS);
+  t_lo[4] = vrshrn_n_s32(u_lo[4], DCT_CONST_BITS);
+  t_hi[4] = vrshrn_n_s32(u_hi[4], DCT_CONST_BITS);
+  t_lo[5] = vrshrn_n_s32(u_lo[5], DCT_CONST_BITS);
+  t_hi[5] = vrshrn_n_s32(u_hi[5], DCT_CONST_BITS);
+  t_lo[6] = vrshrn_n_s32(u_lo[6], DCT_CONST_BITS);
+  t_hi[6] = vrshrn_n_s32(u_hi[6], DCT_CONST_BITS);
+  t_lo[7] = vrshrn_n_s32(u_lo[7], DCT_CONST_BITS);
+  t_hi[7] = vrshrn_n_s32(u_hi[7], DCT_CONST_BITS);
+
+  in[0] = i[0];
+  in[2] = i[1];
+  in[4] = i[2];
+  in[6] = i[3];
+  in[8] = i[4];
+  in[10] = i[5];
+  in[12] = i[6];
+  in[14] = i[7];
+  in[1] = vcombine_s16(t_lo[0], t_hi[0]);
+  in[3] = vcombine_s16(t_lo[4], t_hi[4]);
+  in[5] = vcombine_s16(t_lo[2], t_hi[2]);
+  in[7] = vcombine_s16(t_lo[6], t_hi[6]);
+  in[9] = vcombine_s16(t_lo[1], t_hi[1]);
+  in[11] = vcombine_s16(t_lo[5], t_hi[5]);
+  in[13] = vcombine_s16(t_lo[3], t_hi[3]);
+  in[15] = vcombine_s16(t_lo[7], t_hi[7]);
+}
+
+static void fadst16_8col(int16x8_t *in) {
+  // perform 16x16 1-D ADST for 8 columns
+  int16x4_t x_lo[16], x_hi[16];
+  int32x4_t s_lo[16], s_hi[16];
+  int32x4_t t_lo[16], t_hi[16];
+
+  x_lo[0] = vget_low_s16(in[15]);
+  x_hi[0] = vget_high_s16(in[15]);
+  x_lo[1] = vget_low_s16(in[0]);
+  x_hi[1] = vget_high_s16(in[0]);
+  x_lo[2] = vget_low_s16(in[13]);
+  x_hi[2] = vget_high_s16(in[13]);
+  x_lo[3] = vget_low_s16(in[2]);
+  x_hi[3] = vget_high_s16(in[2]);
+  x_lo[4] = vget_low_s16(in[11]);
+  x_hi[4] = vget_high_s16(in[11]);
+  x_lo[5] = vget_low_s16(in[4]);
+  x_hi[5] = vget_high_s16(in[4]);
+  x_lo[6] = vget_low_s16(in[9]);
+  x_hi[6] = vget_high_s16(in[9]);
+  x_lo[7] = vget_low_s16(in[6]);
+  x_hi[7] = vget_high_s16(in[6]);
+  x_lo[8] = vget_low_s16(in[7]);
+  x_hi[8] = vget_high_s16(in[7]);
+  x_lo[9] = vget_low_s16(in[8]);
+  x_hi[9] = vget_high_s16(in[8]);
+  x_lo[10] = vget_low_s16(in[5]);
+  x_hi[10] = vget_high_s16(in[5]);
+  x_lo[11] = vget_low_s16(in[10]);
+  x_hi[11] = vget_high_s16(in[10]);
+  x_lo[12] = vget_low_s16(in[3]);
+  x_hi[12] = vget_high_s16(in[3]);
+  x_lo[13] = vget_low_s16(in[12]);
+  x_hi[13] = vget_high_s16(in[12]);
+  x_lo[14] = vget_low_s16(in[1]);
+  x_hi[14] = vget_high_s16(in[1]);
+  x_lo[15] = vget_low_s16(in[14]);
+  x_hi[15] = vget_high_s16(in[14]);
+
+  // stage 1
+  // s0 = cospi_1_64 * x0 + cospi_31_64 * x1;
+  // s1 = cospi_31_64 * x0 - cospi_1_64 * x1;
+  butterfly_two_coeff_s16_s32_noround(x_lo[0], x_hi[0], x_lo[1], x_hi[1],
+                                      cospi_1_64, cospi_31_64, &s_lo[0],
+                                      &s_hi[0], &s_lo[1], &s_hi[1]);
+  // s2 = cospi_5_64 * x2 + cospi_27_64 * x3;
+  // s3 = cospi_27_64 * x2 - cospi_5_64 * x3;
+  butterfly_two_coeff_s16_s32_noround(x_lo[2], x_hi[2], x_lo[3], x_hi[3],
+                                      cospi_5_64, cospi_27_64, &s_lo[2],
+                                      &s_hi[2], &s_lo[3], &s_hi[3]);
+  // s4 = cospi_9_64 * x4 + cospi_23_64 * x5;
+  // s5 = cospi_23_64 * x4 - cospi_9_64 * x5;
+  butterfly_two_coeff_s16_s32_noround(x_lo[4], x_hi[4], x_lo[5], x_hi[5],
+                                      cospi_9_64, cospi_23_64, &s_lo[4],
+                                      &s_hi[4], &s_lo[5], &s_hi[5]);
+  // s6 = cospi_13_64 * x6 + cospi_19_64 * x7;
+  // s7 = cospi_19_64 * x6 - cospi_13_64 * x7;
+  butterfly_two_coeff_s16_s32_noround(x_lo[6], x_hi[6], x_lo[7], x_hi[7],
+                                      cospi_13_64, cospi_19_64, &s_lo[6],
+                                      &s_hi[6], &s_lo[7], &s_hi[7]);
+  // s8 = cospi_17_64 * x8 + cospi_15_64 * x9;
+  // s9 = cospi_15_64 * x8 - cospi_17_64 * x9;
+  butterfly_two_coeff_s16_s32_noround(x_lo[8], x_hi[8], x_lo[9], x_hi[9],
+                                      cospi_17_64, cospi_15_64, &s_lo[8],
+                                      &s_hi[8], &s_lo[9], &s_hi[9]);
+  // s10 = cospi_21_64 * x10 + cospi_11_64 * x11;
+  // s11 = cospi_11_64 * x10 - cospi_21_64 * x11;
+  butterfly_two_coeff_s16_s32_noround(x_lo[10], x_hi[10], x_lo[11], x_hi[11],
+                                      cospi_21_64, cospi_11_64, &s_lo[10],
+                                      &s_hi[10], &s_lo[11], &s_hi[11]);
+  // s12 = cospi_25_64 * x12 + cospi_7_64 * x13;
+  // s13 = cospi_7_64 * x12 - cospi_25_64 * x13;
+  butterfly_two_coeff_s16_s32_noround(x_lo[12], x_hi[12], x_lo[13], x_hi[13],
+                                      cospi_25_64, cospi_7_64, &s_lo[12],
+                                      &s_hi[12], &s_lo[13], &s_hi[13]);
+  // s14 = cospi_29_64 * x14 + cospi_3_64 * x15;
+  // s15 = cospi_3_64 * x14 - cospi_29_64 * x15;
+  butterfly_two_coeff_s16_s32_noround(x_lo[14], x_hi[14], x_lo[15], x_hi[15],
+                                      cospi_29_64, cospi_3_64, &s_lo[14],
+                                      &s_hi[14], &s_lo[15], &s_hi[15]);
+
+  // fdct_round_shift
+  t_lo[0] = vrshrq_n_s32(vaddq_s32(s_lo[0], s_lo[8]), DCT_CONST_BITS);
+  t_hi[0] = vrshrq_n_s32(vaddq_s32(s_hi[0], s_hi[8]), DCT_CONST_BITS);
+  t_lo[1] = vrshrq_n_s32(vaddq_s32(s_lo[1], s_lo[9]), DCT_CONST_BITS);
+  t_hi[1] = vrshrq_n_s32(vaddq_s32(s_hi[1], s_hi[9]), DCT_CONST_BITS);
+  t_lo[2] = vrshrq_n_s32(vaddq_s32(s_lo[2], s_lo[10]), DCT_CONST_BITS);
+  t_hi[2] = vrshrq_n_s32(vaddq_s32(s_hi[2], s_hi[10]), DCT_CONST_BITS);
+  t_lo[3] = vrshrq_n_s32(vaddq_s32(s_lo[3], s_lo[11]), DCT_CONST_BITS);
+  t_hi[3] = vrshrq_n_s32(vaddq_s32(s_hi[3], s_hi[11]), DCT_CONST_BITS);
+  t_lo[4] = vrshrq_n_s32(vaddq_s32(s_lo[4], s_lo[12]), DCT_CONST_BITS);
+  t_hi[4] = vrshrq_n_s32(vaddq_s32(s_hi[4], s_hi[12]), DCT_CONST_BITS);
+  t_lo[5] = vrshrq_n_s32(vaddq_s32(s_lo[5], s_lo[13]), DCT_CONST_BITS);
+  t_hi[5] = vrshrq_n_s32(vaddq_s32(s_hi[5], s_hi[13]), DCT_CONST_BITS);
+  t_lo[6] = vrshrq_n_s32(vaddq_s32(s_lo[6], s_lo[14]), DCT_CONST_BITS);
+  t_hi[6] = vrshrq_n_s32(vaddq_s32(s_hi[6], s_hi[14]), DCT_CONST_BITS);
+  t_lo[7] = vrshrq_n_s32(vaddq_s32(s_lo[7], s_lo[15]), DCT_CONST_BITS);
+  t_hi[7] = vrshrq_n_s32(vaddq_s32(s_hi[7], s_hi[15]), DCT_CONST_BITS);
+  t_lo[8] = vrshrq_n_s32(vsubq_s32(s_lo[0], s_lo[8]), DCT_CONST_BITS);
+  t_hi[8] = vrshrq_n_s32(vsubq_s32(s_hi[0], s_hi[8]), DCT_CONST_BITS);
+  t_lo[9] = vrshrq_n_s32(vsubq_s32(s_lo[1], s_lo[9]), DCT_CONST_BITS);
+  t_hi[9] = vrshrq_n_s32(vsubq_s32(s_hi[1], s_hi[9]), DCT_CONST_BITS);
+  t_lo[10] = vrshrq_n_s32(vsubq_s32(s_lo[2], s_lo[10]), DCT_CONST_BITS);
+  t_hi[10] = vrshrq_n_s32(vsubq_s32(s_hi[2], s_hi[10]), DCT_CONST_BITS);
+  t_lo[11] = vrshrq_n_s32(vsubq_s32(s_lo[3], s_lo[11]), DCT_CONST_BITS);
+  t_hi[11] = vrshrq_n_s32(vsubq_s32(s_hi[3], s_hi[11]), DCT_CONST_BITS);
+  t_lo[12] = vrshrq_n_s32(vsubq_s32(s_lo[4], s_lo[12]), DCT_CONST_BITS);
+  t_hi[12] = vrshrq_n_s32(vsubq_s32(s_hi[4], s_hi[12]), DCT_CONST_BITS);
+  t_lo[13] = vrshrq_n_s32(vsubq_s32(s_lo[5], s_lo[13]), DCT_CONST_BITS);
+  t_hi[13] = vrshrq_n_s32(vsubq_s32(s_hi[5], s_hi[13]), DCT_CONST_BITS);
+  t_lo[14] = vrshrq_n_s32(vsubq_s32(s_lo[6], s_lo[14]), DCT_CONST_BITS);
+  t_hi[14] = vrshrq_n_s32(vsubq_s32(s_hi[6], s_hi[14]), DCT_CONST_BITS);
+  t_lo[15] = vrshrq_n_s32(vsubq_s32(s_lo[7], s_lo[15]), DCT_CONST_BITS);
+  t_hi[15] = vrshrq_n_s32(vsubq_s32(s_hi[7], s_hi[15]), DCT_CONST_BITS);
+
+  // stage 2
+  s_lo[0] = t_lo[0];
+  s_hi[0] = t_hi[0];
+  s_lo[1] = t_lo[1];
+  s_hi[1] = t_hi[1];
+  s_lo[2] = t_lo[2];
+  s_hi[2] = t_hi[2];
+  s_lo[3] = t_lo[3];
+  s_hi[3] = t_hi[3];
+  s_lo[4] = t_lo[4];
+  s_hi[4] = t_hi[4];
+  s_lo[5] = t_lo[5];
+  s_hi[5] = t_hi[5];
+  s_lo[6] = t_lo[6];
+  s_hi[6] = t_hi[6];
+  s_lo[7] = t_lo[7];
+  s_hi[7] = t_hi[7];
+  // s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
+  // s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
+  butterfly_two_coeff_s32_noround(t_lo[8], t_hi[8], t_lo[9], t_hi[9],
+                                  cospi_4_64, cospi_28_64, &s_lo[8], &s_hi[8],
+                                  &s_lo[9], &s_hi[9]);
+  // s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
+  // s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
+  butterfly_two_coeff_s32_noround(t_lo[10], t_hi[10], t_lo[11], t_hi[11],
+                                  cospi_20_64, cospi_12_64, &s_lo[10],
+                                  &s_hi[10], &s_lo[11], &s_hi[11]);
+  // s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
+  // s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
+  butterfly_two_coeff_s32_noround(t_lo[13], t_hi[13], t_lo[12], t_hi[12],
+                                  cospi_28_64, cospi_4_64, &s_lo[13], &s_hi[13],
+                                  &s_lo[12], &s_hi[12]);
+  // s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
+  // s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+  butterfly_two_coeff_s32_noround(t_lo[15], t_hi[15], t_lo[14], t_hi[14],
+                                  cospi_12_64, cospi_20_64, &s_lo[15],
+                                  &s_hi[15], &s_lo[14], &s_hi[14]);
+
+  // s0 + s4
+  t_lo[0] = vaddq_s32(s_lo[0], s_lo[4]);
+  t_hi[0] = vaddq_s32(s_hi[0], s_hi[4]);
+  // s1 + s5
+  t_lo[1] = vaddq_s32(s_lo[1], s_lo[5]);
+  t_hi[1] = vaddq_s32(s_hi[1], s_hi[5]);
+  // s2 + s6
+  t_lo[2] = vaddq_s32(s_lo[2], s_lo[6]);
+  t_hi[2] = vaddq_s32(s_hi[2], s_hi[6]);
+  // s3 + s7
+  t_lo[3] = vaddq_s32(s_lo[3], s_lo[7]);
+  t_hi[3] = vaddq_s32(s_hi[3], s_hi[7]);
+  // s0 - s4
+  t_lo[4] = vsubq_s32(s_lo[0], s_lo[4]);
+  t_hi[4] = vsubq_s32(s_hi[0], s_hi[4]);
+  // s1 - s7
+  t_lo[5] = vsubq_s32(s_lo[1], s_lo[5]);
+  t_hi[5] = vsubq_s32(s_hi[1], s_hi[5]);
+  // s2 - s6
+  t_lo[6] = vsubq_s32(s_lo[2], s_lo[6]);
+  t_hi[6] = vsubq_s32(s_hi[2], s_hi[6]);
+  // s3 - s7
+  t_lo[7] = vsubq_s32(s_lo[3], s_lo[7]);
+  t_hi[7] = vsubq_s32(s_hi[3], s_hi[7]);
+  // s8 + s12
+  t_lo[8] = vaddq_s32(s_lo[8], s_lo[12]);
+  t_hi[8] = vaddq_s32(s_hi[8], s_hi[12]);
+  // s9 + s13
+  t_lo[9] = vaddq_s32(s_lo[9], s_lo[13]);
+  t_hi[9] = vaddq_s32(s_hi[9], s_hi[13]);
+  // s10 + s14
+  t_lo[10] = vaddq_s32(s_lo[10], s_lo[14]);
+  t_hi[10] = vaddq_s32(s_hi[10], s_hi[14]);
+  // s11 + s15
+  t_lo[11] = vaddq_s32(s_lo[11], s_lo[15]);
+  t_hi[11] = vaddq_s32(s_hi[11], s_hi[15]);
+  // s8 + s12
+  t_lo[12] = vsubq_s32(s_lo[8], s_lo[12]);
+  t_hi[12] = vsubq_s32(s_hi[8], s_hi[12]);
+  // s9 + s13
+  t_lo[13] = vsubq_s32(s_lo[9], s_lo[13]);
+  t_hi[13] = vsubq_s32(s_hi[9], s_hi[13]);
+  // s10 + s14
+  t_lo[14] = vsubq_s32(s_lo[10], s_lo[14]);
+  t_hi[14] = vsubq_s32(s_hi[10], s_hi[14]);
+  // s11 + s15
+  t_lo[15] = vsubq_s32(s_lo[11], s_lo[15]);
+  t_hi[15] = vsubq_s32(s_hi[11], s_hi[15]);
+
+  t_lo[8] = vrshrq_n_s32(t_lo[8], DCT_CONST_BITS);
+  t_hi[8] = vrshrq_n_s32(t_hi[8], DCT_CONST_BITS);
+  t_lo[9] = vrshrq_n_s32(t_lo[9], DCT_CONST_BITS);
+  t_hi[9] = vrshrq_n_s32(t_hi[9], DCT_CONST_BITS);
+  t_lo[10] = vrshrq_n_s32(t_lo[10], DCT_CONST_BITS);
+  t_hi[10] = vrshrq_n_s32(t_hi[10], DCT_CONST_BITS);
+  t_lo[11] = vrshrq_n_s32(t_lo[11], DCT_CONST_BITS);
+  t_hi[11] = vrshrq_n_s32(t_hi[11], DCT_CONST_BITS);
+  t_lo[12] = vrshrq_n_s32(t_lo[12], DCT_CONST_BITS);
+  t_hi[12] = vrshrq_n_s32(t_hi[12], DCT_CONST_BITS);
+  t_lo[13] = vrshrq_n_s32(t_lo[13], DCT_CONST_BITS);
+  t_hi[13] = vrshrq_n_s32(t_hi[13], DCT_CONST_BITS);
+  t_lo[14] = vrshrq_n_s32(t_lo[14], DCT_CONST_BITS);
+  t_hi[14] = vrshrq_n_s32(t_hi[14], DCT_CONST_BITS);
+  t_lo[15] = vrshrq_n_s32(t_lo[15], DCT_CONST_BITS);
+  t_hi[15] = vrshrq_n_s32(t_hi[15], DCT_CONST_BITS);
+
+  // stage 3
+  s_lo[0] = t_lo[0];
+  s_hi[0] = t_hi[0];
+  s_lo[1] = t_lo[1];
+  s_hi[1] = t_hi[1];
+  s_lo[2] = t_lo[2];
+  s_hi[2] = t_hi[2];
+  s_lo[3] = t_lo[3];
+  s_hi[3] = t_hi[3];
+  // s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
+  // s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+  butterfly_two_coeff_s32_noround(t_lo[4], t_hi[4], t_lo[5], t_hi[5],
+                                  cospi_8_64, cospi_24_64, &s_lo[4], &s_hi[4],
+                                  &s_lo[5], &s_hi[5]);
+  // s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
+  // s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+  butterfly_two_coeff_s32_noround(t_lo[7], t_hi[7], t_lo[6], t_hi[6],
+                                  cospi_24_64, cospi_8_64, &s_lo[7], &s_hi[7],
+                                  &s_lo[6], &s_hi[6]);
+  s_lo[8] = t_lo[8];
+  s_hi[8] = t_hi[8];
+  s_lo[9] = t_lo[9];
+  s_hi[9] = t_hi[9];
+  s_lo[10] = t_lo[10];
+  s_hi[10] = t_hi[10];
+  s_lo[11] = t_lo[11];
+  s_hi[11] = t_hi[11];
+  // s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
+  // s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+  butterfly_two_coeff_s32_noround(t_lo[12], t_hi[12], t_lo[13], t_hi[13],
+                                  cospi_8_64, cospi_24_64, &s_lo[12], &s_hi[12],
+                                  &s_lo[13], &s_hi[13]);
+  // s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
+  // s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
+  butterfly_two_coeff_s32_noround(t_lo[15], t_hi[15], t_lo[14], t_hi[14],
+                                  cospi_24_64, cospi_8_64, &s_lo[15], &s_hi[15],
+                                  &s_lo[14], &s_hi[14]);
+
+  // s0 + s4
+  t_lo[0] = vaddq_s32(s_lo[0], s_lo[2]);
+  t_hi[0] = vaddq_s32(s_hi[0], s_hi[2]);
+  // s1 + s3
+  t_lo[1] = vaddq_s32(s_lo[1], s_lo[3]);
+  t_hi[1] = vaddq_s32(s_hi[1], s_hi[3]);
+  // s0 - s4
+  t_lo[2] = vsubq_s32(s_lo[0], s_lo[2]);
+  t_hi[2] = vsubq_s32(s_hi[0], s_hi[2]);
+  // s1 - s3
+  t_lo[3] = vsubq_s32(s_lo[1], s_lo[3]);
+  t_hi[3] = vsubq_s32(s_hi[1], s_hi[3]);
+  // s4 + s6
+  t_lo[4] = vaddq_s32(s_lo[4], s_lo[6]);
+  t_hi[4] = vaddq_s32(s_hi[4], s_hi[6]);
+  // s5 + s7
+  t_lo[5] = vaddq_s32(s_lo[5], s_lo[7]);
+  t_hi[5] = vaddq_s32(s_hi[5], s_hi[7]);
+  // s4 - s6
+  t_lo[6] = vsubq_s32(s_lo[4], s_lo[6]);
+  t_hi[6] = vsubq_s32(s_hi[4], s_hi[6]);
+  // s5 - s7
+  t_lo[7] = vsubq_s32(s_lo[5], s_lo[7]);
+  t_hi[7] = vsubq_s32(s_hi[5], s_hi[7]);
+  // s8 + s10
+  t_lo[8] = vaddq_s32(s_lo[8], s_lo[10]);
+  t_hi[8] = vaddq_s32(s_hi[8], s_hi[10]);
+  // s9 + s11
+  t_lo[9] = vaddq_s32(s_lo[9], s_lo[11]);
+  t_hi[9] = vaddq_s32(s_hi[9], s_hi[11]);
+  // s8 - s10
+  t_lo[10] = vsubq_s32(s_lo[8], s_lo[10]);
+  t_hi[10] = vsubq_s32(s_hi[8], s_hi[10]);
+  // s9 - s11
+  t_lo[11] = vsubq_s32(s_lo[9], s_lo[11]);
+  t_hi[11] = vsubq_s32(s_hi[9], s_hi[11]);
+  // s12 + s14
+  t_lo[12] = vaddq_s32(s_lo[12], s_lo[14]);
+  t_hi[12] = vaddq_s32(s_hi[12], s_hi[14]);
+  // s13 + s15
+  t_lo[13] = vaddq_s32(s_lo[13], s_lo[15]);
+  t_hi[13] = vaddq_s32(s_hi[13], s_hi[15]);
+  // s12 - s14
+  t_lo[14] = vsubq_s32(s_lo[12], s_lo[14]);
+  t_hi[14] = vsubq_s32(s_hi[12], s_hi[14]);
+  // s13 - s15
+  t_lo[15] = vsubq_s32(s_lo[13], s_lo[15]);
+  t_hi[15] = vsubq_s32(s_hi[13], s_hi[15]);
+
+  t_lo[4] = vrshrq_n_s32(t_lo[4], DCT_CONST_BITS);
+  t_hi[4] = vrshrq_n_s32(t_hi[4], DCT_CONST_BITS);
+  t_lo[5] = vrshrq_n_s32(t_lo[5], DCT_CONST_BITS);
+  t_hi[5] = vrshrq_n_s32(t_hi[5], DCT_CONST_BITS);
+  t_lo[6] = vrshrq_n_s32(t_lo[6], DCT_CONST_BITS);
+  t_hi[6] = vrshrq_n_s32(t_hi[6], DCT_CONST_BITS);
+  t_lo[7] = vrshrq_n_s32(t_lo[7], DCT_CONST_BITS);
+  t_hi[7] = vrshrq_n_s32(t_hi[7], DCT_CONST_BITS);
+  t_lo[12] = vrshrq_n_s32(t_lo[12], DCT_CONST_BITS);
+  t_hi[12] = vrshrq_n_s32(t_hi[12], DCT_CONST_BITS);
+  t_lo[13] = vrshrq_n_s32(t_lo[13], DCT_CONST_BITS);
+  t_hi[13] = vrshrq_n_s32(t_hi[13], DCT_CONST_BITS);
+  t_lo[14] = vrshrq_n_s32(t_lo[14], DCT_CONST_BITS);
+  t_hi[14] = vrshrq_n_s32(t_hi[14], DCT_CONST_BITS);
+  t_lo[15] = vrshrq_n_s32(t_lo[15], DCT_CONST_BITS);
+  t_hi[15] = vrshrq_n_s32(t_hi[15], DCT_CONST_BITS);
+
+  // stage 4
+  // s2 = (-cospi_16_64) * (x2 + x3);
+  // s3 = cospi_16_64 * (x2 - x3);
+  butterfly_one_coeff_s32_noround(t_lo[3], t_hi[3], t_lo[2], t_hi[2],
+                                  -cospi_16_64, &s_lo[2], &s_hi[2], &s_lo[3],
+                                  &s_hi[3]);
+  // s6 = cospi_16_64 * (x6 + x7);
+  // s7 = cospi_16_64 * (-x6 + x7);
+  butterfly_one_coeff_s32_noround(t_lo[7], t_hi[7], t_lo[6], t_hi[6],
+                                  cospi_16_64, &s_lo[6], &s_hi[6], &s_lo[7],
+                                  &s_hi[7]);
+  // s10 = cospi_16_64 * (x10 + x11);
+  // s11 = cospi_16_64 * (-x10 + x11);
+  butterfly_one_coeff_s32_noround(t_lo[11], t_hi[11], t_lo[10], t_hi[10],
+                                  cospi_16_64, &s_lo[10], &s_hi[10], &s_lo[11],
+                                  &s_hi[11]);
+  // s14 = (-cospi_16_64) * (x14 + x15);
+  // s15 = cospi_16_64 * (x14 - x15);
+  butterfly_one_coeff_s32_noround(t_lo[15], t_hi[15], t_lo[14], t_hi[14],
+                                  -cospi_16_64, &s_lo[14], &s_hi[14], &s_lo[15],
+                                  &s_hi[15]);
+
+  // final fdct_round_shift
+  x_lo[2] = vrshrn_n_s32(s_lo[2], DCT_CONST_BITS);
+  x_hi[2] = vrshrn_n_s32(s_hi[2], DCT_CONST_BITS);
+  x_lo[3] = vrshrn_n_s32(s_lo[3], DCT_CONST_BITS);
+  x_hi[3] = vrshrn_n_s32(s_hi[3], DCT_CONST_BITS);
+  x_lo[6] = vrshrn_n_s32(s_lo[6], DCT_CONST_BITS);
+  x_hi[6] = vrshrn_n_s32(s_hi[6], DCT_CONST_BITS);
+  x_lo[7] = vrshrn_n_s32(s_lo[7], DCT_CONST_BITS);
+  x_hi[7] = vrshrn_n_s32(s_hi[7], DCT_CONST_BITS);
+  x_lo[10] = vrshrn_n_s32(s_lo[10], DCT_CONST_BITS);
+  x_hi[10] = vrshrn_n_s32(s_hi[10], DCT_CONST_BITS);
+  x_lo[11] = vrshrn_n_s32(s_lo[11], DCT_CONST_BITS);
+  x_hi[11] = vrshrn_n_s32(s_hi[11], DCT_CONST_BITS);
+  x_lo[14] = vrshrn_n_s32(s_lo[14], DCT_CONST_BITS);
+  x_hi[14] = vrshrn_n_s32(s_hi[14], DCT_CONST_BITS);
+  x_lo[15] = vrshrn_n_s32(s_lo[15], DCT_CONST_BITS);
+  x_hi[15] = vrshrn_n_s32(s_hi[15], DCT_CONST_BITS);
+
+  // x0, x1, x4, x5, x8, x9, x12, x13 narrow down to 16-bits directly
+  x_lo[0] = vmovn_s32(t_lo[0]);
+  x_hi[0] = vmovn_s32(t_hi[0]);
+  x_lo[1] = vmovn_s32(t_lo[1]);
+  x_hi[1] = vmovn_s32(t_hi[1]);
+  x_lo[4] = vmovn_s32(t_lo[4]);
+  x_hi[4] = vmovn_s32(t_hi[4]);
+  x_lo[5] = vmovn_s32(t_lo[5]);
+  x_hi[5] = vmovn_s32(t_hi[5]);
+  x_lo[8] = vmovn_s32(t_lo[8]);
+  x_hi[8] = vmovn_s32(t_hi[8]);
+  x_lo[9] = vmovn_s32(t_lo[9]);
+  x_hi[9] = vmovn_s32(t_hi[9]);
+  x_lo[12] = vmovn_s32(t_lo[12]);
+  x_hi[12] = vmovn_s32(t_hi[12]);
+  x_lo[13] = vmovn_s32(t_lo[13]);
+  x_hi[13] = vmovn_s32(t_hi[13]);
+
+  in[0] = vcombine_s16(x_lo[0], x_hi[0]);
+  in[1] = vnegq_s16(vcombine_s16(x_lo[8], x_hi[8]));
+  in[2] = vcombine_s16(x_lo[12], x_hi[12]);
+  in[3] = vnegq_s16(vcombine_s16(x_lo[4], x_hi[4]));
+  in[4] = vcombine_s16(x_lo[6], x_hi[6]);
+  in[5] = vcombine_s16(x_lo[14], x_hi[14]);
+  in[6] = vcombine_s16(x_lo[10], x_hi[10]);
+  in[7] = vcombine_s16(x_lo[2], x_hi[2]);
+  in[8] = vcombine_s16(x_lo[3], x_hi[3]);
+  in[9] = vcombine_s16(x_lo[11], x_hi[11]);
+  in[10] = vcombine_s16(x_lo[15], x_hi[15]);
+  in[11] = vcombine_s16(x_lo[7], x_hi[7]);
+  in[12] = vcombine_s16(x_lo[5], x_hi[5]);
+  in[13] = vnegq_s16(vcombine_s16(x_lo[13], x_hi[13]));
+  in[14] = vcombine_s16(x_lo[9], x_hi[9]);
+  in[15] = vnegq_s16(vcombine_s16(x_lo[1], x_hi[1]));
+}
+
+static void fdct16x16_neon(int16x8_t *in0, int16x8_t *in1) {
+  // Left half.
+  fdct16_8col(in0);
+  // Right half.
+  fdct16_8col(in1);
+  transpose_s16_16x16(in0, in1);
+}
+
+static void fadst16x16_neon(int16x8_t *in0, int16x8_t *in1) {
+  fadst16_8col(in0);
+  fadst16_8col(in1);
+  transpose_s16_16x16(in0, in1);
+}
+
+void vp9_fht16x16_neon(const int16_t *input, tran_low_t *output, int stride,
+                       int tx_type) {
+  int16x8_t in0[16], in1[16];
+
+  switch (tx_type) {
+    case DCT_DCT: vpx_fdct16x16_neon(input, output, stride); break;
+    case ADST_DCT:
+      load_buffer_16x16(input, in0, in1, stride);
+      fadst16x16_neon(in0, in1);
+      right_shift_16x16(in0, in1);
+      fdct16x16_neon(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+    case DCT_ADST:
+      load_buffer_16x16(input, in0, in1, stride);
+      fdct16x16_neon(in0, in1);
+      right_shift_16x16(in0, in1);
+      fadst16x16_neon(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+    default:
+      assert(tx_type == ADST_ADST);
+      load_buffer_16x16(input, in0, in1, stride);
+      fadst16x16_neon(in0, in1);
+      right_shift_16x16(in0, in1);
+      fadst16x16_neon(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+static INLINE void highbd_load_buffer_4x4(const int16_t *input,
+                                          int32x4_t *in /*[4]*/, int stride) {
+  // { 0, 1, 1, 1 };
+  const int32x4_t nonzero_bias_a = vextq_s32(vdupq_n_s32(0), vdupq_n_s32(1), 3);
+  // { 1, 0, 0, 0 };
+  const int32x4_t nonzero_bias_b = vextq_s32(vdupq_n_s32(1), vdupq_n_s32(0), 3);
+  int32x4_t mask;
+
+  in[0] = vshll_n_s16(vld1_s16(input + 0 * stride), 4);
+  in[1] = vshll_n_s16(vld1_s16(input + 1 * stride), 4);
+  in[2] = vshll_n_s16(vld1_s16(input + 2 * stride), 4);
+  in[3] = vshll_n_s16(vld1_s16(input + 3 * stride), 4);
+
+  // Copy the SSE method, use a mask to avoid an 'if' branch here to increase by
+  // one non-zero first elements
+  mask = vreinterpretq_s32_u32(vceqq_s32(in[0], nonzero_bias_a));
+  in[0] = vaddq_s32(in[0], mask);
+  in[0] = vaddq_s32(in[0], nonzero_bias_b);
+}
+
+static INLINE void highbd_write_buffer_4x4(tran_low_t *output, int32x4_t *res) {
+  const int32x4_t one = vdupq_n_s32(1);
+  res[0] = vshrq_n_s32(vaddq_s32(res[0], one), 2);
+  res[1] = vshrq_n_s32(vaddq_s32(res[1], one), 2);
+  res[2] = vshrq_n_s32(vaddq_s32(res[2], one), 2);
+  res[3] = vshrq_n_s32(vaddq_s32(res[3], one), 2);
+  vst1q_s32(output + 0 * 4, res[0]);
+  vst1q_s32(output + 1 * 4, res[1]);
+  vst1q_s32(output + 2 * 4, res[2]);
+  vst1q_s32(output + 3 * 4, res[3]);
+}
+
+static INLINE void highbd_fadst4x4_neon(int32x4_t *in /*[4]*/) {
+  int32x2_t s_lo[4], s_hi[4];
+  int64x2_t u_lo[4], u_hi[4], t_lo[4], t_hi[4];
+
+  s_lo[0] = vget_low_s32(in[0]);
+  s_hi[0] = vget_high_s32(in[0]);
+  s_lo[1] = vget_low_s32(in[1]);
+  s_hi[1] = vget_high_s32(in[1]);
+  s_lo[2] = vget_low_s32(in[2]);
+  s_hi[2] = vget_high_s32(in[2]);
+  s_lo[3] = vget_low_s32(in[3]);
+  s_hi[3] = vget_high_s32(in[3]);
+
+  // t0 = s0 * sinpi_1_9 + s1 * sinpi_2_9 + s3 * sinpi_4_9
+  t_lo[0] = vmull_n_s32(s_lo[0], sinpi_1_9);
+  t_lo[0] = vmlal_n_s32(t_lo[0], s_lo[1], sinpi_2_9);
+  t_lo[0] = vmlal_n_s32(t_lo[0], s_lo[3], sinpi_4_9);
+  t_hi[0] = vmull_n_s32(s_hi[0], sinpi_1_9);
+  t_hi[0] = vmlal_n_s32(t_hi[0], s_hi[1], sinpi_2_9);
+  t_hi[0] = vmlal_n_s32(t_hi[0], s_hi[3], sinpi_4_9);
+
+  // t1 = (s0 + s1) * sinpi_3_9 - s3 * sinpi_3_9
+  t_lo[1] = vmull_n_s32(s_lo[0], sinpi_3_9);
+  t_lo[1] = vmlal_n_s32(t_lo[1], s_lo[1], sinpi_3_9);
+  t_lo[1] = vmlsl_n_s32(t_lo[1], s_lo[3], sinpi_3_9);
+  t_hi[1] = vmull_n_s32(s_hi[0], sinpi_3_9);
+  t_hi[1] = vmlal_n_s32(t_hi[1], s_hi[1], sinpi_3_9);
+  t_hi[1] = vmlsl_n_s32(t_hi[1], s_hi[3], sinpi_3_9);
+
+  // t2 = s0 * sinpi_4_9 - s1* sinpi_1_9 + s3 * sinpi_2_9
+  t_lo[2] = vmull_n_s32(s_lo[0], sinpi_4_9);
+  t_lo[2] = vmlsl_n_s32(t_lo[2], s_lo[1], sinpi_1_9);
+  t_lo[2] = vmlal_n_s32(t_lo[2], s_lo[3], sinpi_2_9);
+  t_hi[2] = vmull_n_s32(s_hi[0], sinpi_4_9);
+  t_hi[2] = vmlsl_n_s32(t_hi[2], s_hi[1], sinpi_1_9);
+  t_hi[2] = vmlal_n_s32(t_hi[2], s_hi[3], sinpi_2_9);
+
+  // t3 = s2 * sinpi_3_9
+  t_lo[3] = vmull_n_s32(s_lo[2], sinpi_3_9);
+  t_hi[3] = vmull_n_s32(s_hi[2], sinpi_3_9);
+
+  /*
+   * u0 = t0 + t3
+   * u1 = t1
+   * u2 = t2 - t3
+   * u3 = t2 - t0 + t3
+   */
+  u_lo[0] = vaddq_s64(t_lo[0], t_lo[3]);
+  u_hi[0] = vaddq_s64(t_hi[0], t_hi[3]);
+  u_lo[1] = t_lo[1];
+  u_hi[1] = t_hi[1];
+  u_lo[2] = vsubq_s64(t_lo[2], t_lo[3]);
+  u_hi[2] = vsubq_s64(t_hi[2], t_hi[3]);
+  u_lo[3] = vaddq_s64(vsubq_s64(t_lo[2], t_lo[0]), t_lo[3]);
+  u_hi[3] = vaddq_s64(vsubq_s64(t_hi[2], t_hi[0]), t_hi[3]);
+
+  // fdct_round_shift
+  in[0] = vcombine_s32(vrshrn_n_s64(u_lo[0], DCT_CONST_BITS),
+                       vrshrn_n_s64(u_hi[0], DCT_CONST_BITS));
+  in[1] = vcombine_s32(vrshrn_n_s64(u_lo[1], DCT_CONST_BITS),
+                       vrshrn_n_s64(u_hi[1], DCT_CONST_BITS));
+  in[2] = vcombine_s32(vrshrn_n_s64(u_lo[2], DCT_CONST_BITS),
+                       vrshrn_n_s64(u_hi[2], DCT_CONST_BITS));
+  in[3] = vcombine_s32(vrshrn_n_s64(u_lo[3], DCT_CONST_BITS),
+                       vrshrn_n_s64(u_hi[3], DCT_CONST_BITS));
+
+  transpose_s32_4x4(&in[0], &in[1], &in[2], &in[3]);
+}
+
+void vp9_highbd_fht4x4_neon(const int16_t *input, tran_low_t *output,
+                            int stride, int tx_type) {
+  int32x4_t in[4];
+  // int i;
+
+  switch (tx_type) {
+    case DCT_DCT: vpx_highbd_fdct4x4_neon(input, output, stride); break;
+    case ADST_DCT:
+      highbd_load_buffer_4x4(input, in, stride);
+      highbd_fadst4x4_neon(in);
+      vpx_highbd_fdct4x4_pass1_neon(in);
+      highbd_write_buffer_4x4(output, in);
+      break;
+    case DCT_ADST:
+      highbd_load_buffer_4x4(input, in, stride);
+      vpx_highbd_fdct4x4_pass1_neon(in);
+      highbd_fadst4x4_neon(in);
+      highbd_write_buffer_4x4(output, in);
+      break;
+    default:
+      assert(tx_type == ADST_ADST);
+      highbd_load_buffer_4x4(input, in, stride);
+      highbd_fadst4x4_neon(in);
+      highbd_fadst4x4_neon(in);
+      highbd_write_buffer_4x4(output, in);
+      break;
+  }
+}
+
+static INLINE void highbd_load_buffer_8x8(const int16_t *input,
+                                          int32x4_t *lo /*[8]*/,
+                                          int32x4_t *hi /*[8]*/, int stride) {
+  int16x8_t in[8];
+  in[0] = vld1q_s16(input + 0 * stride);
+  in[1] = vld1q_s16(input + 1 * stride);
+  in[2] = vld1q_s16(input + 2 * stride);
+  in[3] = vld1q_s16(input + 3 * stride);
+  in[4] = vld1q_s16(input + 4 * stride);
+  in[5] = vld1q_s16(input + 5 * stride);
+  in[6] = vld1q_s16(input + 6 * stride);
+  in[7] = vld1q_s16(input + 7 * stride);
+  lo[0] = vshll_n_s16(vget_low_s16(in[0]), 2);
+  hi[0] = vshll_n_s16(vget_high_s16(in[0]), 2);
+  lo[1] = vshll_n_s16(vget_low_s16(in[1]), 2);
+  hi[1] = vshll_n_s16(vget_high_s16(in[1]), 2);
+  lo[2] = vshll_n_s16(vget_low_s16(in[2]), 2);
+  hi[2] = vshll_n_s16(vget_high_s16(in[2]), 2);
+  lo[3] = vshll_n_s16(vget_low_s16(in[3]), 2);
+  hi[3] = vshll_n_s16(vget_high_s16(in[3]), 2);
+  lo[4] = vshll_n_s16(vget_low_s16(in[4]), 2);
+  hi[4] = vshll_n_s16(vget_high_s16(in[4]), 2);
+  lo[5] = vshll_n_s16(vget_low_s16(in[5]), 2);
+  hi[5] = vshll_n_s16(vget_high_s16(in[5]), 2);
+  lo[6] = vshll_n_s16(vget_low_s16(in[6]), 2);
+  hi[6] = vshll_n_s16(vget_high_s16(in[6]), 2);
+  lo[7] = vshll_n_s16(vget_low_s16(in[7]), 2);
+  hi[7] = vshll_n_s16(vget_high_s16(in[7]), 2);
+}
+
+/* right shift and rounding
+ * first get the sign bit (bit 15).
+ * If bit == 1, it's the simple case of shifting right by one bit.
+ * If bit == 2, it essentially computes the expression:
+ *
+ * out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+ *
+ * for each row.
+ */
+static INLINE void highbd_right_shift_8x8(int32x4_t *lo, int32x4_t *hi,
+                                          const int bit) {
+  int32x4_t sign_lo[8], sign_hi[8];
+  sign_lo[0] = vshrq_n_s32(lo[0], 31);
+  sign_hi[0] = vshrq_n_s32(hi[0], 31);
+  sign_lo[1] = vshrq_n_s32(lo[1], 31);
+  sign_hi[1] = vshrq_n_s32(hi[1], 31);
+  sign_lo[2] = vshrq_n_s32(lo[2], 31);
+  sign_hi[2] = vshrq_n_s32(hi[2], 31);
+  sign_lo[3] = vshrq_n_s32(lo[3], 31);
+  sign_hi[3] = vshrq_n_s32(hi[3], 31);
+  sign_lo[4] = vshrq_n_s32(lo[4], 31);
+  sign_hi[4] = vshrq_n_s32(hi[4], 31);
+  sign_lo[5] = vshrq_n_s32(lo[5], 31);
+  sign_hi[5] = vshrq_n_s32(hi[5], 31);
+  sign_lo[6] = vshrq_n_s32(lo[6], 31);
+  sign_hi[6] = vshrq_n_s32(hi[6], 31);
+  sign_lo[7] = vshrq_n_s32(lo[7], 31);
+  sign_hi[7] = vshrq_n_s32(hi[7], 31);
+
+  if (bit == 2) {
+    const int32x4_t const_rounding = vdupq_n_s32(1);
+    lo[0] = vaddq_s32(lo[0], const_rounding);
+    hi[0] = vaddq_s32(hi[0], const_rounding);
+    lo[1] = vaddq_s32(lo[1], const_rounding);
+    hi[1] = vaddq_s32(hi[1], const_rounding);
+    lo[2] = vaddq_s32(lo[2], const_rounding);
+    hi[2] = vaddq_s32(hi[2], const_rounding);
+    lo[3] = vaddq_s32(lo[3], const_rounding);
+    hi[3] = vaddq_s32(hi[3], const_rounding);
+    lo[4] = vaddq_s32(lo[4], const_rounding);
+    hi[4] = vaddq_s32(hi[4], const_rounding);
+    lo[5] = vaddq_s32(lo[5], const_rounding);
+    hi[5] = vaddq_s32(hi[5], const_rounding);
+    lo[6] = vaddq_s32(lo[6], const_rounding);
+    hi[6] = vaddq_s32(hi[6], const_rounding);
+    lo[7] = vaddq_s32(lo[7], const_rounding);
+    hi[7] = vaddq_s32(hi[7], const_rounding);
+  }
+
+  lo[0] = vsubq_s32(lo[0], sign_lo[0]);
+  hi[0] = vsubq_s32(hi[0], sign_hi[0]);
+  lo[1] = vsubq_s32(lo[1], sign_lo[1]);
+  hi[1] = vsubq_s32(hi[1], sign_hi[1]);
+  lo[2] = vsubq_s32(lo[2], sign_lo[2]);
+  hi[2] = vsubq_s32(hi[2], sign_hi[2]);
+  lo[3] = vsubq_s32(lo[3], sign_lo[3]);
+  hi[3] = vsubq_s32(hi[3], sign_hi[3]);
+  lo[4] = vsubq_s32(lo[4], sign_lo[4]);
+  hi[4] = vsubq_s32(hi[4], sign_hi[4]);
+  lo[5] = vsubq_s32(lo[5], sign_lo[5]);
+  hi[5] = vsubq_s32(hi[5], sign_hi[5]);
+  lo[6] = vsubq_s32(lo[6], sign_lo[6]);
+  hi[6] = vsubq_s32(hi[6], sign_hi[6]);
+  lo[7] = vsubq_s32(lo[7], sign_lo[7]);
+  hi[7] = vsubq_s32(hi[7], sign_hi[7]);
+
+  if (bit == 1) {
+    lo[0] = vshrq_n_s32(lo[0], 1);
+    hi[0] = vshrq_n_s32(hi[0], 1);
+    lo[1] = vshrq_n_s32(lo[1], 1);
+    hi[1] = vshrq_n_s32(hi[1], 1);
+    lo[2] = vshrq_n_s32(lo[2], 1);
+    hi[2] = vshrq_n_s32(hi[2], 1);
+    lo[3] = vshrq_n_s32(lo[3], 1);
+    hi[3] = vshrq_n_s32(hi[3], 1);
+    lo[4] = vshrq_n_s32(lo[4], 1);
+    hi[4] = vshrq_n_s32(hi[4], 1);
+    lo[5] = vshrq_n_s32(lo[5], 1);
+    hi[5] = vshrq_n_s32(hi[5], 1);
+    lo[6] = vshrq_n_s32(lo[6], 1);
+    hi[6] = vshrq_n_s32(hi[6], 1);
+    lo[7] = vshrq_n_s32(lo[7], 1);
+    hi[7] = vshrq_n_s32(hi[7], 1);
+  } else {
+    lo[0] = vshrq_n_s32(lo[0], 2);
+    hi[0] = vshrq_n_s32(hi[0], 2);
+    lo[1] = vshrq_n_s32(lo[1], 2);
+    hi[1] = vshrq_n_s32(hi[1], 2);
+    lo[2] = vshrq_n_s32(lo[2], 2);
+    hi[2] = vshrq_n_s32(hi[2], 2);
+    lo[3] = vshrq_n_s32(lo[3], 2);
+    hi[3] = vshrq_n_s32(hi[3], 2);
+    lo[4] = vshrq_n_s32(lo[4], 2);
+    hi[4] = vshrq_n_s32(hi[4], 2);
+    lo[5] = vshrq_n_s32(lo[5], 2);
+    hi[5] = vshrq_n_s32(hi[5], 2);
+    lo[6] = vshrq_n_s32(lo[6], 2);
+    hi[6] = vshrq_n_s32(hi[6], 2);
+    lo[7] = vshrq_n_s32(lo[7], 2);
+    hi[7] = vshrq_n_s32(hi[7], 2);
+  }
+}
+
+static INLINE void highbd_write_buffer_8x8(tran_low_t *output, int32x4_t *lo,
+                                           int32x4_t *hi, int stride) {
+  vst1q_s32(output + 0 * stride, lo[0]);
+  vst1q_s32(output + 0 * stride + 4, hi[0]);
+  vst1q_s32(output + 1 * stride, lo[1]);
+  vst1q_s32(output + 1 * stride + 4, hi[1]);
+  vst1q_s32(output + 2 * stride, lo[2]);
+  vst1q_s32(output + 2 * stride + 4, hi[2]);
+  vst1q_s32(output + 3 * stride, lo[3]);
+  vst1q_s32(output + 3 * stride + 4, hi[3]);
+  vst1q_s32(output + 4 * stride, lo[4]);
+  vst1q_s32(output + 4 * stride + 4, hi[4]);
+  vst1q_s32(output + 5 * stride, lo[5]);
+  vst1q_s32(output + 5 * stride + 4, hi[5]);
+  vst1q_s32(output + 6 * stride, lo[6]);
+  vst1q_s32(output + 6 * stride + 4, hi[6]);
+  vst1q_s32(output + 7 * stride, lo[7]);
+  vst1q_s32(output + 7 * stride + 4, hi[7]);
+}
+
+static INLINE void highbd_fadst8x8_neon(int32x4_t *lo /*[8]*/,
+                                        int32x4_t *hi /*[8]*/) {
+  int32x4_t s_lo[8], s_hi[8];
+  int32x4_t t_lo[8], t_hi[8];
+  int32x4_t x_lo[8], x_hi[8];
+  int64x2_t s64_lo[16], s64_hi[16];
+
+  x_lo[0] = lo[7];
+  x_hi[0] = hi[7];
+  x_lo[1] = lo[0];
+  x_hi[1] = hi[0];
+  x_lo[2] = lo[5];
+  x_hi[2] = hi[5];
+  x_lo[3] = lo[2];
+  x_hi[3] = hi[2];
+  x_lo[4] = lo[3];
+  x_hi[4] = hi[3];
+  x_lo[5] = lo[4];
+  x_hi[5] = hi[4];
+  x_lo[6] = lo[1];
+  x_hi[6] = hi[1];
+  x_lo[7] = lo[6];
+  x_hi[7] = hi[6];
+
+  // stage 1
+  // s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
+  // s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
+  butterfly_two_coeff_s32_s64_noround(
+      x_lo[0], x_hi[0], x_lo[1], x_hi[1], cospi_2_64, cospi_30_64,
+      &s64_lo[2 * 0], &s64_hi[2 * 0], &s64_lo[2 * 1], &s64_hi[2 * 1]);
+  // s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+  // s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+  butterfly_two_coeff_s32_s64_noround(
+      x_lo[2], x_hi[2], x_lo[3], x_hi[3], cospi_10_64, cospi_22_64,
+      &s64_lo[2 * 2], &s64_hi[2 * 2], &s64_lo[2 * 3], &s64_hi[2 * 3]);
+
+  // s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+  // s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+  butterfly_two_coeff_s32_s64_noround(
+      x_lo[4], x_hi[4], x_lo[5], x_hi[5], cospi_18_64, cospi_14_64,
+      &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]);
+
+  // s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
+  // s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
+  butterfly_two_coeff_s32_s64_noround(
+      x_lo[6], x_hi[6], x_lo[7], x_hi[7], cospi_26_64, cospi_6_64,
+      &s64_lo[2 * 6], &s64_hi[2 * 6], &s64_lo[2 * 7], &s64_hi[2 * 7]);
+
+  // fdct_round_shift, indices are doubled
+  t_lo[0] = add_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 4]);
+  t_hi[0] = add_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 4]);
+  t_lo[1] = add_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 5]);
+  t_hi[1] = add_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 5]);
+  t_lo[2] = add_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 6]);
+  t_hi[2] = add_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 6]);
+  t_lo[3] = add_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 7]);
+  t_hi[3] = add_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 7]);
+  t_lo[4] = sub_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 4]);
+  t_hi[4] = sub_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 4]);
+  t_lo[5] = sub_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 5]);
+  t_hi[5] = sub_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 5]);
+  t_lo[6] = sub_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 6]);
+  t_hi[6] = sub_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 6]);
+  t_lo[7] = sub_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 7]);
+  t_hi[7] = sub_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 7]);
+
+  // stage 2
+  s_lo[0] = t_lo[0];
+  s_hi[0] = t_hi[0];
+  s_lo[1] = t_lo[1];
+  s_hi[1] = t_hi[1];
+  s_lo[2] = t_lo[2];
+  s_hi[2] = t_hi[2];
+  s_lo[3] = t_lo[3];
+  s_hi[3] = t_hi[3];
+  // s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
+  // s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
+  butterfly_two_coeff_s32_s64_noround(
+      t_lo[4], t_hi[4], t_lo[5], t_hi[5], cospi_8_64, cospi_24_64,
+      &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]);
+
+  // s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
+  // s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
+  butterfly_two_coeff_s32_s64_noround(
+      t_lo[6], t_hi[6], t_lo[7], t_hi[7], -cospi_24_64, cospi_8_64,
+      &s64_lo[2 * 6], &s64_hi[2 * 6], &s64_lo[2 * 7], &s64_hi[2 * 7]);
+
+  // fdct_round_shift
+  // s0 + s2
+  t_lo[0] = add_s32_s64_narrow(s_lo[0], s_lo[2]);
+  t_hi[0] = add_s32_s64_narrow(s_hi[0], s_hi[2]);
+  // s0 - s2
+  t_lo[2] = sub_s32_s64_narrow(s_lo[0], s_lo[2]);
+  t_hi[2] = sub_s32_s64_narrow(s_hi[0], s_hi[2]);
+
+  // s1 + s3
+  t_lo[1] = add_s32_s64_narrow(s_lo[1], s_lo[3]);
+  t_hi[1] = add_s32_s64_narrow(s_hi[1], s_hi[3]);
+  // s1 - s3
+  t_lo[3] = sub_s32_s64_narrow(s_lo[1], s_lo[3]);
+  t_hi[3] = sub_s32_s64_narrow(s_hi[1], s_hi[3]);
+
+  // s4 + s6
+  t_lo[4] = add_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]);
+  t_hi[4] = add_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]);
+  // s4 - s6
+  t_lo[6] = sub_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]);
+  t_hi[6] = sub_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]);
+
+  // s5 + s7
+  t_lo[5] = add_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]);
+  t_hi[5] = add_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]);
+  // s5 - s7
+  t_lo[7] = sub_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]);
+  t_hi[7] = sub_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]);
+
+  // stage 3
+  // s2 = cospi_16_64 * (x2 + x3)
+  // s3 = cospi_16_64 * (x2 - x3)
+  butterfly_one_coeff_s32_fast(t_lo[2], t_hi[2], t_lo[3], t_hi[3], cospi_16_64,
+                               &s_lo[2], &s_hi[2], &s_lo[3], &s_hi[3]);
+
+  // s6 = cospi_16_64 * (x6 + x7)
+  // s7 = cospi_16_64 * (x6 - x7)
+  butterfly_one_coeff_s32_fast(t_lo[6], t_hi[6], t_lo[7], t_hi[7], cospi_16_64,
+                               &s_lo[6], &s_hi[6], &s_lo[7], &s_hi[7]);
+
+  // x0, x2, x4, x6 pass through
+  lo[0] = t_lo[0];
+  hi[0] = t_hi[0];
+  lo[2] = s_lo[6];
+  hi[2] = s_hi[6];
+  lo[4] = s_lo[3];
+  hi[4] = s_hi[3];
+  lo[6] = t_lo[5];
+  hi[6] = t_hi[5];
+
+  lo[1] = vnegq_s32(t_lo[4]);
+  hi[1] = vnegq_s32(t_hi[4]);
+  lo[3] = vnegq_s32(s_lo[2]);
+  hi[3] = vnegq_s32(s_hi[2]);
+  lo[5] = vnegq_s32(s_lo[7]);
+  hi[5] = vnegq_s32(s_hi[7]);
+  lo[7] = vnegq_s32(t_lo[1]);
+  hi[7] = vnegq_s32(t_hi[1]);
+
+  transpose_s32_8x8_2(lo, hi, lo, hi);
+}
+
+void vp9_highbd_fht8x8_neon(const int16_t *input, tran_low_t *output,
+                            int stride, int tx_type) {
+  int32x4_t lo[8], hi[8];
+
+  switch (tx_type) {
+    case DCT_DCT: vpx_highbd_fdct8x8_neon(input, output, stride); break;
+    case ADST_DCT:
+      highbd_load_buffer_8x8(input, lo, hi, stride);
+      highbd_fadst8x8_neon(lo, hi);
+      // pass1 variant is not precise enough
+      vpx_highbd_fdct8x8_pass2_neon(lo, hi);
+      highbd_right_shift_8x8(lo, hi, 1);
+      highbd_write_buffer_8x8(output, lo, hi, 8);
+      break;
+    case DCT_ADST:
+      highbd_load_buffer_8x8(input, lo, hi, stride);
+      // pass1 variant is not precise enough
+      vpx_highbd_fdct8x8_pass2_neon(lo, hi);
+      highbd_fadst8x8_neon(lo, hi);
+      highbd_right_shift_8x8(lo, hi, 1);
+      highbd_write_buffer_8x8(output, lo, hi, 8);
+      break;
+    default:
+      assert(tx_type == ADST_ADST);
+      highbd_load_buffer_8x8(input, lo, hi, stride);
+      highbd_fadst8x8_neon(lo, hi);
+      highbd_fadst8x8_neon(lo, hi);
+      highbd_right_shift_8x8(lo, hi, 1);
+      highbd_write_buffer_8x8(output, lo, hi, 8);
+      break;
+  }
+}
+
+static INLINE void highbd_load_buffer_16x16(
+    const int16_t *input, int32x4_t *left1 /*[16]*/, int32x4_t *right1 /*[16]*/,
+    int32x4_t *left2 /*[16]*/, int32x4_t *right2 /*[16]*/, int stride) {
+  // load first 8 columns
+  highbd_load_buffer_8x8(input, left1, right1, stride);
+  highbd_load_buffer_8x8(input + 8 * stride, left1 + 8, right1 + 8, stride);
+
+  input += 8;
+  // load second 8 columns
+  highbd_load_buffer_8x8(input, left2, right2, stride);
+  highbd_load_buffer_8x8(input + 8 * stride, left2 + 8, right2 + 8, stride);
+}
+
+static INLINE void highbd_write_buffer_16x16(
+    tran_low_t *output, int32x4_t *left1 /*[16]*/, int32x4_t *right1 /*[16]*/,
+    int32x4_t *left2 /*[16]*/, int32x4_t *right2 /*[16]*/, int stride) {
+  // write first 8 columns
+  highbd_write_buffer_8x8(output, left1, right1, stride);
+  highbd_write_buffer_8x8(output + 8 * stride, left1 + 8, right1 + 8, stride);
+
+  // write second 8 columns
+  output += 8;
+  highbd_write_buffer_8x8(output, left2, right2, stride);
+  highbd_write_buffer_8x8(output + 8 * stride, left2 + 8, right2 + 8, stride);
+}
+
+static INLINE void highbd_right_shift_16x16(int32x4_t *left1 /*[16]*/,
+                                            int32x4_t *right1 /*[16]*/,
+                                            int32x4_t *left2 /*[16]*/,
+                                            int32x4_t *right2 /*[16]*/,
+                                            const int bit) {
+  // perform rounding operations
+  highbd_right_shift_8x8(left1, right1, bit);
+  highbd_right_shift_8x8(left1 + 8, right1 + 8, bit);
+  highbd_right_shift_8x8(left2, right2, bit);
+  highbd_right_shift_8x8(left2 + 8, right2 + 8, bit);
+}
+
+static void highbd_fdct16_8col(int32x4_t *left, int32x4_t *right) {
+  // perform 16x16 1-D DCT for 8 columns
+  int32x4_t s1_lo[8], s1_hi[8], s2_lo[8], s2_hi[8], s3_lo[8], s3_hi[8];
+  int32x4_t left8[8], right8[8];
+
+  // stage 1
+  left8[0] = vaddq_s32(left[0], left[15]);
+  right8[0] = vaddq_s32(right[0], right[15]);
+  left8[1] = vaddq_s32(left[1], left[14]);
+  right8[1] = vaddq_s32(right[1], right[14]);
+  left8[2] = vaddq_s32(left[2], left[13]);
+  right8[2] = vaddq_s32(right[2], right[13]);
+  left8[3] = vaddq_s32(left[3], left[12]);
+  right8[3] = vaddq_s32(right[3], right[12]);
+  left8[4] = vaddq_s32(left[4], left[11]);
+  right8[4] = vaddq_s32(right[4], right[11]);
+  left8[5] = vaddq_s32(left[5], left[10]);
+  right8[5] = vaddq_s32(right[5], right[10]);
+  left8[6] = vaddq_s32(left[6], left[9]);
+  right8[6] = vaddq_s32(right[6], right[9]);
+  left8[7] = vaddq_s32(left[7], left[8]);
+  right8[7] = vaddq_s32(right[7], right[8]);
+
+  // step 1
+  s1_lo[0] = vsubq_s32(left[7], left[8]);
+  s1_hi[0] = vsubq_s32(right[7], right[8]);
+  s1_lo[1] = vsubq_s32(left[6], left[9]);
+  s1_hi[1] = vsubq_s32(right[6], right[9]);
+  s1_lo[2] = vsubq_s32(left[5], left[10]);
+  s1_hi[2] = vsubq_s32(right[5], right[10]);
+  s1_lo[3] = vsubq_s32(left[4], left[11]);
+  s1_hi[3] = vsubq_s32(right[4], right[11]);
+  s1_lo[4] = vsubq_s32(left[3], left[12]);
+  s1_hi[4] = vsubq_s32(right[3], right[12]);
+  s1_lo[5] = vsubq_s32(left[2], left[13]);
+  s1_hi[5] = vsubq_s32(right[2], right[13]);
+  s1_lo[6] = vsubq_s32(left[1], left[14]);
+  s1_hi[6] = vsubq_s32(right[1], right[14]);
+  s1_lo[7] = vsubq_s32(left[0], left[15]);
+  s1_hi[7] = vsubq_s32(right[0], right[15]);
+
+  // pass1 variant is not accurate enough
+  vpx_highbd_fdct8x8_pass2_notranspose_neon(left8, right8);
+
+  // step 2
+  // step2[2] = (step1[5] - step1[2]) * cospi_16_64;
+  // step2[5] = (step1[5] + step1[2]) * cospi_16_64;
+  butterfly_one_coeff_s32_s64_narrow(s1_lo[5], s1_hi[5], s1_lo[2], s1_hi[2],
+                                     cospi_16_64, &s2_lo[5], &s2_hi[5],
+                                     &s2_lo[2], &s2_hi[2]);
+  // step2[3] = (step1[4] - step1[3]) * cospi_16_64;
+  // step2[4] = (step1[4] + step1[3]) * cospi_16_64;
+  butterfly_one_coeff_s32_s64_narrow(s1_lo[4], s1_hi[4], s1_lo[3], s1_hi[3],
+                                     cospi_16_64, &s2_lo[4], &s2_hi[4],
+                                     &s2_lo[3], &s2_hi[3]);
+
+  // step 3
+  s3_lo[0] = vaddq_s32(s1_lo[0], s2_lo[3]);
+  s3_hi[0] = vaddq_s32(s1_hi[0], s2_hi[3]);
+  s3_lo[1] = vaddq_s32(s1_lo[1], s2_lo[2]);
+  s3_hi[1] = vaddq_s32(s1_hi[1], s2_hi[2]);
+  s3_lo[2] = vsubq_s32(s1_lo[1], s2_lo[2]);
+  s3_hi[2] = vsubq_s32(s1_hi[1], s2_hi[2]);
+  s3_lo[3] = vsubq_s32(s1_lo[0], s2_lo[3]);
+  s3_hi[3] = vsubq_s32(s1_hi[0], s2_hi[3]);
+  s3_lo[4] = vsubq_s32(s1_lo[7], s2_lo[4]);
+  s3_hi[4] = vsubq_s32(s1_hi[7], s2_hi[4]);
+  s3_lo[5] = vsubq_s32(s1_lo[6], s2_lo[5]);
+  s3_hi[5] = vsubq_s32(s1_hi[6], s2_hi[5]);
+  s3_lo[6] = vaddq_s32(s1_lo[6], s2_lo[5]);
+  s3_hi[6] = vaddq_s32(s1_hi[6], s2_hi[5]);
+  s3_lo[7] = vaddq_s32(s1_lo[7], s2_lo[4]);
+  s3_hi[7] = vaddq_s32(s1_hi[7], s2_hi[4]);
+
+  // step 4
+  // s2[1] = cospi_24_64 * s3[6] - cospi_8_64 * s3[1]
+  // s2[6] = cospi_8_64 * s3[6]  + cospi_24_64 * s3[1]
+  butterfly_two_coeff_s32_s64_narrow(s3_lo[6], s3_hi[6], s3_lo[1], s3_hi[1],
+                                     cospi_8_64, cospi_24_64, &s2_lo[6],
+                                     &s2_hi[6], &s2_lo[1], &s2_hi[1]);
+
+  // s2[5] =  cospi_8_64 * s3[2] - cospi_24_64 * s3[5]
+  // s2[2] = cospi_24_64 * s3[2] + cospi_8_64 * s3[5]
+  butterfly_two_coeff_s32_s64_narrow(s3_lo[2], s3_hi[2], s3_lo[5], s3_hi[5],
+                                     cospi_24_64, cospi_8_64, &s2_lo[2],
+                                     &s2_hi[2], &s2_lo[5], &s2_hi[5]);
+
+  // step 5
+  s1_lo[0] = vaddq_s32(s3_lo[0], s2_lo[1]);
+  s1_hi[0] = vaddq_s32(s3_hi[0], s2_hi[1]);
+  s1_lo[1] = vsubq_s32(s3_lo[0], s2_lo[1]);
+  s1_hi[1] = vsubq_s32(s3_hi[0], s2_hi[1]);
+  s1_lo[2] = vaddq_s32(s3_lo[3], s2_lo[2]);
+  s1_hi[2] = vaddq_s32(s3_hi[3], s2_hi[2]);
+  s1_lo[3] = vsubq_s32(s3_lo[3], s2_lo[2]);
+  s1_hi[3] = vsubq_s32(s3_hi[3], s2_hi[2]);
+  s1_lo[4] = vsubq_s32(s3_lo[4], s2_lo[5]);
+  s1_hi[4] = vsubq_s32(s3_hi[4], s2_hi[5]);
+  s1_lo[5] = vaddq_s32(s3_lo[4], s2_lo[5]);
+  s1_hi[5] = vaddq_s32(s3_hi[4], s2_hi[5]);
+  s1_lo[6] = vsubq_s32(s3_lo[7], s2_lo[6]);
+  s1_hi[6] = vsubq_s32(s3_hi[7], s2_hi[6]);
+  s1_lo[7] = vaddq_s32(s3_lo[7], s2_lo[6]);
+  s1_hi[7] = vaddq_s32(s3_hi[7], s2_hi[6]);
+
+  // step 6
+  // out[1]  = step1[7] * cospi_2_64 + step1[0] * cospi_30_64
+  // out[15] = step1[7] * cospi_30_64 - step1[0] * cospi_2_64
+  butterfly_two_coeff_s32_s64_narrow(s1_lo[7], s1_hi[7], s1_lo[0], s1_hi[0],
+                                     cospi_2_64, cospi_30_64, &left[1],
+                                     &right[1], &left[15], &right[15]);
+
+  // out[9] = step1[6] * cospi_18_64 + step1[1] * cospi_14_64
+  // out[7] = step1[6] * cospi_14_64 - step1[1] * cospi_18_64
+  butterfly_two_coeff_s32_s64_narrow(s1_lo[6], s1_hi[6], s1_lo[1], s1_hi[1],
+                                     cospi_18_64, cospi_14_64, &left[9],
+                                     &right[9], &left[7], &right[7]);
+
+  // out[5]  = step1[5] * cospi_10_64 + step1[2] * cospi_22_64
+  // out[11] = step1[5] * cospi_22_64 - step1[2] * cospi_10_64
+  butterfly_two_coeff_s32_s64_narrow(s1_lo[5], s1_hi[5], s1_lo[2], s1_hi[2],
+                                     cospi_10_64, cospi_22_64, &left[5],
+                                     &right[5], &left[11], &right[11]);
+
+  // out[13] = step1[4] * cospi_26_64 + step1[3] * cospi_6_64
+  // out[3]  = step1[4] * cospi_6_64  - step1[3] * cospi_26_64
+  butterfly_two_coeff_s32_s64_narrow(s1_lo[4], s1_hi[4], s1_lo[3], s1_hi[3],
+                                     cospi_26_64, cospi_6_64, &left[13],
+                                     &right[13], &left[3], &right[3]);
+
+  left[0] = left8[0];
+  right[0] = right8[0];
+  left[2] = left8[1];
+  right[2] = right8[1];
+  left[4] = left8[2];
+  right[4] = right8[2];
+  left[6] = left8[3];
+  right[6] = right8[3];
+  left[8] = left8[4];
+  right[8] = right8[4];
+  left[10] = left8[5];
+  right[10] = right8[5];
+  left[12] = left8[6];
+  right[12] = right8[6];
+  left[14] = left8[7];
+  right[14] = right8[7];
+}
+
+static void highbd_fadst16_8col(int32x4_t *left, int32x4_t *right) {
+  // perform 16x16 1-D ADST for 8 columns
+  int32x4_t x_lo[16], x_hi[16];
+  int32x4_t s_lo[16], s_hi[16];
+  int32x4_t t_lo[16], t_hi[16];
+  int64x2_t s64_lo[32], s64_hi[32];
+
+  x_lo[0] = left[15];
+  x_hi[0] = right[15];
+  x_lo[1] = left[0];
+  x_hi[1] = right[0];
+  x_lo[2] = left[13];
+  x_hi[2] = right[13];
+  x_lo[3] = left[2];
+  x_hi[3] = right[2];
+  x_lo[4] = left[11];
+  x_hi[4] = right[11];
+  x_lo[5] = left[4];
+  x_hi[5] = right[4];
+  x_lo[6] = left[9];
+  x_hi[6] = right[9];
+  x_lo[7] = left[6];
+  x_hi[7] = right[6];
+  x_lo[8] = left[7];
+  x_hi[8] = right[7];
+  x_lo[9] = left[8];
+  x_hi[9] = right[8];
+  x_lo[10] = left[5];
+  x_hi[10] = right[5];
+  x_lo[11] = left[10];
+  x_hi[11] = right[10];
+  x_lo[12] = left[3];
+  x_hi[12] = right[3];
+  x_lo[13] = left[12];
+  x_hi[13] = right[12];
+  x_lo[14] = left[1];
+  x_hi[14] = right[1];
+  x_lo[15] = left[14];
+  x_hi[15] = right[14];
+
+  // stage 1, indices are doubled
+  // s0 = cospi_1_64 * x0 + cospi_31_64 * x1;
+  // s1 = cospi_31_64 * x0 - cospi_1_64 * x1;
+  butterfly_two_coeff_s32_s64_noround(
+      x_lo[0], x_hi[0], x_lo[1], x_hi[1], cospi_1_64, cospi_31_64,
+      &s64_lo[2 * 0], &s64_hi[2 * 0], &s64_lo[2 * 1], &s64_hi[2 * 1]);
+  // s2 = cospi_5_64 * x2 + cospi_27_64 * x3;
+  // s3 = cospi_27_64 * x2 - cospi_5_64 * x3;
+  butterfly_two_coeff_s32_s64_noround(
+      x_lo[2], x_hi[2], x_lo[3], x_hi[3], cospi_5_64, cospi_27_64,
+      &s64_lo[2 * 2], &s64_hi[2 * 2], &s64_lo[2 * 3], &s64_hi[2 * 3]);
+  // s4 = cospi_9_64 * x4 + cospi_23_64 * x5;
+  // s5 = cospi_23_64 * x4 - cospi_9_64 * x5;
+  butterfly_two_coeff_s32_s64_noround(
+      x_lo[4], x_hi[4], x_lo[5], x_hi[5], cospi_9_64, cospi_23_64,
+      &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]);
+  // s6 = cospi_13_64 * x6 + cospi_19_64 * x7;
+  // s7 = cospi_19_64 * x6 - cospi_13_64 * x7;
+  butterfly_two_coeff_s32_s64_noround(
+      x_lo[6], x_hi[6], x_lo[7], x_hi[7], cospi_13_64, cospi_19_64,
+      &s64_lo[2 * 6], &s64_hi[2 * 6], &s64_lo[2 * 7], &s64_hi[2 * 7]);
+  // s8 = cospi_17_64 * x8 + cospi_15_64 * x9;
+  // s9 = cospi_15_64 * x8 - cospi_17_64 * x9;
+  butterfly_two_coeff_s32_s64_noround(
+      x_lo[8], x_hi[8], x_lo[9], x_hi[9], cospi_17_64, cospi_15_64,
+      &s64_lo[2 * 8], &s64_hi[2 * 8], &s64_lo[2 * 9], &s64_hi[2 * 9]);
+  // s10 = cospi_21_64 * x10 + cospi_11_64 * x11;
+  // s11 = cospi_11_64 * x10 - cospi_21_64 * x11;
+  butterfly_two_coeff_s32_s64_noround(
+      x_lo[10], x_hi[10], x_lo[11], x_hi[11], cospi_21_64, cospi_11_64,
+      &s64_lo[2 * 10], &s64_hi[2 * 10], &s64_lo[2 * 11], &s64_hi[2 * 11]);
+  // s12 = cospi_25_64 * x12 + cospi_7_64 * x13;
+  // s13 = cospi_7_64 * x12 - cospi_25_64 * x13;
+  butterfly_two_coeff_s32_s64_noround(
+      x_lo[12], x_hi[12], x_lo[13], x_hi[13], cospi_25_64, cospi_7_64,
+      &s64_lo[2 * 12], &s64_hi[2 * 12], &s64_lo[2 * 13], &s64_hi[2 * 13]);
+  // s14 = cospi_29_64 * x14 + cospi_3_64 * x15;
+  // s15 = cospi_3_64 * x14 - cospi_29_64 * x15;
+  butterfly_two_coeff_s32_s64_noround(
+      x_lo[14], x_hi[14], x_lo[15], x_hi[15], cospi_29_64, cospi_3_64,
+      &s64_lo[2 * 14], &s64_hi[2 * 14], &s64_lo[2 * 15], &s64_hi[2 * 15]);
+
+  // fdct_round_shift, indices are doubled
+  t_lo[0] = add_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 8]);
+  t_hi[0] = add_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 8]);
+  t_lo[1] = add_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 9]);
+  t_hi[1] = add_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 9]);
+  t_lo[2] = add_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 10]);
+  t_hi[2] = add_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 10]);
+  t_lo[3] = add_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 11]);
+  t_hi[3] = add_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 11]);
+  t_lo[4] = add_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 12]);
+  t_hi[4] = add_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 12]);
+  t_lo[5] = add_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 13]);
+  t_hi[5] = add_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 13]);
+  t_lo[6] = add_s64_round_narrow(&s64_lo[2 * 6], &s64_lo[2 * 14]);
+  t_hi[6] = add_s64_round_narrow(&s64_hi[2 * 6], &s64_hi[2 * 14]);
+  t_lo[7] = add_s64_round_narrow(&s64_lo[2 * 7], &s64_lo[2 * 15]);
+  t_hi[7] = add_s64_round_narrow(&s64_hi[2 * 7], &s64_hi[2 * 15]);
+  t_lo[8] = sub_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 8]);
+  t_hi[8] = sub_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 8]);
+  t_lo[9] = sub_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 9]);
+  t_hi[9] = sub_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 9]);
+  t_lo[10] = sub_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 10]);
+  t_hi[10] = sub_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 10]);
+  t_lo[11] = sub_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 11]);
+  t_hi[11] = sub_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 11]);
+  t_lo[12] = sub_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 12]);
+  t_hi[12] = sub_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 12]);
+  t_lo[13] = sub_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 13]);
+  t_hi[13] = sub_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 13]);
+  t_lo[14] = sub_s64_round_narrow(&s64_lo[2 * 6], &s64_lo[2 * 14]);
+  t_hi[14] = sub_s64_round_narrow(&s64_hi[2 * 6], &s64_hi[2 * 14]);
+  t_lo[15] = sub_s64_round_narrow(&s64_lo[2 * 7], &s64_lo[2 * 15]);
+  t_hi[15] = sub_s64_round_narrow(&s64_hi[2 * 7], &s64_hi[2 * 15]);
+
+  // stage 2
+  s_lo[0] = t_lo[0];
+  s_hi[0] = t_hi[0];
+  s_lo[1] = t_lo[1];
+  s_hi[1] = t_hi[1];
+  s_lo[2] = t_lo[2];
+  s_hi[2] = t_hi[2];
+  s_lo[3] = t_lo[3];
+  s_hi[3] = t_hi[3];
+  s_lo[4] = t_lo[4];
+  s_hi[4] = t_hi[4];
+  s_lo[5] = t_lo[5];
+  s_hi[5] = t_hi[5];
+  s_lo[6] = t_lo[6];
+  s_hi[6] = t_hi[6];
+  s_lo[7] = t_lo[7];
+  s_hi[7] = t_hi[7];
+  // s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
+  // s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
+  butterfly_two_coeff_s32_s64_noround(
+      t_lo[8], t_hi[8], t_lo[9], t_hi[9], cospi_4_64, cospi_28_64,
+      &s64_lo[2 * 8], &s64_hi[2 * 8], &s64_lo[2 * 9], &s64_hi[2 * 9]);
+  // s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
+  // s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
+  butterfly_two_coeff_s32_s64_noround(
+      t_lo[10], t_hi[10], t_lo[11], t_hi[11], cospi_20_64, cospi_12_64,
+      &s64_lo[2 * 10], &s64_hi[2 * 10], &s64_lo[2 * 11], &s64_hi[2 * 11]);
+  // s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
+  // s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
+  butterfly_two_coeff_s32_s64_noround(
+      t_lo[13], t_hi[13], t_lo[12], t_hi[12], cospi_28_64, cospi_4_64,
+      &s64_lo[2 * 13], &s64_hi[2 * 13], &s64_lo[2 * 12], &s64_hi[2 * 12]);
+  // s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
+  // s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+  butterfly_two_coeff_s32_s64_noround(
+      t_lo[15], t_hi[15], t_lo[14], t_hi[14], cospi_12_64, cospi_20_64,
+      &s64_lo[2 * 15], &s64_hi[2 * 15], &s64_lo[2 * 14], &s64_hi[2 * 14]);
+
+  // s0 + s4
+  t_lo[0] = add_s32_s64_narrow(s_lo[0], s_lo[4]);
+  t_hi[0] = add_s32_s64_narrow(s_hi[0], s_hi[4]);
+  // s1 + s5
+  t_lo[1] = add_s32_s64_narrow(s_lo[1], s_lo[5]);
+  t_hi[1] = add_s32_s64_narrow(s_hi[1], s_hi[5]);
+  // s2 + s6
+  t_lo[2] = add_s32_s64_narrow(s_lo[2], s_lo[6]);
+  t_hi[2] = add_s32_s64_narrow(s_hi[2], s_hi[6]);
+  // s3 + s7
+  t_lo[3] = add_s32_s64_narrow(s_lo[3], s_lo[7]);
+  t_hi[3] = add_s32_s64_narrow(s_hi[3], s_hi[7]);
+
+  // s0 - s4
+  t_lo[4] = sub_s32_s64_narrow(s_lo[0], s_lo[4]);
+  t_hi[4] = sub_s32_s64_narrow(s_hi[0], s_hi[4]);
+  // s1 - s5
+  t_lo[5] = sub_s32_s64_narrow(s_lo[1], s_lo[5]);
+  t_hi[5] = sub_s32_s64_narrow(s_hi[1], s_hi[5]);
+  // s2 - s6
+  t_lo[6] = sub_s32_s64_narrow(s_lo[2], s_lo[6]);
+  t_hi[6] = sub_s32_s64_narrow(s_hi[2], s_hi[6]);
+  // s3 - s7
+  t_lo[7] = sub_s32_s64_narrow(s_lo[3], s_lo[7]);
+  t_hi[7] = sub_s32_s64_narrow(s_hi[3], s_hi[7]);
+
+  // fdct_round_shift()
+  // s8 + s12
+  t_lo[8] = add_s64_round_narrow(&s64_lo[2 * 8], &s64_lo[2 * 12]);
+  t_hi[8] = add_s64_round_narrow(&s64_hi[2 * 8], &s64_hi[2 * 12]);
+  // s9 + s13
+  t_lo[9] = add_s64_round_narrow(&s64_lo[2 * 9], &s64_lo[2 * 13]);
+  t_hi[9] = add_s64_round_narrow(&s64_hi[2 * 9], &s64_hi[2 * 13]);
+  // s10 + s14
+  t_lo[10] = add_s64_round_narrow(&s64_lo[2 * 10], &s64_lo[2 * 14]);
+  t_hi[10] = add_s64_round_narrow(&s64_hi[2 * 10], &s64_hi[2 * 14]);
+  // s11 + s15
+  t_lo[11] = add_s64_round_narrow(&s64_lo[2 * 11], &s64_lo[2 * 15]);
+  t_hi[11] = add_s64_round_narrow(&s64_hi[2 * 11], &s64_hi[2 * 15]);
+
+  // s8 - s12
+  t_lo[12] = sub_s64_round_narrow(&s64_lo[2 * 8], &s64_lo[2 * 12]);
+  t_hi[12] = sub_s64_round_narrow(&s64_hi[2 * 8], &s64_hi[2 * 12]);
+  // s9 - s13
+  t_lo[13] = sub_s64_round_narrow(&s64_lo[2 * 9], &s64_lo[2 * 13]);
+  t_hi[13] = sub_s64_round_narrow(&s64_hi[2 * 9], &s64_hi[2 * 13]);
+  // s10 - s14
+  t_lo[14] = sub_s64_round_narrow(&s64_lo[2 * 10], &s64_lo[2 * 14]);
+  t_hi[14] = sub_s64_round_narrow(&s64_hi[2 * 10], &s64_hi[2 * 14]);
+  // s11 - s15
+  t_lo[15] = sub_s64_round_narrow(&s64_lo[2 * 11], &s64_lo[2 * 15]);
+  t_hi[15] = sub_s64_round_narrow(&s64_hi[2 * 11], &s64_hi[2 * 15]);
+
+  // stage 3
+  s_lo[0] = t_lo[0];
+  s_hi[0] = t_hi[0];
+  s_lo[1] = t_lo[1];
+  s_hi[1] = t_hi[1];
+  s_lo[2] = t_lo[2];
+  s_hi[2] = t_hi[2];
+  s_lo[3] = t_lo[3];
+  s_hi[3] = t_hi[3];
+  // s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
+  // s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+  butterfly_two_coeff_s32_s64_noround(
+      t_lo[4], t_hi[4], t_lo[5], t_hi[5], cospi_8_64, cospi_24_64,
+      &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]);
+  // s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
+  // s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+  butterfly_two_coeff_s32_s64_noround(
+      t_lo[7], t_hi[7], t_lo[6], t_hi[6], cospi_24_64, cospi_8_64,
+      &s64_lo[2 * 7], &s64_hi[2 * 7], &s64_lo[2 * 6], &s64_hi[2 * 6]);
+  s_lo[8] = t_lo[8];
+  s_hi[8] = t_hi[8];
+  s_lo[9] = t_lo[9];
+  s_hi[9] = t_hi[9];
+  s_lo[10] = t_lo[10];
+  s_hi[10] = t_hi[10];
+  s_lo[11] = t_lo[11];
+  s_hi[11] = t_hi[11];
+  // s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
+  // s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+  butterfly_two_coeff_s32_s64_noround(
+      t_lo[12], t_hi[12], t_lo[13], t_hi[13], cospi_8_64, cospi_24_64,
+      &s64_lo[2 * 12], &s64_hi[2 * 12], &s64_lo[2 * 13], &s64_hi[2 * 13]);
+  // s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
+  // s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
+  butterfly_two_coeff_s32_s64_noround(
+      t_lo[15], t_hi[15], t_lo[14], t_hi[14], cospi_24_64, cospi_8_64,
+      &s64_lo[2 * 15], &s64_hi[2 * 15], &s64_lo[2 * 14], &s64_hi[2 * 14]);
+
+  // s0 + s2
+  t_lo[0] = add_s32_s64_narrow(s_lo[0], s_lo[2]);
+  t_hi[0] = add_s32_s64_narrow(s_hi[0], s_hi[2]);
+  // s1 + s3
+  t_lo[1] = add_s32_s64_narrow(s_lo[1], s_lo[3]);
+  t_hi[1] = add_s32_s64_narrow(s_hi[1], s_hi[3]);
+  // s0 - s2
+  t_lo[2] = sub_s32_s64_narrow(s_lo[0], s_lo[2]);
+  t_hi[2] = sub_s32_s64_narrow(s_hi[0], s_hi[2]);
+  // s1 - s3
+  t_lo[3] = sub_s32_s64_narrow(s_lo[1], s_lo[3]);
+  t_hi[3] = sub_s32_s64_narrow(s_hi[1], s_hi[3]);
+  // fdct_round_shift()
+  // s4 + s6
+  t_lo[4] = add_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]);
+  t_hi[4] = add_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]);
+  // s5 + s7
+  t_lo[5] = add_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]);
+  t_hi[5] = add_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]);
+  // s4 - s6
+  t_lo[6] = sub_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]);
+  t_hi[6] = sub_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]);
+  // s5 - s7
+  t_lo[7] = sub_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]);
+  t_hi[7] = sub_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]);
+  // s8 + s10
+  t_lo[8] = add_s32_s64_narrow(s_lo[8], s_lo[10]);
+  t_hi[8] = add_s32_s64_narrow(s_hi[8], s_hi[10]);
+  // s9 + s11
+  t_lo[9] = add_s32_s64_narrow(s_lo[9], s_lo[11]);
+  t_hi[9] = add_s32_s64_narrow(s_hi[9], s_hi[11]);
+  // s8 - s10
+  t_lo[10] = sub_s32_s64_narrow(s_lo[8], s_lo[10]);
+  t_hi[10] = sub_s32_s64_narrow(s_hi[8], s_hi[10]);
+  // s9 - s11
+  t_lo[11] = sub_s32_s64_narrow(s_lo[9], s_lo[11]);
+  t_hi[11] = sub_s32_s64_narrow(s_hi[9], s_hi[11]);
+  // fdct_round_shift()
+  // s12 + s14
+  t_lo[12] = add_s64_round_narrow(&s64_lo[2 * 12], &s64_lo[2 * 14]);
+  t_hi[12] = add_s64_round_narrow(&s64_hi[2 * 12], &s64_hi[2 * 14]);
+  // s13 + s15
+  t_lo[13] = add_s64_round_narrow(&s64_lo[2 * 13], &s64_lo[2 * 15]);
+  t_hi[13] = add_s64_round_narrow(&s64_hi[2 * 13], &s64_hi[2 * 15]);
+  // s12 - s14
+  t_lo[14] = sub_s64_round_narrow(&s64_lo[2 * 12], &s64_lo[2 * 14]);
+  t_hi[14] = sub_s64_round_narrow(&s64_hi[2 * 12], &s64_hi[2 * 14]);
+  // s13 - s15
+  t_lo[15] = sub_s64_round_narrow(&s64_lo[2 * 13], &s64_lo[2 * 15]);
+  t_hi[15] = sub_s64_round_narrow(&s64_hi[2 * 13], &s64_hi[2 * 15]);
+
+  // stage 4, with fdct_round_shift
+  // s2 = (-cospi_16_64) * (x2 + x3);
+  // s3 = cospi_16_64 * (x2 - x3);
+  butterfly_one_coeff_s32_s64_narrow(t_lo[3], t_hi[3], t_lo[2], t_hi[2],
+                                     -cospi_16_64, &x_lo[2], &x_hi[2], &x_lo[3],
+                                     &x_hi[3]);
+  // s6 = cospi_16_64 * (x6 + x7);
+  // s7 = cospi_16_64 * (-x6 + x7);
+  butterfly_one_coeff_s32_s64_narrow(t_lo[7], t_hi[7], t_lo[6], t_hi[6],
+                                     cospi_16_64, &x_lo[6], &x_hi[6], &x_lo[7],
+                                     &x_hi[7]);
+  // s10 = cospi_16_64 * (x10 + x11);
+  // s11 = cospi_16_64 * (-x10 + x11);
+  butterfly_one_coeff_s32_s64_narrow(t_lo[11], t_hi[11], t_lo[10], t_hi[10],
+                                     cospi_16_64, &x_lo[10], &x_hi[10],
+                                     &x_lo[11], &x_hi[11]);
+  // s14 = (-cospi_16_64) * (x14 + x15);
+  // s15 = cospi_16_64 * (x14 - x15);
+  butterfly_one_coeff_s32_s64_narrow(t_lo[15], t_hi[15], t_lo[14], t_hi[14],
+                                     -cospi_16_64, &x_lo[14], &x_hi[14],
+                                     &x_lo[15], &x_hi[15]);
+
+  // Just copy x0, x1, x4, x5, x8, x9, x12, x13
+  x_lo[0] = t_lo[0];
+  x_hi[0] = t_hi[0];
+  x_lo[1] = t_lo[1];
+  x_hi[1] = t_hi[1];
+  x_lo[4] = t_lo[4];
+  x_hi[4] = t_hi[4];
+  x_lo[5] = t_lo[5];
+  x_hi[5] = t_hi[5];
+  x_lo[8] = t_lo[8];
+  x_hi[8] = t_hi[8];
+  x_lo[9] = t_lo[9];
+  x_hi[9] = t_hi[9];
+  x_lo[12] = t_lo[12];
+  x_hi[12] = t_hi[12];
+  x_lo[13] = t_lo[13];
+  x_hi[13] = t_hi[13];
+
+  left[0] = x_lo[0];
+  right[0] = x_hi[0];
+  left[1] = vnegq_s32(x_lo[8]);
+  right[1] = vnegq_s32(x_hi[8]);
+  left[2] = x_lo[12];
+  right[2] = x_hi[12];
+  left[3] = vnegq_s32(x_lo[4]);
+  right[3] = vnegq_s32(x_hi[4]);
+  left[4] = x_lo[6];
+  right[4] = x_hi[6];
+  left[5] = x_lo[14];
+  right[5] = x_hi[14];
+  left[6] = x_lo[10];
+  right[6] = x_hi[10];
+  left[7] = x_lo[2];
+  right[7] = x_hi[2];
+  left[8] = x_lo[3];
+  right[8] = x_hi[3];
+  left[9] = x_lo[11];
+  right[9] = x_hi[11];
+  left[10] = x_lo[15];
+  right[10] = x_hi[15];
+  left[11] = x_lo[7];
+  right[11] = x_hi[7];
+  left[12] = x_lo[5];
+  right[12] = x_hi[5];
+  left[13] = vnegq_s32(x_lo[13]);
+  right[13] = vnegq_s32(x_hi[13]);
+  left[14] = x_lo[9];
+  right[14] = x_hi[9];
+  left[15] = vnegq_s32(x_lo[1]);
+  right[15] = vnegq_s32(x_hi[1]);
+}
+
+static void highbd_fdct16x16_neon(int32x4_t *left1, int32x4_t *right1,
+                                  int32x4_t *left2, int32x4_t *right2) {
+  // Left half.
+  highbd_fdct16_8col(left1, right1);
+  // Right half.
+  highbd_fdct16_8col(left2, right2);
+  transpose_s32_16x16(left1, right1, left2, right2);
+}
+
+static void highbd_fadst16x16_neon(int32x4_t *left1, int32x4_t *right1,
+                                   int32x4_t *left2, int32x4_t *right2) {
+  // Left half.
+  highbd_fadst16_8col(left1, right1);
+  // Right half.
+  highbd_fadst16_8col(left2, right2);
+  transpose_s32_16x16(left1, right1, left2, right2);
+}
+
+void vp9_highbd_fht16x16_neon(const int16_t *input, tran_low_t *output,
+                              int stride, int tx_type) {
+  int32x4_t left1[16], right1[16], left2[16], right2[16];
+
+  switch (tx_type) {
+    case DCT_DCT: vpx_highbd_fdct16x16_neon(input, output, stride); break;
+    case ADST_DCT:
+      highbd_load_buffer_16x16(input, left1, right1, left2, right2, stride);
+      highbd_fadst16x16_neon(left1, right1, left2, right2);
+      highbd_write_buffer_16x16(output, left1, right1, left2, right2, 16);
+      highbd_right_shift_16x16(left1, right1, left2, right2, 2);
+      highbd_fdct16x16_neon(left1, right1, left2, right2);
+      highbd_write_buffer_16x16(output, left1, right1, left2, right2, 16);
+      break;
+    case DCT_ADST:
+      highbd_load_buffer_16x16(input, left1, right1, left2, right2, stride);
+      highbd_fdct16x16_neon(left1, right1, left2, right2);
+      highbd_right_shift_16x16(left1, right1, left2, right2, 2);
+      highbd_fadst16x16_neon(left1, right1, left2, right2);
+      highbd_write_buffer_16x16(output, left1, right1, left2, right2, 16);
+      break;
+    default:
+      assert(tx_type == ADST_ADST);
+      highbd_load_buffer_16x16(input, left1, right1, left2, right2, stride);
+      highbd_fadst16x16_neon(left1, right1, left2, right2);
+      highbd_right_shift_16x16(left1, right1, left2, right2, 2);
+      highbd_fadst16x16_neon(left1, right1, left2, right2);
+      highbd_write_buffer_16x16(output, left1, right1, left2, right2, 16);
+      break;
+  }
+}
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/encoder/arm/neon/vp9_denoiser_neon.c b/vp9/encoder/arm/neon/vp9_denoiser_neon.c
index 4152e7bb5d..d631cd437d 100644
--- a/vp9/encoder/arm/neon/vp9_denoiser_neon.c
+++ b/vp9/encoder/arm/neon/vp9_denoiser_neon.c
@@ -21,6 +21,9 @@
 
 // Compute the sum of all pixel differences of this MB.
 static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) {
+#if VPX_ARCH_AARCH64
+  return vaddlvq_s8(v_sum_diff_total);
+#else
   const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff_total);
   const int32x4_t fedc_ba98_7654_3210 = vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
   const int64x2_t fedcba98_76543210 = vpaddlq_s32(fedc_ba98_7654_3210);
@@ -28,6 +31,7 @@ static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) {
                                 vget_low_s64(fedcba98_76543210));
   const int sum_diff = vget_lane_s32(vreinterpret_s32_s64(x), 0);
   return sum_diff;
+#endif
 }
 
 // Denoise a 16x1 vector.
diff --git a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c
new file mode 100644
index 0000000000..b82b3f9db5
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c
@@ -0,0 +1,296 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vpx_ports/mem.h"
+
+#ifdef __GNUC__
+#define LIKELY(v) __builtin_expect(v, 1)
+#define UNLIKELY(v) __builtin_expect(v, 0)
+#else
+#define LIKELY(v) (v)
+#define UNLIKELY(v) (v)
+#endif
+
+static INLINE int_mv pack_int_mv(int16_t row, int16_t col) {
+  int_mv result;
+  result.as_mv.row = row;
+  result.as_mv.col = col;
+  return result;
+}
+
+/*****************************************************************************
+ * This function utilizes 3 properties of the cost function lookup tables,   *
+ * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in       *
+ * vp9_encoder.c.                                                            *
+ * For the joint cost:                                                       *
+ *   - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3]           *
+ * For the component costs:                                                  *
+ *   - For all i: mvsadcost[0][i] == mvsadcost[1][i]                         *
+ *         (Equal costs for both components)                                 *
+ *   - For all i: mvsadcost[0][i] == mvsadcost[0][-i]                        *
+ *         (Cost function is even)                                           *
+ * If these do not hold, then this function cannot be used without           *
+ * modification, in which case you can revert to using the C implementation, *
+ * which does not rely on these properties.                                  *
+ *****************************************************************************/
+int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
+                                const search_site_config *cfg, MV *ref_mv,
+                                uint32_t start_mv_sad, MV *best_mv,
+                                int search_param, int sad_per_bit, int *num00,
+                                const vp9_sad_fn_ptr_t *sad_fn_ptr,
+                                const MV *center_mv) {
+  static const uint32_t data[4] = { 0, 1, 2, 3 };
+  const uint32x4_t v_idx_d = vld1q_u32((const uint32_t *)data);
+
+  const int32x4_t zero_s32 = vdupq_n_s32(0);
+  const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max);
+  const int16x8_t v_max_mv_w = vreinterpretq_s16_s32(vdupq_n_s32(maxmv.as_int));
+  const int_mv minmv = pack_int_mv(x->mv_limits.row_min, x->mv_limits.col_min);
+  const int16x8_t v_min_mv_w = vreinterpretq_s16_s32(vdupq_n_s32(minmv.as_int));
+
+  const int32x4_t v_spb_d = vdupq_n_s32(sad_per_bit);
+
+  const int32x4_t v_joint_cost_0_d = vdupq_n_s32(x->nmvjointsadcost[0]);
+  const int32x4_t v_joint_cost_1_d = vdupq_n_s32(x->nmvjointsadcost[1]);
+
+  // search_param determines the length of the initial step and hence the number
+  // of iterations.
+  // 0 = initial step (MAX_FIRST_STEP) pel
+  // 1 = (MAX_FIRST_STEP/2) pel,
+  // 2 = (MAX_FIRST_STEP/4) pel...
+  const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param];
+  const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param];
+  const int tot_steps = cfg->total_steps - search_param;
+
+  const int_mv fcenter_mv =
+      pack_int_mv(center_mv->row >> 3, center_mv->col >> 3);
+  const int16x8_t vfcmv = vreinterpretq_s16_s32(vdupq_n_s32(fcenter_mv.as_int));
+
+  const int ref_row = ref_mv->row;
+  const int ref_col = ref_mv->col;
+
+  int_mv bmv = pack_int_mv(ref_row, ref_col);
+  int_mv new_bmv = bmv;
+  int16x8_t v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int));
+
+  const int what_stride = x->plane[0].src.stride;
+  const int in_what_stride = x->e_mbd.plane[0].pre[0].stride;
+  const uint8_t *const what = x->plane[0].src.buf;
+  const uint8_t *const in_what =
+      x->e_mbd.plane[0].pre[0].buf + ref_row * in_what_stride + ref_col;
+
+  // Work out the start point for the search
+  const uint8_t *best_address = in_what;
+  const uint8_t *new_best_address = best_address;
+#if VPX_ARCH_AARCH64
+  int64x2_t v_ba_q = vdupq_n_s64((intptr_t)best_address);
+#else
+  int32x4_t v_ba_d = vdupq_n_s32((intptr_t)best_address);
+#endif
+  // Starting position
+  unsigned int best_sad = start_mv_sad;
+  int i, j, step;
+
+  // Check the prerequisite cost function properties that are easy to check
+  // in an assert. See the function-level documentation for details on all
+  // prerequisites.
+  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]);
+  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]);
+
+  *num00 = 0;
+
+  for (i = 0, step = 0; step < tot_steps; step++) {
+    for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) {
+      int16x8_t v_diff_mv_w;
+      int8x16_t v_inside_d;
+      uint32x4_t v_outside_d;
+      int32x4_t v_cost_d, v_sad_d;
+#if VPX_ARCH_AARCH64
+      int64x2_t v_blocka[2];
+#else
+      int32x4_t v_blocka[1];
+      uint32x2_t horiz_max_0, horiz_max_1;
+#endif
+
+      uint32_t horiz_max;
+      // Compute the candidate motion vectors
+      const int16x8_t v_ss_mv_w = vld1q_s16((const int16_t *)&ss_mv[i]);
+      const int16x8_t v_these_mv_w = vaddq_s16(v_bmv_w, v_ss_mv_w);
+      // Clamp them to the search bounds
+      int16x8_t v_these_mv_clamp_w = v_these_mv_w;
+      v_these_mv_clamp_w = vminq_s16(v_these_mv_clamp_w, v_max_mv_w);
+      v_these_mv_clamp_w = vmaxq_s16(v_these_mv_clamp_w, v_min_mv_w);
+      // The ones that did not change are inside the search area
+      v_inside_d = vreinterpretq_s8_u32(
+          vceqq_s32(vreinterpretq_s32_s16(v_these_mv_clamp_w),
+                    vreinterpretq_s32_s16(v_these_mv_w)));
+
+      // If none of them are inside, then move on
+#if VPX_ARCH_AARCH64
+      horiz_max = vmaxvq_u32(vreinterpretq_u32_s8(v_inside_d));
+#else
+      horiz_max_0 = vmax_u32(vget_low_u32(vreinterpretq_u32_s8(v_inside_d)),
+                             vget_high_u32(vreinterpretq_u32_s8(v_inside_d)));
+      horiz_max_1 = vpmax_u32(horiz_max_0, horiz_max_0);
+      vst1_lane_u32(&horiz_max, horiz_max_1, 0);
+#endif
+      if (LIKELY(horiz_max == 0)) {
+        continue;
+      }
+
+      // The inverse mask indicates which of the MVs are outside
+      v_outside_d =
+          vreinterpretq_u32_s8(veorq_s8(v_inside_d, vdupq_n_s8((int8_t)0xff)));
+      // Shift right to keep the sign bit clear, we will use this later
+      // to set the cost to the maximum value.
+      v_outside_d = vshrq_n_u32(v_outside_d, 1);
+
+      // Compute the difference MV
+      v_diff_mv_w = vsubq_s16(v_these_mv_clamp_w, vfcmv);
+      // We utilise the fact that the cost function is even, and use the
+      // absolute difference. This allows us to use unsigned indexes later
+      // and reduces cache pressure somewhat as only a half of the table
+      // is ever referenced.
+      v_diff_mv_w = vabsq_s16(v_diff_mv_w);
+
+      // Compute the SIMD pointer offsets.
+      {
+#if VPX_ARCH_AARCH64  //  sizeof(intptr_t) == 8
+        // Load the offsets
+        int64x2_t v_bo10_q = vld1q_s64((const int64_t *)&ss_os[i + 0]);
+        int64x2_t v_bo32_q = vld1q_s64((const int64_t *)&ss_os[i + 2]);
+        // Set the ones falling outside to zero
+        v_bo10_q = vandq_s64(
+            v_bo10_q,
+            vmovl_s32(vget_low_s32(vreinterpretq_s32_s8(v_inside_d))));
+        v_bo32_q = vandq_s64(
+            v_bo32_q,
+            vmovl_s32(vget_high_s32(vreinterpretq_s32_s8(v_inside_d))));
+        // Compute the candidate addresses
+        v_blocka[0] = vaddq_s64(v_ba_q, v_bo10_q);
+        v_blocka[1] = vaddq_s64(v_ba_q, v_bo32_q);
+#else  // sizeof(intptr_t) == 4
+        int32x4_t v_bo_d = vld1q_s32((const int32_t *)&ss_os[i]);
+        v_bo_d = vandq_s32(v_bo_d, vreinterpretq_s32_s8(v_inside_d));
+        v_blocka[0] = vaddq_s32(v_ba_d, v_bo_d);
+#endif
+      }
+
+      sad_fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0],
+                         in_what_stride, (uint32_t *)&v_sad_d);
+
+      // Look up the component cost of the residual motion vector
+      {
+        uint32_t cost[4];
+        DECLARE_ALIGNED(16, int16_t, rowcol[8]);
+        vst1q_s16(rowcol, v_diff_mv_w);
+
+        // Note: This is a use case for gather instruction
+        cost[0] = x->nmvsadcost[0][rowcol[0]] + x->nmvsadcost[0][rowcol[1]];
+        cost[1] = x->nmvsadcost[0][rowcol[2]] + x->nmvsadcost[0][rowcol[3]];
+        cost[2] = x->nmvsadcost[0][rowcol[4]] + x->nmvsadcost[0][rowcol[5]];
+        cost[3] = x->nmvsadcost[0][rowcol[6]] + x->nmvsadcost[0][rowcol[7]];
+
+        v_cost_d = vld1q_s32((int32_t *)cost);
+      }
+
+      // Now add in the joint cost
+      {
+        const uint32x4_t v_sel_d =
+            vceqq_s32(vreinterpretq_s32_s16(v_diff_mv_w), zero_s32);
+        const int32x4_t v_joint_cost_d = vreinterpretq_s32_u8(
+            vbslq_u8(vreinterpretq_u8_u32(v_sel_d),
+                     vreinterpretq_u8_s32(v_joint_cost_0_d),
+                     vreinterpretq_u8_s32(v_joint_cost_1_d)));
+        v_cost_d = vaddq_s32(v_cost_d, v_joint_cost_d);
+      }
+
+      // Multiply by sad_per_bit
+      v_cost_d = vmulq_s32(v_cost_d, v_spb_d);
+      // ROUND_POWER_OF_TWO(v_cost_d, VP9_PROB_COST_SHIFT)
+      v_cost_d =
+          vaddq_s32(v_cost_d, vdupq_n_s32(1 << (VP9_PROB_COST_SHIFT - 1)));
+      v_cost_d = vshrq_n_s32(v_cost_d, VP9_PROB_COST_SHIFT);
+      // Add the cost to the sad
+      v_sad_d = vaddq_s32(v_sad_d, v_cost_d);
+
+      // Make the motion vectors outside the search area have max cost
+      // by or'ing in the comparison mask, this way the minimum search won't
+      // pick them.
+      v_sad_d = vorrq_s32(v_sad_d, vreinterpretq_s32_u32(v_outside_d));
+
+      // Find the minimum value and index horizontally in v_sad_d
+      {
+        uint32_t local_best_sad;
+#if VPX_ARCH_AARCH64
+        local_best_sad = vminvq_u32(vreinterpretq_u32_s32(v_sad_d));
+#else
+        uint32x2_t horiz_min_0 =
+            vmin_u32(vget_low_u32(vreinterpretq_u32_s32(v_sad_d)),
+                     vget_high_u32(vreinterpretq_u32_s32(v_sad_d)));
+        uint32x2_t horiz_min_1 = vpmin_u32(horiz_min_0, horiz_min_0);
+        vst1_lane_u32(&local_best_sad, horiz_min_1, 0);
+#endif
+
+        // Update the global minimum if the local minimum is smaller
+        if (LIKELY(local_best_sad < best_sad)) {
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+          uint32_t local_best_idx;
+          const uint32x4_t v_sel_d =
+              vceqq_s32(v_sad_d, vdupq_n_s32(local_best_sad));
+          uint32x4_t v_mask_d = vandq_u32(v_sel_d, v_idx_d);
+          v_mask_d = vbslq_u32(v_sel_d, v_mask_d, vdupq_n_u32(0xffffffff));
+
+#if VPX_ARCH_AARCH64
+          local_best_idx = vminvq_u32(v_mask_d);
+#else
+          horiz_min_0 =
+              vmin_u32(vget_low_u32(v_mask_d), vget_high_u32(v_mask_d));
+          horiz_min_1 = vpmin_u32(horiz_min_0, horiz_min_0);
+          vst1_lane_u32(&local_best_idx, horiz_min_1, 0);
+#endif
+
+          new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx];
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+          new_best_address = ((const uint8_t **)v_blocka)[local_best_idx];
+
+          best_sad = local_best_sad;
+        }
+      }
+    }
+
+    bmv = new_bmv;
+    best_address = new_best_address;
+
+    v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int));
+#if VPX_ARCH_AARCH64
+    v_ba_q = vdupq_n_s64((intptr_t)best_address);
+#else
+    v_ba_d = vdupq_n_s32((intptr_t)best_address);
+#endif
+
+    if (UNLIKELY(best_address == in_what)) {
+      (*num00)++;
+    }
+  }
+
+  *best_mv = bmv.as_mv;
+  return best_sad;
+}
diff --git a/vp9/encoder/arm/neon/vp9_error_neon.c b/vp9/encoder/arm/neon/vp9_error_neon.c
index 1c7503139e..0cf0bf250e 100644
--- a/vp9/encoder/arm/neon/vp9_error_neon.c
+++ b/vp9/encoder/arm/neon/vp9_error_neon.c
@@ -12,30 +12,91 @@
 #include <assert.h>
 
 #include "./vp9_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
 
-int64_t vp9_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff,
-                                int block_size) {
-  int64x2_t error = vdupq_n_s64(0);
+int64_t vp9_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+                             intptr_t block_size, int64_t *ssz) {
+  uint64x2_t err_u64 = vdupq_n_u64(0);
+  int64x2_t ssz_s64 = vdupq_n_s64(0);
 
-  assert(block_size >= 8);
-  assert((block_size % 8) == 0);
+  assert(block_size >= 16);
+  assert((block_size % 16) == 0);
 
   do {
-    const int16x8_t c = vld1q_s16(coeff);
-    const int16x8_t d = vld1q_s16(dqcoeff);
-    const int16x8_t diff = vsubq_s16(c, d);
-    const int16x4_t diff_lo = vget_low_s16(diff);
-    const int16x4_t diff_hi = vget_high_s16(diff);
-    // diff is 15-bits, the squares 30, so we can store 2 in 31-bits before
+    uint32x4_t err;
+    int32x4_t ssz0, ssz1;
+
+    const int16x8_t c0 = load_tran_low_to_s16q(coeff);
+    const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8);
+    const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff);
+    const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8);
+
+    const uint16x8_t diff0 = vreinterpretq_u16_s16(vabdq_s16(c0, d0));
+    const uint16x8_t diff1 = vreinterpretq_u16_s16(vabdq_s16(c1, d1));
+
+    // diff is 15-bits, the squares 30, so we can store 4 in 32-bits before
     // accumulating them in 64-bits.
-    const int32x4_t err0 = vmull_s16(diff_lo, diff_lo);
-    const int32x4_t err1 = vmlal_s16(err0, diff_hi, diff_hi);
-    const int64x2_t err2 = vaddl_s32(vget_low_s32(err1), vget_high_s32(err1));
-    error = vaddq_s64(error, err2);
-    coeff += 8;
-    dqcoeff += 8;
-    block_size -= 8;
+    err = vmull_u16(vget_low_u16(diff0), vget_low_u16(diff0));
+    err = vmlal_u16(err, vget_high_u16(diff0), vget_high_u16(diff0));
+    err = vmlal_u16(err, vget_low_u16(diff1), vget_low_u16(diff1));
+    err = vmlal_u16(err, vget_high_u16(diff1), vget_high_u16(diff1));
+    err_u64 = vpadalq_u32(err_u64, err);
+
+    // We can't do the same here as we're operating on signed integers, so we
+    // can store 2 15-bit diff before accumulating into 64-bits.
+    ssz0 = vmull_s16(vget_low_s16(c0), vget_low_s16(c0));
+    ssz0 = vmlal_s16(ssz0, vget_high_s16(c0), vget_high_s16(c0));
+    ssz_s64 = vpadalq_s32(ssz_s64, ssz0);
+
+    ssz1 = vmull_s16(vget_low_s16(c1), vget_low_s16(c1));
+    ssz1 = vmlal_s16(ssz1, vget_high_s16(c1), vget_high_s16(c1));
+    ssz_s64 = vpadalq_s32(ssz_s64, ssz1);
+
+    coeff += 16;
+    dqcoeff += 16;
+    block_size -= 16;
+  } while (block_size != 0);
+
+  *ssz = horizontal_add_int64x2(ssz_s64);
+  return (int64_t)horizontal_add_uint64x2(err_u64);
+}
+
+int64_t vp9_block_error_fp_neon(const tran_low_t *coeff,
+                                const tran_low_t *dqcoeff, int block_size) {
+  uint64x2_t err_u64[2] = { vdupq_n_u64(0), vdupq_n_u64(0) };
+
+  assert(block_size >= 16);
+  assert((block_size % 16) == 0);
+
+  do {
+    uint32x4_t err0, err1;
+
+    const int16x8_t c0 = load_tran_low_to_s16q(coeff);
+    const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8);
+    const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff);
+    const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8);
+
+    const uint16x8_t diff0 = vreinterpretq_u16_s16(vabdq_s16(c0, d0));
+    const uint16x8_t diff1 = vreinterpretq_u16_s16(vabdq_s16(c1, d1));
+
+    // diff is 15-bits, the squares 30, so in theory we can store 4 in 32-bits
+    // before accumulating them in 64-bits. However splitting into 2 mull, mlal
+    // pairs is beneficial since it allows us to use both Neon
+    // multiply-accumulate pipes - on CPUs that have them - rather than having
+    // a single chain of 4 instructions executing serially.
+    err0 = vmull_u16(vget_low_u16(diff0), vget_low_u16(diff0));
+    err0 = vmlal_u16(err0, vget_high_u16(diff0), vget_high_u16(diff0));
+    err_u64[0] = vpadalq_u32(err_u64[0], err0);
+
+    err1 = vmull_u16(vget_low_u16(diff1), vget_low_u16(diff1));
+    err1 = vmlal_u16(err1, vget_high_u16(diff1), vget_high_u16(diff1));
+    err_u64[1] = vpadalq_u32(err_u64[1], err1);
+
+    coeff += 16;
+    dqcoeff += 16;
+    block_size -= 16;
   } while (block_size != 0);
 
-  return vgetq_lane_s64(error, 0) + vgetq_lane_s64(error, 1);
+  return horizontal_add_uint64x2(vaddq_u64(err_u64[0], err_u64[1]));
 }
diff --git a/vp9/encoder/arm/neon/vp9_frame_scale_neon.c b/vp9/encoder/arm/neon/vp9_frame_scale_neon.c
index e46f789bac..bc8dd4a341 100644
--- a/vp9/encoder/arm/neon/vp9_frame_scale_neon.c
+++ b/vp9/encoder/arm/neon/vp9_frame_scale_neon.c
@@ -14,6 +14,7 @@
 #include "./vpx_dsp_rtcd.h"
 #include "./vpx_scale_rtcd.h"
 #include "vp9/common/vp9_blockd.h"
+#include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/transpose_neon.h"
 #include "vpx_dsp/arm/vpx_convolve8_neon.h"
 #include "vpx_dsp/vpx_filter.h"
@@ -710,8 +711,8 @@ void vp9_scale_and_extend_frame_neon(const YV12_BUFFER_CONFIG *src,
   const int src_h = src->y_crop_height;
   const int dst_w = dst->y_crop_width;
   const int dst_h = dst->y_crop_height;
-  const int dst_uv_w = dst_w / 2;
-  const int dst_uv_h = dst_h / 2;
+  const int dst_uv_w = dst->uv_crop_width;
+  const int dst_uv_h = dst->uv_crop_height;
   int scaled = 0;
 
   // phase_scaler is usually 0 or 8.
diff --git a/vp9/encoder/arm/neon/vp9_highbd_error_neon.c b/vp9/encoder/arm/neon/vp9_highbd_error_neon.c
new file mode 100644
index 0000000000..d9b183472d
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_highbd_error_neon.c
@@ -0,0 +1,49 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+int64_t vp9_highbd_block_error_neon(const tran_low_t *coeff,
+                                    const tran_low_t *dqcoeff,
+                                    intptr_t block_size, int64_t *ssz, int bd) {
+  uint64x2_t err_u64 = vdupq_n_u64(0);
+  int64x2_t ssz_s64 = vdupq_n_s64(0);
+
+  const int shift = 2 * (bd - 8);
+  const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+
+  assert(block_size >= 16);
+  assert((block_size % 16) == 0);
+
+  do {
+    const int32x4_t c = load_tran_low_to_s32q(coeff);
+    const int32x4_t d = load_tran_low_to_s32q(dqcoeff);
+
+    const uint32x4_t diff = vreinterpretq_u32_s32(vabdq_s32(c, d));
+
+    err_u64 = vmlal_u32(err_u64, vget_low_u32(diff), vget_low_u32(diff));
+    err_u64 = vmlal_u32(err_u64, vget_high_u32(diff), vget_high_u32(diff));
+
+    ssz_s64 = vmlal_s32(ssz_s64, vget_low_s32(c), vget_low_s32(c));
+    ssz_s64 = vmlal_s32(ssz_s64, vget_high_s32(c), vget_high_s32(c));
+
+    coeff += 4;
+    dqcoeff += 4;
+    block_size -= 4;
+  } while (block_size != 0);
+
+  *ssz = (horizontal_add_int64x2(ssz_s64) + rounding) >> shift;
+  return ((int64_t)horizontal_add_uint64x2(err_u64) + rounding) >> shift;
+}
diff --git a/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_neon.c b/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_neon.c
new file mode 100644
index 0000000000..c3aef3c865
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_neon.c
@@ -0,0 +1,872 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
+#include "vp9/encoder/vp9_temporal_filter_constants.h"
+
+// Compute (a-b)**2 for 8 pixels with size 16-bit
+static INLINE void highbd_store_dist_8(const uint16_t *a, const uint16_t *b,
+                                       uint32_t *dst) {
+  const uint16x8_t a_reg = vld1q_u16(a);
+  const uint16x8_t b_reg = vld1q_u16(b);
+
+  uint16x8_t dist = vabdq_u16(a_reg, b_reg);
+  uint32x4_t dist_first = vmull_u16(vget_low_u16(dist), vget_low_u16(dist));
+  uint32x4_t dist_second = vmull_u16(vget_high_u16(dist), vget_high_u16(dist));
+
+  vst1q_u32(dst, dist_first);
+  vst1q_u32(dst + 4, dist_second);
+}
+
+// Sum up three neighboring distortions for the pixels
+static INLINE void highbd_get_sum_4(const uint32_t *dist, uint32x4_t *sum) {
+  uint32x4_t dist_reg, dist_left, dist_right;
+
+  dist_reg = vld1q_u32(dist);
+  dist_left = vld1q_u32(dist - 1);
+  dist_right = vld1q_u32(dist + 1);
+
+  *sum = vaddq_u32(dist_reg, dist_left);
+  *sum = vaddq_u32(*sum, dist_right);
+}
+
+static INLINE void highbd_get_sum_8(const uint32_t *dist, uint32x4_t *sum_first,
+                                    uint32x4_t *sum_second) {
+  highbd_get_sum_4(dist, sum_first);
+  highbd_get_sum_4(dist + 4, sum_second);
+}
+
+// Average the value based on the number of values summed (9 for pixels away
+// from the border, 4 for pixels in corners, and 6 for other edge values, plus
+// however many values from y/uv plane are).
+//
+// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply
+// by weight.
+static INLINE void highbd_average_4(uint32x4_t *output, const uint32x4_t sum,
+                                    const uint32x4_t *mul_constants,
+                                    const int strength, const int rounding,
+                                    const int weight) {
+  const int64x2_t strength_s64 = vdupq_n_s64(-strength - 32);
+  const uint64x2_t rounding_u64 = vdupq_n_u64((uint64_t)rounding << 32);
+  const uint32x4_t weight_u32 = vdupq_n_u32(weight);
+  const uint32x4_t sixteen = vdupq_n_u32(16);
+  uint32x4_t sum2;
+
+  // modifier * 3 / index;
+  uint64x2_t sum_lo =
+      vmlal_u32(rounding_u64, vget_low_u32(sum), vget_low_u32(*mul_constants));
+  uint64x2_t sum_hi = vmlal_u32(rounding_u64, vget_high_u32(sum),
+                                vget_high_u32(*mul_constants));
+
+  // we cannot use vshrn_n_u64 as strength is not known at compile time.
+  sum_lo = vshlq_u64(sum_lo, strength_s64);
+  sum_hi = vshlq_u64(sum_hi, strength_s64);
+
+  sum2 = vcombine_u32(vmovn_u64(sum_lo), vmovn_u64(sum_hi));
+
+  // Multiply with the weight
+  sum2 = vminq_u32(sum2, sixteen);
+  sum2 = vsubq_u32(sixteen, sum2);
+  *output = vmulq_u32(sum2, weight_u32);
+}
+
+static INLINE void highbd_average_8(uint32x4_t *output_0, uint32x4_t *output_1,
+                                    const uint32x4_t sum_0_u32,
+                                    const uint32x4_t sum_1_u32,
+                                    const uint32x4_t *mul_constants_0,
+                                    const uint32x4_t *mul_constants_1,
+                                    const int strength, const int rounding,
+                                    const int weight) {
+  highbd_average_4(output_0, sum_0_u32, mul_constants_0, strength, rounding,
+                   weight);
+  highbd_average_4(output_1, sum_1_u32, mul_constants_1, strength, rounding,
+                   weight);
+}
+
+// Add 'sum_u32' to 'count'. Multiply by 'pred' and add to 'accumulator.'
+static INLINE void highbd_accumulate_and_store_8(
+    const uint32x4_t sum_first_u32, const uint32x4_t sum_second_u32,
+    const uint16_t *pred, uint16_t *count, uint32_t *accumulator) {
+  const uint16x8_t sum_u16 =
+      vcombine_u16(vqmovn_u32(sum_first_u32), vqmovn_u32(sum_second_u32));
+  uint16x8_t pred_u16 = vld1q_u16(pred);
+  uint16x8_t count_u16 = vld1q_u16(count);
+  uint32x4_t pred_0_u32, pred_1_u32;
+  uint32x4_t accum_0_u32, accum_1_u32;
+
+  count_u16 = vqaddq_u16(count_u16, sum_u16);
+  vst1q_u16(count, count_u16);
+
+  accum_0_u32 = vld1q_u32(accumulator);
+  accum_1_u32 = vld1q_u32(accumulator + 4);
+
+  pred_0_u32 = vmovl_u16(vget_low_u16(pred_u16));
+  pred_1_u32 = vmovl_u16(vget_high_u16(pred_u16));
+
+  // Don't use sum_u16 as that produces different results to the C version
+  accum_0_u32 = vmlaq_u32(accum_0_u32, sum_first_u32, pred_0_u32);
+  accum_1_u32 = vmlaq_u32(accum_1_u32, sum_second_u32, pred_1_u32);
+
+  vst1q_u32(accumulator, accum_0_u32);
+  vst1q_u32(accumulator + 4, accum_1_u32);
+}
+
+static INLINE void highbd_read_dist_4(const uint32_t *dist,
+                                      uint32x4_t *dist_reg) {
+  *dist_reg = vld1q_u32(dist);
+}
+
+static INLINE void highbd_read_dist_8(const uint32_t *dist,
+                                      uint32x4_t *reg_first,
+                                      uint32x4_t *reg_second) {
+  highbd_read_dist_4(dist, reg_first);
+  highbd_read_dist_4(dist + 4, reg_second);
+}
+
+static INLINE void highbd_read_chroma_dist_row_8(
+    int ss_x, const uint32_t *u_dist, const uint32_t *v_dist,
+    uint32x4_t *u_first, uint32x4_t *u_second, uint32x4_t *v_first,
+    uint32x4_t *v_second) {
+  if (!ss_x) {
+    // If there is no chroma subsampling in the horizontal direction, then we
+    // need to load 8 entries from chroma.
+    highbd_read_dist_8(u_dist, u_first, u_second);
+    highbd_read_dist_8(v_dist, v_first, v_second);
+  } else {  // ss_x == 1
+    // Otherwise, we only need to load 8 entries
+    uint32x4_t u_reg, v_reg;
+    uint32x4x2_t pair;
+
+    highbd_read_dist_4(u_dist, &u_reg);
+
+    pair = vzipq_u32(u_reg, u_reg);
+    *u_first = pair.val[0];
+    *u_second = pair.val[1];
+
+    highbd_read_dist_4(v_dist, &v_reg);
+
+    pair = vzipq_u32(v_reg, v_reg);
+    *v_first = pair.val[0];
+    *v_second = pair.val[1];
+  }
+}
+
+static void highbd_apply_temporal_filter_luma_8(
+    const uint16_t *y_pre, int y_pre_stride, unsigned int block_width,
+    unsigned int block_height, int ss_x, int ss_y, int strength,
+    int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+    const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist,
+    const uint32_t *const *neighbors_first,
+    const uint32_t *const *neighbors_second, int top_weight,
+    int bottom_weight) {
+  const int rounding = (1 << strength) >> 1;
+  int weight = top_weight;
+
+  uint32x4_t mul_first, mul_second;
+
+  uint32x4_t sum_row_1_first, sum_row_1_second;
+  uint32x4_t sum_row_2_first, sum_row_2_second;
+  uint32x4_t sum_row_3_first, sum_row_3_second;
+
+  uint32x4_t u_first, u_second;
+  uint32x4_t v_first, v_second;
+
+  uint32x4_t sum_row_first;
+  uint32x4_t sum_row_second;
+
+  // Loop variables
+  unsigned int h;
+
+  assert(strength >= 4 && strength <= 14 &&
+         "invalid adjusted temporal filter strength");
+  assert(block_width == 8);
+
+  (void)block_width;
+
+  // First row
+  mul_first = vld1q_u32(neighbors_first[0]);
+  mul_second = vld1q_u32(neighbors_second[0]);
+
+  // Add luma values
+  highbd_get_sum_8(y_dist, &sum_row_2_first, &sum_row_2_second);
+  highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+  // We don't need to saturate here because the maximum value is UINT12_MAX ** 2
+  // * 9 ~= 2**24 * 9 < 2 ** 28 < INT32_MAX
+  sum_row_first = vaddq_u32(sum_row_2_first, sum_row_3_first);
+  sum_row_second = vaddq_u32(sum_row_2_second, sum_row_3_second);
+
+  // Add chroma values
+  highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
+                                &v_first, &v_second);
+
+  // Max value here is 2 ** 24 * (9 + 2), so no saturation is needed
+  sum_row_first = vaddq_u32(sum_row_first, u_first);
+  sum_row_second = vaddq_u32(sum_row_second, u_second);
+
+  sum_row_first = vaddq_u32(sum_row_first, v_first);
+  sum_row_second = vaddq_u32(sum_row_second, v_second);
+
+  // Get modifier and store result
+  highbd_average_8(&sum_row_first, &sum_row_second, sum_row_first,
+                   sum_row_second, &mul_first, &mul_second, strength, rounding,
+                   weight);
+
+  highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
+                                y_accum);
+
+  y_pre += y_pre_stride;
+  y_count += y_pre_stride;
+  y_accum += y_pre_stride;
+  y_dist += DIST_STRIDE;
+
+  u_dist += DIST_STRIDE;
+  v_dist += DIST_STRIDE;
+
+  // Then all the rows except the last one
+  mul_first = vld1q_u32(neighbors_first[1]);
+  mul_second = vld1q_u32(neighbors_second[1]);
+
+  for (h = 1; h < block_height - 1; ++h) {
+    // Move the weight to bottom half
+    if (!use_whole_blk && h == block_height / 2) {
+      weight = bottom_weight;
+    }
+    // Shift the rows up
+    sum_row_1_first = sum_row_2_first;
+    sum_row_1_second = sum_row_2_second;
+    sum_row_2_first = sum_row_3_first;
+    sum_row_2_second = sum_row_3_second;
+
+    // Add luma values to the modifier
+    sum_row_first = vaddq_u32(sum_row_1_first, sum_row_2_first);
+    sum_row_second = vaddq_u32(sum_row_1_second, sum_row_2_second);
+
+    highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+    sum_row_first = vaddq_u32(sum_row_first, sum_row_3_first);
+    sum_row_second = vaddq_u32(sum_row_second, sum_row_3_second);
+
+    // Add chroma values to the modifier
+    if (ss_y == 0 || h % 2 == 0) {
+      // Only calculate the new chroma distortion if we are at a pixel that
+      // corresponds to a new chroma row
+      highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
+                                    &v_first, &v_second);
+
+      u_dist += DIST_STRIDE;
+      v_dist += DIST_STRIDE;
+    }
+
+    sum_row_first = vaddq_u32(sum_row_first, u_first);
+    sum_row_second = vaddq_u32(sum_row_second, u_second);
+    sum_row_first = vaddq_u32(sum_row_first, v_first);
+    sum_row_second = vaddq_u32(sum_row_second, v_second);
+
+    // Get modifier and store result
+    highbd_average_8(&sum_row_first, &sum_row_second, sum_row_first,
+                     sum_row_second, &mul_first, &mul_second, strength,
+                     rounding, weight);
+    highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
+                                  y_accum);
+
+    y_pre += y_pre_stride;
+    y_count += y_pre_stride;
+    y_accum += y_pre_stride;
+    y_dist += DIST_STRIDE;
+  }
+
+  // The last row
+  mul_first = vld1q_u32(neighbors_first[0]);
+  mul_second = vld1q_u32(neighbors_second[0]);
+
+  // Shift the rows up
+  sum_row_1_first = sum_row_2_first;
+  sum_row_1_second = sum_row_2_second;
+  sum_row_2_first = sum_row_3_first;
+  sum_row_2_second = sum_row_3_second;
+
+  // Add luma values to the modifier
+  sum_row_first = vaddq_u32(sum_row_1_first, sum_row_2_first);
+  sum_row_second = vaddq_u32(sum_row_1_second, sum_row_2_second);
+
+  // Add chroma values to the modifier
+  if (ss_y == 0) {
+    // Only calculate the new chroma distortion if we are at a pixel that
+    // corresponds to a new chroma row
+    highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
+                                  &v_first, &v_second);
+  }
+
+  sum_row_first = vaddq_u32(sum_row_first, u_first);
+  sum_row_second = vaddq_u32(sum_row_second, u_second);
+  sum_row_first = vaddq_u32(sum_row_first, v_first);
+  sum_row_second = vaddq_u32(sum_row_second, v_second);
+
+  // Get modifier and store result
+  highbd_average_8(&sum_row_first, &sum_row_second, sum_row_first,
+                   sum_row_second, &mul_first, &mul_second, strength, rounding,
+                   weight);
+  highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
+                                y_accum);
+}
+
+// Perform temporal filter for the luma component.
+static void highbd_apply_temporal_filter_luma(
+    const uint16_t *y_pre, int y_pre_stride, unsigned int block_width,
+    unsigned int block_height, int ss_x, int ss_y, int strength,
+    const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+    const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) {
+  unsigned int blk_col = 0, uv_blk_col = 0;
+  const unsigned int blk_col_step = 8, uv_blk_col_step = 8 >> ss_x;
+  const unsigned int mid_width = block_width >> 1,
+                     last_width = block_width - blk_col_step;
+  int top_weight = blk_fw[0],
+      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+  const uint32_t *const *neighbors_first;
+  const uint32_t *const *neighbors_second;
+
+  // Left
+  neighbors_first = HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS;
+  neighbors_second = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS;
+  highbd_apply_temporal_filter_luma_8(
+      y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+      strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+      y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+      neighbors_first, neighbors_second, top_weight, bottom_weight);
+
+  blk_col += blk_col_step;
+  uv_blk_col += uv_blk_col_step;
+
+  // Middle First
+  neighbors_first = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS;
+  for (; blk_col < mid_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    highbd_apply_temporal_filter_luma_8(
+        y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+        strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+        neighbors_first, neighbors_second, top_weight, bottom_weight);
+  }
+
+  if (!use_whole_blk) {
+    top_weight = blk_fw[1];
+    bottom_weight = blk_fw[3];
+  }
+
+  // Middle Second
+  for (; blk_col < last_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    highbd_apply_temporal_filter_luma_8(
+        y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+        strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+        neighbors_first, neighbors_second, top_weight, bottom_weight);
+  }
+
+  // Right
+  neighbors_second = HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS;
+  highbd_apply_temporal_filter_luma_8(
+      y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+      strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+      y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+      neighbors_first, neighbors_second, top_weight, bottom_weight);
+}
+
+// Add a row of luma distortion that corresponds to 8 chroma mods. If we are
+// subsampling in x direction, then we have 16 lumas, else we have 8.
+static INLINE void highbd_add_luma_dist_to_8_chroma_mod(
+    const uint32_t *y_dist, int ss_x, int ss_y, uint32x4_t *u_mod_fst,
+    uint32x4_t *u_mod_snd, uint32x4_t *v_mod_fst, uint32x4_t *v_mod_snd) {
+  uint32x4_t y_reg_fst, y_reg_snd;
+  if (!ss_x) {
+    highbd_read_dist_8(y_dist, &y_reg_fst, &y_reg_snd);
+    if (ss_y == 1) {
+      uint32x4_t y_tmp_fst, y_tmp_snd;
+      highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
+      y_reg_fst = vaddq_u32(y_reg_fst, y_tmp_fst);
+      y_reg_snd = vaddq_u32(y_reg_snd, y_tmp_snd);
+    }
+  } else {
+    // Temporary
+    uint32x4_t y_fst, y_snd;
+    uint64x2_t y_fst64, y_snd64;
+
+    // First 8
+    highbd_read_dist_8(y_dist, &y_fst, &y_snd);
+    if (ss_y == 1) {
+      uint32x4_t y_tmp_fst, y_tmp_snd;
+      highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
+
+      y_fst = vaddq_u32(y_fst, y_tmp_fst);
+      y_snd = vaddq_u32(y_snd, y_tmp_snd);
+    }
+
+    y_fst64 = vpaddlq_u32(y_fst);
+    y_snd64 = vpaddlq_u32(y_snd);
+    y_reg_fst = vcombine_u32(vqmovn_u64(y_fst64), vqmovn_u64(y_snd64));
+
+    // Second 8
+    highbd_read_dist_8(y_dist + 8, &y_fst, &y_snd);
+    if (ss_y == 1) {
+      uint32x4_t y_tmp_fst, y_tmp_snd;
+      highbd_read_dist_8(y_dist + 8 + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
+
+      y_fst = vaddq_u32(y_fst, y_tmp_fst);
+      y_snd = vaddq_u32(y_snd, y_tmp_snd);
+    }
+
+    y_fst64 = vpaddlq_u32(y_fst);
+    y_snd64 = vpaddlq_u32(y_snd);
+    y_reg_snd = vcombine_u32(vqmovn_u64(y_fst64), vqmovn_u64(y_snd64));
+  }
+
+  *u_mod_fst = vaddq_u32(*u_mod_fst, y_reg_fst);
+  *u_mod_snd = vaddq_u32(*u_mod_snd, y_reg_snd);
+  *v_mod_fst = vaddq_u32(*v_mod_fst, y_reg_fst);
+  *v_mod_snd = vaddq_u32(*v_mod_snd, y_reg_snd);
+}
+
+// Apply temporal filter to the chroma components. This performs temporal
+// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use
+// blk_fw as an array of size 4 for the weights for each of the 4 subblocks,
+// else use top_weight for top half, and bottom weight for bottom half.
+static void highbd_apply_temporal_filter_chroma_8(
+    const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride,
+    unsigned int uv_block_width, unsigned int uv_block_height, int ss_x,
+    int ss_y, int strength, uint32_t *u_accum, uint16_t *u_count,
+    uint32_t *v_accum, uint16_t *v_count, const uint32_t *y_dist,
+    const uint32_t *u_dist, const uint32_t *v_dist,
+    const uint32_t *const *neighbors_fst, const uint32_t *const *neighbors_snd,
+    int top_weight, int bottom_weight, const int *blk_fw) {
+  const int rounding = (1 << strength) >> 1;
+  int weight = top_weight;
+
+  uint32x4_t mul_fst, mul_snd;
+
+  uint32x4_t u_sum_row_1_fst, u_sum_row_2_fst, u_sum_row_3_fst;
+  uint32x4_t v_sum_row_1_fst, v_sum_row_2_fst, v_sum_row_3_fst;
+  uint32x4_t u_sum_row_1_snd, u_sum_row_2_snd, u_sum_row_3_snd;
+  uint32x4_t v_sum_row_1_snd, v_sum_row_2_snd, v_sum_row_3_snd;
+
+  uint32x4_t u_sum_row_fst, v_sum_row_fst;
+  uint32x4_t u_sum_row_snd, v_sum_row_snd;
+
+  // Loop variable
+  unsigned int h;
+
+  (void)uv_block_width;
+
+  // First row
+  mul_fst = vld1q_u32(neighbors_fst[0]);
+  mul_snd = vld1q_u32(neighbors_snd[0]);
+
+  // Add chroma values
+  highbd_get_sum_8(u_dist, &u_sum_row_2_fst, &u_sum_row_2_snd);
+  highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd);
+
+  u_sum_row_fst = vaddq_u32(u_sum_row_2_fst, u_sum_row_3_fst);
+  u_sum_row_snd = vaddq_u32(u_sum_row_2_snd, u_sum_row_3_snd);
+
+  highbd_get_sum_8(v_dist, &v_sum_row_2_fst, &v_sum_row_2_snd);
+  highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd);
+
+  v_sum_row_fst = vaddq_u32(v_sum_row_2_fst, v_sum_row_3_fst);
+  v_sum_row_snd = vaddq_u32(v_sum_row_2_snd, v_sum_row_3_snd);
+
+  // Add luma values
+  highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
+                                       &u_sum_row_snd, &v_sum_row_fst,
+                                       &v_sum_row_snd);
+
+  // Get modifier and store result
+  if (blk_fw) {
+    highbd_average_4(&u_sum_row_fst, u_sum_row_fst, &mul_fst, strength,
+                     rounding, blk_fw[0]);
+    highbd_average_4(&u_sum_row_snd, u_sum_row_snd, &mul_snd, strength,
+                     rounding, blk_fw[1]);
+
+    highbd_average_4(&v_sum_row_fst, v_sum_row_fst, &mul_fst, strength,
+                     rounding, blk_fw[0]);
+    highbd_average_4(&v_sum_row_snd, v_sum_row_snd, &mul_snd, strength,
+                     rounding, blk_fw[1]);
+
+  } else {
+    highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, u_sum_row_fst,
+                     u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+                     weight);
+    highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, v_sum_row_fst,
+                     v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+                     weight);
+  }
+  highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
+                                u_accum);
+  highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
+                                v_accum);
+
+  u_pre += uv_pre_stride;
+  u_dist += DIST_STRIDE;
+  v_pre += uv_pre_stride;
+  v_dist += DIST_STRIDE;
+  u_count += uv_pre_stride;
+  u_accum += uv_pre_stride;
+  v_count += uv_pre_stride;
+  v_accum += uv_pre_stride;
+
+  y_dist += DIST_STRIDE * (1 + ss_y);
+
+  // Then all the rows except the last one
+  mul_fst = vld1q_u32(neighbors_fst[1]);
+  mul_snd = vld1q_u32(neighbors_snd[1]);
+
+  for (h = 1; h < uv_block_height - 1; ++h) {
+    // Move the weight pointer to the bottom half of the blocks
+    if (h == uv_block_height / 2) {
+      if (blk_fw) {
+        blk_fw += 2;
+      } else {
+        weight = bottom_weight;
+      }
+    }
+
+    // Shift the rows up
+    u_sum_row_1_fst = u_sum_row_2_fst;
+    u_sum_row_2_fst = u_sum_row_3_fst;
+    u_sum_row_1_snd = u_sum_row_2_snd;
+    u_sum_row_2_snd = u_sum_row_3_snd;
+
+    v_sum_row_1_fst = v_sum_row_2_fst;
+    v_sum_row_2_fst = v_sum_row_3_fst;
+    v_sum_row_1_snd = v_sum_row_2_snd;
+    v_sum_row_2_snd = v_sum_row_3_snd;
+
+    // Add chroma values
+    u_sum_row_fst = vaddq_u32(u_sum_row_1_fst, u_sum_row_2_fst);
+    u_sum_row_snd = vaddq_u32(u_sum_row_1_snd, u_sum_row_2_snd);
+    highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd);
+    u_sum_row_fst = vaddq_u32(u_sum_row_fst, u_sum_row_3_fst);
+    u_sum_row_snd = vaddq_u32(u_sum_row_snd, u_sum_row_3_snd);
+
+    v_sum_row_fst = vaddq_u32(v_sum_row_1_fst, v_sum_row_2_fst);
+    v_sum_row_snd = vaddq_u32(v_sum_row_1_snd, v_sum_row_2_snd);
+    highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd);
+    v_sum_row_fst = vaddq_u32(v_sum_row_fst, v_sum_row_3_fst);
+    v_sum_row_snd = vaddq_u32(v_sum_row_snd, v_sum_row_3_snd);
+
+    // Add luma values
+    highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
+                                         &u_sum_row_snd, &v_sum_row_fst,
+                                         &v_sum_row_snd);
+
+    // Get modifier and store result
+    if (blk_fw) {
+      highbd_average_4(&u_sum_row_fst, u_sum_row_fst, &mul_fst, strength,
+                       rounding, blk_fw[0]);
+      highbd_average_4(&u_sum_row_snd, u_sum_row_snd, &mul_snd, strength,
+                       rounding, blk_fw[1]);
+
+      highbd_average_4(&v_sum_row_fst, v_sum_row_fst, &mul_fst, strength,
+                       rounding, blk_fw[0]);
+      highbd_average_4(&v_sum_row_snd, v_sum_row_snd, &mul_snd, strength,
+                       rounding, blk_fw[1]);
+
+    } else {
+      highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, u_sum_row_fst,
+                       u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+                       weight);
+      highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, v_sum_row_fst,
+                       v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+                       weight);
+    }
+
+    highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
+                                  u_accum);
+    highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
+                                  v_accum);
+
+    u_pre += uv_pre_stride;
+    u_dist += DIST_STRIDE;
+    v_pre += uv_pre_stride;
+    v_dist += DIST_STRIDE;
+    u_count += uv_pre_stride;
+    u_accum += uv_pre_stride;
+    v_count += uv_pre_stride;
+    v_accum += uv_pre_stride;
+
+    y_dist += DIST_STRIDE * (1 + ss_y);
+  }
+
+  // The last row
+  mul_fst = vld1q_u32(neighbors_fst[0]);
+  mul_snd = vld1q_u32(neighbors_snd[0]);
+
+  // Shift the rows up
+  u_sum_row_1_fst = u_sum_row_2_fst;
+  u_sum_row_2_fst = u_sum_row_3_fst;
+  u_sum_row_1_snd = u_sum_row_2_snd;
+  u_sum_row_2_snd = u_sum_row_3_snd;
+
+  v_sum_row_1_fst = v_sum_row_2_fst;
+  v_sum_row_2_fst = v_sum_row_3_fst;
+  v_sum_row_1_snd = v_sum_row_2_snd;
+  v_sum_row_2_snd = v_sum_row_3_snd;
+
+  // Add chroma values
+  u_sum_row_fst = vaddq_u32(u_sum_row_1_fst, u_sum_row_2_fst);
+  v_sum_row_fst = vaddq_u32(v_sum_row_1_fst, v_sum_row_2_fst);
+  u_sum_row_snd = vaddq_u32(u_sum_row_1_snd, u_sum_row_2_snd);
+  v_sum_row_snd = vaddq_u32(v_sum_row_1_snd, v_sum_row_2_snd);
+
+  // Add luma values
+  highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
+                                       &u_sum_row_snd, &v_sum_row_fst,
+                                       &v_sum_row_snd);
+
+  // Get modifier and store result
+  if (blk_fw) {
+    highbd_average_4(&u_sum_row_fst, u_sum_row_fst, &mul_fst, strength,
+                     rounding, blk_fw[0]);
+    highbd_average_4(&u_sum_row_snd, u_sum_row_snd, &mul_snd, strength,
+                     rounding, blk_fw[1]);
+
+    highbd_average_4(&v_sum_row_fst, v_sum_row_fst, &mul_fst, strength,
+                     rounding, blk_fw[0]);
+    highbd_average_4(&v_sum_row_snd, v_sum_row_snd, &mul_snd, strength,
+                     rounding, blk_fw[1]);
+
+  } else {
+    highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, u_sum_row_fst,
+                     u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+                     weight);
+    highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, v_sum_row_fst,
+                     v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+                     weight);
+  }
+
+  highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
+                                u_accum);
+  highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
+                                v_accum);
+}
+
+// Perform temporal filter for the chroma components.
+static void highbd_apply_temporal_filter_chroma(
+    const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride,
+    unsigned int block_width, unsigned int block_height, int ss_x, int ss_y,
+    int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum,
+    uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+    const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) {
+  const unsigned int uv_width = block_width >> ss_x,
+                     uv_height = block_height >> ss_y;
+
+  unsigned int blk_col = 0, uv_blk_col = 0;
+  const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x;
+  const unsigned int uv_mid_width = uv_width >> 1,
+                     uv_last_width = uv_width - uv_blk_col_step;
+  int top_weight = blk_fw[0],
+      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+  const uint32_t *const *neighbors_fst;
+  const uint32_t *const *neighbors_snd;
+
+  if (uv_width == 8) {
+    // Special Case: We are subsampling in x direction on a 16x16 block. Since
+    // we are operating on a row of 8 chroma pixels, we can't use the usual
+    // left-middle-right pattern.
+    assert(ss_x);
+
+    if (ss_y) {
+      neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
+      neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
+    } else {
+      neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
+      neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
+    }
+
+    if (use_whole_blk) {
+      highbd_apply_temporal_filter_chroma_8(
+          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+          uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+          u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+          y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+          neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
+    } else {
+      highbd_apply_temporal_filter_chroma_8(
+          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+          uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+          u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+          y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+          neighbors_fst, neighbors_snd, 0, 0, blk_fw);
+    }
+
+    return;
+  }
+
+  // Left
+  if (ss_x && ss_y) {
+    neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
+    neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+  } else if (ss_x || ss_y) {
+    neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
+    neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+  } else {
+    neighbors_fst = HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS;
+    neighbors_snd = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
+  }
+
+  highbd_apply_temporal_filter_chroma_8(
+      u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+      uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+      u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+      y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst,
+      neighbors_snd, top_weight, bottom_weight, NULL);
+
+  blk_col += blk_col_step;
+  uv_blk_col += uv_blk_col_step;
+
+  // Middle First
+  if (ss_x && ss_y) {
+    neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+  } else if (ss_x || ss_y) {
+    neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+  } else {
+    neighbors_fst = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
+  }
+
+  for (; uv_blk_col < uv_mid_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    highbd_apply_temporal_filter_chroma_8(
+        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+        uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+        u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+        neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
+  }
+
+  if (!use_whole_blk) {
+    top_weight = blk_fw[1];
+    bottom_weight = blk_fw[3];
+  }
+
+  // Middle Second
+  for (; uv_blk_col < uv_last_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    highbd_apply_temporal_filter_chroma_8(
+        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+        uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+        u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+        neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
+  }
+
+  // Right
+  if (ss_x && ss_y) {
+    neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
+  } else if (ss_x || ss_y) {
+    neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
+  } else {
+    neighbors_snd = HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS;
+  }
+
+  highbd_apply_temporal_filter_chroma_8(
+      u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+      uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+      u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+      y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst,
+      neighbors_snd, top_weight, bottom_weight, NULL);
+}
+
+void vp9_highbd_apply_temporal_filter_neon(
+    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
+    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
+    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
+    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+    int ss_x, int ss_y, int strength, const int *const blk_fw,
+    int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum,
+    uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) {
+  const unsigned int chroma_height = block_height >> ss_y,
+                     chroma_width = block_width >> ss_x;
+
+  DECLARE_ALIGNED(16, uint32_t, y_dist[BH * DIST_STRIDE]) = { 0 };
+  DECLARE_ALIGNED(16, uint32_t, u_dist[BH * DIST_STRIDE]) = { 0 };
+  DECLARE_ALIGNED(16, uint32_t, v_dist[BH * DIST_STRIDE]) = { 0 };
+
+  uint32_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1,
+           *v_dist_ptr = v_dist + 1;
+  const uint16_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src;
+  const uint16_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre;
+
+  // Loop variables
+  unsigned int row, blk_col;
+
+  assert(block_width <= BW && "block width too large");
+  assert(block_height <= BH && "block height too large");
+  assert(block_width % 16 == 0 && "block width must be multiple of 16");
+  assert(block_height % 2 == 0 && "block height must be even");
+  assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) &&
+         "invalid chroma subsampling");
+  assert(strength >= 4 && strength <= 14 &&
+         "invalid adjusted temporal filter strength");
+  assert(blk_fw[0] >= 0 && "filter weight must be positive");
+  assert(
+      (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) &&
+      "subblock filter weight must be positive");
+  assert(blk_fw[0] <= 2 && "subblock filter weight must be less than 2");
+  assert(
+      (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) &&
+      "subblock filter weight must be less than 2");
+
+  // Precompute the difference squared
+  for (row = 0; row < block_height; row++) {
+    for (blk_col = 0; blk_col < block_width; blk_col += 8) {
+      highbd_store_dist_8(y_src_ptr + blk_col, y_pre_ptr + blk_col,
+                          y_dist_ptr + blk_col);
+    }
+    y_src_ptr += y_src_stride;
+    y_pre_ptr += y_pre_stride;
+    y_dist_ptr += DIST_STRIDE;
+  }
+
+  for (row = 0; row < chroma_height; row++) {
+    for (blk_col = 0; blk_col < chroma_width; blk_col += 8) {
+      highbd_store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col,
+                          u_dist_ptr + blk_col);
+      highbd_store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col,
+                          v_dist_ptr + blk_col);
+    }
+
+    u_src_ptr += uv_src_stride;
+    u_pre_ptr += uv_pre_stride;
+    u_dist_ptr += DIST_STRIDE;
+    v_src_ptr += uv_src_stride;
+    v_pre_ptr += uv_pre_stride;
+    v_dist_ptr += DIST_STRIDE;
+  }
+
+  y_dist_ptr = y_dist + 1;
+  u_dist_ptr = u_dist + 1;
+  v_dist_ptr = v_dist + 1;
+
+  highbd_apply_temporal_filter_luma(y_pre, y_pre_stride, block_width,
+                                    block_height, ss_x, ss_y, strength, blk_fw,
+                                    use_whole_blk, y_accum, y_count, y_dist_ptr,
+                                    u_dist_ptr, v_dist_ptr);
+
+  highbd_apply_temporal_filter_chroma(
+      u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y,
+      strength, blk_fw, use_whole_blk, u_accum, u_count, v_accum, v_count,
+      y_dist_ptr, u_dist_ptr, v_dist_ptr);
+}
diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c
index d75a481796..96d0614367 100644
--- a/vp9/encoder/arm/neon/vp9_quantize_neon.c
+++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c
@@ -11,11 +11,13 @@
 #include <arm_neon.h>
 #include <assert.h>
 #include <math.h>
+#include <stdint.h>
 
 #include "./vpx_config.h"
 #include "vpx_mem/vpx_mem.h"
 
 #include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_scan.h"
 #include "vp9/common/vp9_seg_common.h"
 
 #include "vp9/encoder/vp9_encoder.h"
@@ -26,9 +28,8 @@
 #include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 
-static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff,
-                                               const int16x8_t dequant,
-                                               tran_low_t *dqcoeff) {
+static VPX_FORCE_INLINE void calculate_dqcoeff_and_store(
+    const int16x8_t qcoeff, const int16x8_t dequant, tran_low_t *dqcoeff) {
   const int32x4_t dqcoeff_0 =
       vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
   const int32x4_t dqcoeff_1 =
@@ -42,149 +43,135 @@ static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff,
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 }
 
-void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
-                          int skip_block, const int16_t *round_ptr,
-                          const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
-                          tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                          uint16_t *eob_ptr, const int16_t *scan,
-                          const int16_t *iscan) {
+static VPX_FORCE_INLINE int16x8_t get_max_lane_eob(const int16_t *iscan_ptr,
+                                                   int16x8_t v_eobmax,
+                                                   uint16x8_t v_nz_mask) {
+  const int16x8_t v_iscan = vld1q_s16(&iscan_ptr[0]);
+  const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, vdupq_n_s16(0), v_iscan);
+  return vmaxq_s16(v_eobmax, v_nz_iscan);
+}
+
+static VPX_FORCE_INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
+#if VPX_ARCH_AARCH64
+  return (uint16_t)vmaxvq_s16(v_eobmax);
+#else
+  const int16x4_t v_eobmax_3210 =
+      vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax));
+  const int64x1_t v_eobmax_xx32 =
+      vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
+  const int16x4_t v_eobmax_tmp =
+      vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
+  const int64x1_t v_eobmax_xxx3 =
+      vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
+  const int16x4_t v_eobmax_final =
+      vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+
+  return (uint16_t)vget_lane_s16(v_eobmax_final, 0);
+#endif  // VPX_ARCH_AARCH64
+}
+
+static VPX_FORCE_INLINE void load_fp_values(
+    const struct macroblock_plane *mb_plane, const int16_t *dequant_ptr,
+    int16x8_t *round, int16x8_t *quant, int16x8_t *dequant) {
+  *round = vld1q_s16(mb_plane->round_fp);
+  *quant = vld1q_s16(mb_plane->quant_fp);
+  *dequant = vld1q_s16(dequant_ptr);
+}
+
+static VPX_FORCE_INLINE void update_fp_values(int16x8_t *v_round,
+                                              int16x8_t *v_quant,
+                                              int16x8_t *v_dequant) {
+#if VPX_ARCH_AARCH64
+  *v_round = vdupq_laneq_s16(*v_round, 1);
+  *v_quant = vdupq_laneq_s16(*v_quant, 1);
+  *v_dequant = vdupq_laneq_s16(*v_dequant, 1);
+#else
+  *v_round = vdupq_lane_s16(vget_low_s16(*v_round), 1);
+  *v_quant = vdupq_lane_s16(vget_low_s16(*v_quant), 1);
+  *v_dequant = vdupq_lane_s16(vget_low_s16(*v_dequant), 1);
+#endif
+}
+
+static VPX_FORCE_INLINE void quantize_fp_8(
+    const int16x8_t *v_round, const int16x8_t *v_quant,
+    const int16x8_t *v_dequant, const tran_low_t *coeff_ptr,
+    const int16_t *iscan_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+    int16x8_t *v_eobmax) {
+  const int16x8_t v_zero = vdupq_n_s16(0);
+  const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
+  const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+  const int16x8_t v_abs = vabsq_s16(v_coeff);
+  const int16x8_t v_tmp = vqaddq_s16(v_abs, *v_round);
+  const int32x4_t v_tmp_lo =
+      vmull_s16(vget_low_s16(v_tmp), vget_low_s16(*v_quant));
+  const int32x4_t v_tmp_hi =
+      vmull_s16(vget_high_s16(v_tmp), vget_high_s16(*v_quant));
+  const int16x8_t v_tmp2 =
+      vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
+  const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
+  const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+  const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+  calculate_dqcoeff_and_store(v_qcoeff, *v_dequant, dqcoeff_ptr);
+  store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);
+
+  *v_eobmax = get_max_lane_eob(iscan_ptr, *v_eobmax, v_nz_mask);
+}
+
+void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                          const struct macroblock_plane *mb_plane,
+                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                          const struct ScanOrder *const scan_order) {
   // Quantization pass: All coefficients with index >= zero_flag are
   // skippable. Note: zero_flag can be zero.
   int i;
-  const int16x8_t v_zero = vdupq_n_s16(0);
-  const int16x8_t v_one = vdupq_n_s16(1);
-  int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
-  int16x8_t v_round = vmovq_n_s16(round_ptr[1]);
-  int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]);
-  int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]);
-
-  (void)scan;
-  (void)skip_block;
-  assert(!skip_block);
-
-  // adjust for dc
-  v_round = vsetq_lane_s16(round_ptr[0], v_round, 0);
-  v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0);
-  v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0);
+  int16x8_t v_eobmax = vdupq_n_s16(-1);
+  int16x8_t v_round, v_quant, v_dequant;
+  const int16_t *iscan = scan_order->iscan;
+
+  load_fp_values(mb_plane, dequant_ptr, &v_round, &v_quant, &v_dequant);
   // process dc and the first seven ac coeffs
-  {
-    const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
-    const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
-    const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
-    const int16x8_t v_abs = vabsq_s16(v_coeff);
-    const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round);
-    const int32x4_t v_tmp_lo =
-        vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
-    const int32x4_t v_tmp_hi =
-        vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
-    const int16x8_t v_tmp2 =
-        vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
-    const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
-    const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
-    const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
-    const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
-    const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
-    calculate_dqcoeff_and_store(v_qcoeff, v_dequant, dqcoeff_ptr);
-    v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
-    store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);
-    v_round = vmovq_n_s16(round_ptr[1]);
-    v_quant = vmovq_n_s16(quant_ptr[1]);
-    v_dequant = vmovq_n_s16(dequant_ptr[1]);
-  }
+  quantize_fp_8(&v_round, &v_quant, &v_dequant, coeff_ptr, iscan, qcoeff_ptr,
+                dqcoeff_ptr, &v_eobmax);
+
   // now process the rest of the ac coeffs
-  for (i = 8; i < count; i += 8) {
-    const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
-    const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr + i);
-    const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
-    const int16x8_t v_abs = vabsq_s16(v_coeff);
-    const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round);
-    const int32x4_t v_tmp_lo =
-        vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
-    const int32x4_t v_tmp_hi =
-        vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
-    const int16x8_t v_tmp2 =
-        vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
-    const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
-    const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
-    const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
-    const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
-    const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
-    calculate_dqcoeff_and_store(v_qcoeff, v_dequant, dqcoeff_ptr + i);
-    v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
-    store_s16q_to_tran_low(qcoeff_ptr + i, v_qcoeff);
+  update_fp_values(&v_round, &v_quant, &v_dequant);
+  for (i = 8; i < n_coeffs; i += 8) {
+    quantize_fp_8(&v_round, &v_quant, &v_dequant, coeff_ptr + i, iscan + i,
+                  qcoeff_ptr + i, dqcoeff_ptr + i, &v_eobmax);
   }
-#ifdef __aarch64__
-  *eob_ptr = vmaxvq_s16(v_eobmax_76543210);
-#else
-  {
-    const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210),
-                                             vget_high_s16(v_eobmax_76543210));
-    const int64x1_t v_eobmax_xx32 =
-        vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
-    const int16x4_t v_eobmax_tmp =
-        vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
-    const int64x1_t v_eobmax_xxx3 =
-        vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
-    const int16x4_t v_eobmax_final =
-        vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
-
-    *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);
-  }
-#endif  // __aarch64__
+
+  *eob_ptr = get_max_eob(v_eobmax);
 }
 
 static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
   return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31));
 }
 
-void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count,
-                                int skip_block, const int16_t *round_ptr,
-                                const int16_t *quant_ptr,
-                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                                const int16_t *scan, const int16_t *iscan) {
-  const int16x8_t one = vdupq_n_s16(1);
-  const int16x8_t neg_one = vdupq_n_s16(-1);
+static VPX_FORCE_INLINE void quantize_fp_32x32_8(
+    const int16x8_t *v_round, const int16x8_t *v_quant,
+    const int16x8_t *v_dequant, const int16x8_t *dequant_thresh,
+    const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, int16x8_t *v_eobmax) {
+  const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
+  const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+  const int16x8_t v_coeff_abs = vabsq_s16(v_coeff);
+  const int16x8_t v_thr_mask =
+      vreinterpretq_s16_u16(vcgeq_s16(v_coeff_abs, *dequant_thresh));
+  const int16x8_t v_tmp_rnd =
+      vandq_s16(vqaddq_s16(v_coeff_abs, *v_round), v_thr_mask);
+  const int16x8_t v_abs_qcoeff = vqdmulhq_s16(v_tmp_rnd, *v_quant);
+  const int16x8_t v_qcoeff =
+      vsubq_s16(veorq_s16(v_abs_qcoeff, v_coeff_sign), v_coeff_sign);
+  const uint16x8_t v_nz_mask = vceqq_s16(v_abs_qcoeff, vdupq_n_s16(0));
 
-  // ROUND_POWER_OF_TWO(round_ptr[], 1)
-  const int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1);
-  const int16x8_t quant = vld1q_s16(quant_ptr);
-  const int16x4_t dequant = vld1_s16(dequant_ptr);
-  // dequant >> 2 is used similar to zbin as a threshold.
-  const int16x8_t dequant_thresh = vshrq_n_s16(vld1q_s16(dequant_ptr), 2);
-
-  // Process dc and the first seven ac coeffs.
-  const uint16x8_t v_iscan =
-      vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
-  const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
-  const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
-  const int16x8_t coeff_abs = vabsq_s16(coeff);
-  const int16x8_t dequant_mask =
-      vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, dequant_thresh));
-
-  int16x8_t qcoeff = vqaddq_s16(coeff_abs, round);
   int32x4_t dqcoeff_0, dqcoeff_1;
-  uint16x8_t eob_max;
-  (void)scan;
-  (void)count;
-  (void)skip_block;
-  assert(!skip_block);
-
-  // coeff * quant_ptr[]) >> 15
-  qcoeff = vqdmulhq_s16(qcoeff, quant);
-
-  // Restore sign.
-  qcoeff = veorq_s16(qcoeff, coeff_sign);
-  qcoeff = vsubq_s16(qcoeff, coeff_sign);
-  qcoeff = vandq_s16(qcoeff, dequant_mask);
-
-  // qcoeff * dequant[] / 2
-  dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), dequant);
-  dqcoeff_1 = vmull_n_s16(vget_high_s16(qcoeff), dequant_ptr[1]);
-
+  dqcoeff_0 = vmull_s16(vget_low_s16(v_qcoeff), vget_low_s16(*v_dequant));
+  dqcoeff_1 = vmull_s16(vget_high_s16(v_qcoeff), vget_high_s16(*v_dequant));
   // Add 1 if negative to round towards zero because the C uses division.
   dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
   dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
+
 #if CONFIG_VP9_HIGHBITDEPTH
   vst1q_s32(dqcoeff_ptr, vshrq_n_s32(dqcoeff_0, 1));
   vst1q_s32(dqcoeff_ptr + 4, vshrq_n_s32(dqcoeff_1, 1));
@@ -193,76 +180,224 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count,
                                                    vshrn_n_s32(dqcoeff_1, 1)));
 #endif
 
-  eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
+  store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);
+
+  *v_eobmax = get_max_lane_eob(iscan_ptr, *v_eobmax, v_nz_mask);
+}
+
+void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                                const struct macroblock_plane *mb_plane,
+                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                                const struct ScanOrder *const scan_order) {
+  int16x8_t eob_max = vdupq_n_s16(-1);
+  // ROUND_POWER_OF_TWO(round_ptr[], 1)
+  int16x8_t round = vrshrq_n_s16(vld1q_s16(mb_plane->round_fp), 1);
+  int16x8_t quant = vld1q_s16(mb_plane->quant_fp);
+  int16x8_t dequant = vld1q_s16(dequant_ptr);
+  // dequant >> 2 is used similar to zbin as a threshold.
+  int16x8_t dequant_thresh = vshrq_n_s16(vld1q_s16(dequant_ptr), 2);
+  int i;
+  const int16_t *iscan = scan_order->iscan;
+
+  (void)n_coeffs;
+
+  // Process dc and the first seven ac coeffs.
+  quantize_fp_32x32_8(&round, &quant, &dequant, &dequant_thresh, coeff_ptr,
+                      iscan, qcoeff_ptr, dqcoeff_ptr, &eob_max);
 
-  store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
+  update_fp_values(&round, &quant, &dequant);
+  dequant_thresh = vdupq_lane_s16(vget_low_s16(dequant_thresh), 1);
 
   iscan += 8;
   coeff_ptr += 8;
   qcoeff_ptr += 8;
   dqcoeff_ptr += 8;
 
-  {
-    int i;
-    const int16x8_t round = vrshrq_n_s16(vmovq_n_s16(round_ptr[1]), 1);
-    const int16x8_t quant = vmovq_n_s16(quant_ptr[1]);
-    const int16x8_t dequant_thresh =
-        vshrq_n_s16(vmovq_n_s16(dequant_ptr[1]), 2);
-
-    // Process the rest of the ac coeffs.
-    for (i = 8; i < 32 * 32; i += 8) {
-      const uint16x8_t v_iscan =
-          vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
-      const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
-      const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
-      const int16x8_t coeff_abs = vabsq_s16(coeff);
-      const int16x8_t dequant_mask =
-          vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, dequant_thresh));
-
-      int16x8_t qcoeff = vqaddq_s16(coeff_abs, round);
-      int32x4_t dqcoeff_0, dqcoeff_1;
-
-      qcoeff = vqdmulhq_s16(qcoeff, quant);
-      qcoeff = veorq_s16(qcoeff, coeff_sign);
-      qcoeff = vsubq_s16(qcoeff, coeff_sign);
-      qcoeff = vandq_s16(qcoeff, dequant_mask);
-
-      dqcoeff_0 = vmull_n_s16(vget_low_s16(qcoeff), dequant_ptr[1]);
-      dqcoeff_1 = vmull_n_s16(vget_high_s16(qcoeff), dequant_ptr[1]);
-
-      dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
-      dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
+  // Process the rest of the ac coeffs.
+  for (i = 8; i < 32 * 32; i += 8) {
+    quantize_fp_32x32_8(&round, &quant, &dequant, &dequant_thresh, coeff_ptr,
+                        iscan, qcoeff_ptr, dqcoeff_ptr, &eob_max);
 
-#if CONFIG_VP9_HIGHBITDEPTH
-      vst1q_s32(dqcoeff_ptr, vshrq_n_s32(dqcoeff_0, 1));
-      vst1q_s32(dqcoeff_ptr + 4, vshrq_n_s32(dqcoeff_1, 1));
-#else
-      store_s16q_to_tran_low(
-          dqcoeff_ptr,
-          vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)));
-#endif
+    iscan += 8;
+    coeff_ptr += 8;
+    qcoeff_ptr += 8;
+    dqcoeff_ptr += 8;
+  }
+
+  *eob_ptr = get_max_eob(eob_max);
+}
 
-      eob_max =
-          vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
+#if CONFIG_VP9_HIGHBITDEPTH
+static VPX_FORCE_INLINE uint16x4_t
+highbd_quantize_fp_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+                     tran_low_t *dqcoeff_ptr, int32x4_t v_quant_s32,
+                     int32x4_t v_dequant_s32, int32x4_t v_round_s32) {
+  const int32x4_t v_coeff = vld1q_s32(coeff_ptr);
+  const int32x4_t v_coeff_sign =
+      vreinterpretq_s32_u32(vcltq_s32(v_coeff, vdupq_n_s32(0)));
+  const int32x4_t v_abs_coeff = vabsq_s32(v_coeff);
+  const int32x4_t v_tmp = vaddq_s32(v_abs_coeff, v_round_s32);
+  //  const int abs_qcoeff = (int)((tmp * quant) >> 16);
+  const int32x4_t v_abs_qcoeff = vqdmulhq_s32(v_tmp, v_quant_s32);
+  //  qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+  const int32x4_t v_qcoeff =
+      vsubq_s32(veorq_s32(v_abs_qcoeff, v_coeff_sign), v_coeff_sign);
+  const int32x4_t v_abs_dqcoeff = vmulq_s32(v_abs_qcoeff, v_dequant_s32);
+  //  dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+  const int32x4_t v_dqcoeff =
+      vsubq_s32(veorq_s32(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign);
+
+  vst1q_s32(qcoeff_ptr, v_qcoeff);
+  vst1q_s32(dqcoeff_ptr, v_dqcoeff);
+
+  // Packed nz_qcoeff_mask. Used to find eob.
+  return vmovn_u32(vceqq_s32(v_abs_qcoeff, vdupq_n_s32(0)));
+}
 
-      store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
+void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                                 const struct macroblock_plane *mb_plane,
+                                 tran_low_t *qcoeff_ptr,
+                                 tran_low_t *dqcoeff_ptr,
+                                 const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                                 const struct ScanOrder *const scan_order) {
+  const int16x4_t v_zero = vdup_n_s16(0);
+  const int16x4_t v_quant = vld1_s16(mb_plane->quant_fp);
+  const int16x4_t v_dequant = vld1_s16(dequant_ptr);
+  const int16x4_t v_round = vld1_s16(mb_plane->round_fp);
+  int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero);
+  int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15);
+  int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero);
+  uint16x4_t v_mask_lo, v_mask_hi;
+  int16x8_t v_eobmax = vdupq_n_s16(-1);
+  const int16_t *iscan = scan_order->iscan;
+
+  // DC and first 3 AC
+  v_mask_lo = highbd_quantize_fp_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
+                                   v_quant_s32, v_dequant_s32, v_round_s32);
+
+  // overwrite the DC constants with AC constants
+  v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1);
+  v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1);
+  v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1);
+
+  // 4 more AC
+  v_mask_hi =
+      highbd_quantize_fp_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
+                           v_quant_s32, v_dequant_s32, v_round_s32);
+
+  // Find the max lane eob for the first 8 coeffs.
+  v_eobmax =
+      get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi));
+
+  n_coeffs -= 8;
+  do {
+    coeff_ptr += 8;
+    qcoeff_ptr += 8;
+    dqcoeff_ptr += 8;
+    iscan += 8;
+    v_mask_lo = highbd_quantize_fp_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
+                                     v_quant_s32, v_dequant_s32, v_round_s32);
+    v_mask_hi =
+        highbd_quantize_fp_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
+                             v_quant_s32, v_dequant_s32, v_round_s32);
+    // Find the max lane eob for 8 coeffs.
+    v_eobmax =
+        get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi));
+    n_coeffs -= 8;
+  } while (n_coeffs);
+
+  *eob_ptr = get_max_eob(v_eobmax);
+}
 
-      iscan += 8;
-      coeff_ptr += 8;
-      qcoeff_ptr += 8;
-      dqcoeff_ptr += 8;
-    }
+static VPX_FORCE_INLINE uint16x4_t
+highbd_quantize_fp_32x32_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+                           tran_low_t *dqcoeff_ptr, int32x4_t v_quant_s32,
+                           int32x4_t v_dequant_s32, int32x4_t v_round_s32) {
+  const int32x4_t v_coeff = vld1q_s32(coeff_ptr);
+  const int32x4_t v_coeff_sign =
+      vreinterpretq_s32_u32(vcltq_s32(v_coeff, vdupq_n_s32(0)));
+  const int32x4_t v_abs_coeff = vabsq_s32(v_coeff);
+  // ((abs_coeff << (1 + log_scale)) >= dequant_ptr[rc01])
+  const int32x4_t v_abs_coeff_scaled = vshlq_n_s32(v_abs_coeff, 2);
+  const uint32x4_t v_mask = vcgeq_s32(v_abs_coeff_scaled, v_dequant_s32);
+  // const int64_t tmp = vmask ? (int64_t)abs_coeff + log_scaled_round : 0
+  const int32x4_t v_tmp = vandq_s32(vaddq_s32(v_abs_coeff, v_round_s32),
+                                    vreinterpretq_s32_u32(v_mask));
+  //  const int abs_qcoeff = (int)((tmp * quant) >> (16 - log_scale));
+  const int32x4_t v_abs_qcoeff =
+      vqdmulhq_s32(vshlq_n_s32(v_tmp, 1), v_quant_s32);
+  //  qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+  const int32x4_t v_qcoeff =
+      vsubq_s32(veorq_s32(v_abs_qcoeff, v_coeff_sign), v_coeff_sign);
+  // vshlq_s32 will shift right if shift value is negative.
+  const int32x4_t v_abs_dqcoeff =
+      vshrq_n_s32(vmulq_s32(v_abs_qcoeff, v_dequant_s32), 1);
+  //  dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+  const int32x4_t v_dqcoeff =
+      vsubq_s32(veorq_s32(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign);
+
+  vst1q_s32(qcoeff_ptr, v_qcoeff);
+  vst1q_s32(dqcoeff_ptr, v_dqcoeff);
+
+  // Packed nz_qcoeff_mask. Used to find eob.
+  return vmovn_u32(vceqq_s32(v_abs_qcoeff, vdupq_n_s32(0)));
+}
 
-#ifdef __aarch64__
-    *eob_ptr = vmaxvq_u16(eob_max);
-#else
-    {
-      const uint16x4_t eob_max_0 =
-          vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
-      const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0);
-      const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
-      vst1_lane_u16(eob_ptr, eob_max_2, 0);
-    }
-#endif  // __aarch64__
-  }
+void vp9_highbd_quantize_fp_32x32_neon(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+    const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const struct ScanOrder *const scan_order) {
+  const int16x4_t v_quant = vld1_s16(mb_plane->quant_fp);
+  const int16x4_t v_dequant = vld1_s16(dequant_ptr);
+  const int16x4_t v_zero = vdup_n_s16(0);
+  const int16x4_t v_round =
+      vqrdmulh_n_s16(vld1_s16(mb_plane->round_fp), (int16_t)(1 << 14));
+  int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero);
+  int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15);
+  int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero);
+  uint16x4_t v_mask_lo, v_mask_hi;
+  int16x8_t v_eobmax = vdupq_n_s16(-1);
+  const int16_t *iscan = scan_order->iscan;
+
+  // DC and first 3 AC
+  v_mask_lo =
+      highbd_quantize_fp_32x32_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
+                                 v_quant_s32, v_dequant_s32, v_round_s32);
+
+  // overwrite the DC constants with AC constants
+  v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1);
+  v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1);
+  v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1);
+
+  // 4 more AC
+  v_mask_hi =
+      highbd_quantize_fp_32x32_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
+                                 v_quant_s32, v_dequant_s32, v_round_s32);
+
+  // Find the max lane eob for the first 8 coeffs.
+  v_eobmax =
+      get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi));
+
+  n_coeffs -= 8;
+  do {
+    coeff_ptr += 8;
+    qcoeff_ptr += 8;
+    dqcoeff_ptr += 8;
+    iscan += 8;
+    v_mask_lo =
+        highbd_quantize_fp_32x32_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
+                                   v_quant_s32, v_dequant_s32, v_round_s32);
+    v_mask_hi = highbd_quantize_fp_32x32_4(coeff_ptr + 4, qcoeff_ptr + 4,
+                                           dqcoeff_ptr + 4, v_quant_s32,
+                                           v_dequant_s32, v_round_s32);
+    // Find the max lane eob for 8 coeffs.
+    v_eobmax =
+        get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi));
+    n_coeffs -= 8;
+  } while (n_coeffs);
+
+  *eob_ptr = get_max_eob(v_eobmax);
 }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/encoder/arm/neon/vp9_temporal_filter_neon.c b/vp9/encoder/arm/neon/vp9_temporal_filter_neon.c
new file mode 100644
index 0000000000..a651a15d90
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_temporal_filter_neon.c
@@ -0,0 +1,849 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
+#include "vp9/encoder/vp9_temporal_filter_constants.h"
+
+// Read in 8 pixels from a and b as 8-bit unsigned integers, compute the
+// difference squared, and store as unsigned 16-bit integer to dst.
+static INLINE void store_dist_8(const uint8_t *a, const uint8_t *b,
+                                uint16_t *dst) {
+  const uint8x8_t a_reg = vld1_u8(a);
+  const uint8x8_t b_reg = vld1_u8(b);
+
+  uint16x8_t dist_first = vabdl_u8(a_reg, b_reg);
+  dist_first = vmulq_u16(dist_first, dist_first);
+
+  vst1q_u16(dst, dist_first);
+}
+
+static INLINE void store_dist_16(const uint8_t *a, const uint8_t *b,
+                                 uint16_t *dst) {
+  const uint8x16_t a_reg = vld1q_u8(a);
+  const uint8x16_t b_reg = vld1q_u8(b);
+
+  uint16x8_t dist_first = vabdl_u8(vget_low_u8(a_reg), vget_low_u8(b_reg));
+  uint16x8_t dist_second = vabdl_u8(vget_high_u8(a_reg), vget_high_u8(b_reg));
+  dist_first = vmulq_u16(dist_first, dist_first);
+  dist_second = vmulq_u16(dist_second, dist_second);
+
+  vst1q_u16(dst, dist_first);
+  vst1q_u16(dst + 8, dist_second);
+}
+
+static INLINE void read_dist_8(const uint16_t *dist, uint16x8_t *dist_reg) {
+  *dist_reg = vld1q_u16(dist);
+}
+
+static INLINE void read_dist_16(const uint16_t *dist, uint16x8_t *reg_first,
+                                uint16x8_t *reg_second) {
+  read_dist_8(dist, reg_first);
+  read_dist_8(dist + 8, reg_second);
+}
+
+// Average the value based on the number of values summed (9 for pixels away
+// from the border, 4 for pixels in corners, and 6 for other edge values).
+//
+// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply
+// by weight.
+static INLINE uint16x8_t average_8(uint16x8_t sum,
+                                   const uint16x8_t *mul_constants,
+                                   const int strength, const int rounding,
+                                   const uint16x8_t *weight) {
+  const uint32x4_t rounding_u32 = vdupq_n_u32(rounding << 16);
+  const uint16x8_t weight_u16 = *weight;
+  const uint16x8_t sixteen = vdupq_n_u16(16);
+  const int32x4_t strength_u32 = vdupq_n_s32(-strength - 16);
+
+  // modifier * 3 / index;
+  uint32x4_t sum_hi =
+      vmull_u16(vget_low_u16(sum), vget_low_u16(*mul_constants));
+  uint32x4_t sum_lo =
+      vmull_u16(vget_high_u16(sum), vget_high_u16(*mul_constants));
+
+  sum_lo = vqaddq_u32(sum_lo, rounding_u32);
+  sum_hi = vqaddq_u32(sum_hi, rounding_u32);
+
+  // we cannot use vshrn_n_u32 as strength is not known at compile time.
+  sum_lo = vshlq_u32(sum_lo, strength_u32);
+  sum_hi = vshlq_u32(sum_hi, strength_u32);
+
+  sum = vcombine_u16(vmovn_u32(sum_hi), vmovn_u32(sum_lo));
+
+  // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4
+  // >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385
+  // So this needs to use the epu16 version which did not come until SSE4.
+  sum = vminq_u16(sum, sixteen);
+  sum = vsubq_u16(sixteen, sum);
+  return vmulq_u16(sum, weight_u16);
+}
+
+// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.'
+static void accumulate_and_store_8(const uint16x8_t sum_u16,
+                                   const uint8_t *pred, uint16_t *count,
+                                   uint32_t *accumulator) {
+  uint16x8_t pred_u16 = vmovl_u8(vld1_u8(pred));
+  uint16x8_t count_u16 = vld1q_u16(count);
+  uint32x4_t accum_0_u32, accum_1_u32;
+
+  count_u16 = vqaddq_u16(count_u16, sum_u16);
+  vst1q_u16(count, count_u16);
+
+  accum_0_u32 = vld1q_u32(accumulator);
+  accum_1_u32 = vld1q_u32(accumulator + 4);
+
+  accum_0_u32 =
+      vmlal_u16(accum_0_u32, vget_low_u16(sum_u16), vget_low_u16(pred_u16));
+  accum_1_u32 =
+      vmlal_u16(accum_1_u32, vget_high_u16(sum_u16), vget_high_u16(pred_u16));
+
+  vst1q_u32(accumulator, accum_0_u32);
+  vst1q_u32(accumulator + 4, accum_1_u32);
+}
+
+static INLINE void accumulate_and_store_16(const uint16x8_t sum_0_u16,
+                                           const uint16x8_t sum_1_u16,
+                                           const uint8_t *pred, uint16_t *count,
+                                           uint32_t *accumulator) {
+  uint8x16_t pred_u8 = vld1q_u8(pred);
+  uint16x8_t pred_0_u16 = vmovl_u8(vget_low_u8(pred_u8));
+  uint16x8_t pred_1_u16 = vmovl_u8(vget_high_u8(pred_u8));
+  uint16x8_t count_0_u16 = vld1q_u16(count);
+  uint16x8_t count_1_u16 = vld1q_u16(count + 8);
+  uint32x4_t accum_0_u32, accum_1_u32, accum_2_u32, accum_3_u32;
+
+  count_0_u16 = vqaddq_u16(count_0_u16, sum_0_u16);
+  vst1q_u16(count, count_0_u16);
+  count_1_u16 = vqaddq_u16(count_1_u16, sum_1_u16);
+  vst1q_u16(count + 8, count_1_u16);
+
+  accum_0_u32 = vld1q_u32(accumulator);
+  accum_1_u32 = vld1q_u32(accumulator + 4);
+  accum_2_u32 = vld1q_u32(accumulator + 8);
+  accum_3_u32 = vld1q_u32(accumulator + 12);
+
+  accum_0_u32 =
+      vmlal_u16(accum_0_u32, vget_low_u16(sum_0_u16), vget_low_u16(pred_0_u16));
+  accum_1_u32 = vmlal_u16(accum_1_u32, vget_high_u16(sum_0_u16),
+                          vget_high_u16(pred_0_u16));
+  accum_2_u32 =
+      vmlal_u16(accum_2_u32, vget_low_u16(sum_1_u16), vget_low_u16(pred_1_u16));
+  accum_3_u32 = vmlal_u16(accum_3_u32, vget_high_u16(sum_1_u16),
+                          vget_high_u16(pred_1_u16));
+
+  vst1q_u32(accumulator, accum_0_u32);
+  vst1q_u32(accumulator + 4, accum_1_u32);
+  vst1q_u32(accumulator + 8, accum_2_u32);
+  vst1q_u32(accumulator + 12, accum_3_u32);
+}
+
+// Read in 8 pixels from y_dist. For each index i, compute y_dist[i-1] +
+// y_dist[i] + y_dist[i+1] and store in sum as 16-bit unsigned int.
+static INLINE void get_sum_8(const uint16_t *y_dist, uint16x8_t *sum) {
+  uint16x8_t dist_reg, dist_left, dist_right;
+
+  dist_reg = vld1q_u16(y_dist);
+  dist_left = vld1q_u16(y_dist - 1);
+  dist_right = vld1q_u16(y_dist + 1);
+
+  *sum = vqaddq_u16(dist_reg, dist_left);
+  *sum = vqaddq_u16(*sum, dist_right);
+}
+
+// Read in 16 pixels from y_dist. For each index i, compute y_dist[i-1] +
+// y_dist[i] + y_dist[i+1]. Store the result for first 8 pixels in sum_first and
+// the rest in sum_second.
+static INLINE void get_sum_16(const uint16_t *y_dist, uint16x8_t *sum_first,
+                              uint16x8_t *sum_second) {
+  get_sum_8(y_dist, sum_first);
+  get_sum_8(y_dist + 8, sum_second);
+}
+
+// Read in a row of chroma values corresponds to a row of 16 luma values.
+static INLINE void read_chroma_dist_row_16(int ss_x, const uint16_t *u_dist,
+                                           const uint16_t *v_dist,
+                                           uint16x8_t *u_first,
+                                           uint16x8_t *u_second,
+                                           uint16x8_t *v_first,
+                                           uint16x8_t *v_second) {
+  if (!ss_x) {
+    // If there is no chroma subsampling in the horizontal direction, then we
+    // need to load 16 entries from chroma.
+    read_dist_16(u_dist, u_first, u_second);
+    read_dist_16(v_dist, v_first, v_second);
+  } else {  // ss_x == 1
+    // Otherwise, we only need to load 8 entries
+    uint16x8_t u_reg, v_reg;
+    uint16x8x2_t pair;
+
+    read_dist_8(u_dist, &u_reg);
+
+    pair = vzipq_u16(u_reg, u_reg);
+    *u_first = pair.val[0];
+    *u_second = pair.val[1];
+
+    read_dist_8(v_dist, &v_reg);
+
+    pair = vzipq_u16(v_reg, v_reg);
+    *v_first = pair.val[0];
+    *v_second = pair.val[1];
+  }
+}
+
+// Add a row of luma distortion to 8 corresponding chroma mods.
+static INLINE void add_luma_dist_to_8_chroma_mod(const uint16_t *y_dist,
+                                                 int ss_x, int ss_y,
+                                                 uint16x8_t *u_mod,
+                                                 uint16x8_t *v_mod) {
+  uint16x8_t y_reg;
+  if (!ss_x) {
+    read_dist_8(y_dist, &y_reg);
+    if (ss_y == 1) {
+      uint16x8_t y_tmp;
+      read_dist_8(y_dist + DIST_STRIDE, &y_tmp);
+
+      y_reg = vqaddq_u16(y_reg, y_tmp);
+    }
+  } else {
+    uint16x8_t y_first, y_second;
+    uint32x4_t y_first32, y_second32;
+
+    read_dist_16(y_dist, &y_first, &y_second);
+    if (ss_y == 1) {
+      uint16x8_t y_tmp_0, y_tmp_1;
+      read_dist_16(y_dist + DIST_STRIDE, &y_tmp_0, &y_tmp_1);
+
+      y_first = vqaddq_u16(y_first, y_tmp_0);
+      y_second = vqaddq_u16(y_second, y_tmp_1);
+    }
+
+    y_first32 = vpaddlq_u16(y_first);
+    y_second32 = vpaddlq_u16(y_second);
+
+    y_reg = vcombine_u16(vqmovn_u32(y_first32), vqmovn_u32(y_second32));
+  }
+
+  *u_mod = vqaddq_u16(*u_mod, y_reg);
+  *v_mod = vqaddq_u16(*v_mod, y_reg);
+}
+
+// Apply temporal filter to the luma components. This performs temporal
+// filtering on a luma block of 16 X block_height. Use blk_fw as an array of
+// size 4 for the weights for each of the 4 subblocks if blk_fw is not NULL,
+// else use top_weight for top half, and bottom weight for bottom half.
+static void apply_temporal_filter_luma_16(
+    const uint8_t *y_pre, int y_pre_stride, unsigned int block_width,
+    unsigned int block_height, int ss_x, int ss_y, int strength,
+    int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+    const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist,
+    const int16_t *const *neighbors_first,
+    const int16_t *const *neighbors_second, int top_weight, int bottom_weight,
+    const int *blk_fw) {
+  const int rounding = (1 << strength) >> 1;
+  uint16x8_t weight_first, weight_second;
+
+  uint16x8_t mul_first, mul_second;
+
+  uint16x8_t sum_row_1_first, sum_row_1_second;
+  uint16x8_t sum_row_2_first, sum_row_2_second;
+  uint16x8_t sum_row_3_first, sum_row_3_second;
+
+  uint16x8_t u_first, u_second;
+  uint16x8_t v_first, v_second;
+
+  uint16x8_t sum_row_first;
+  uint16x8_t sum_row_second;
+
+  // Loop variables
+  unsigned int h;
+
+  assert(strength >= 0);
+  assert(strength <= 6);
+
+  assert(block_width == 16);
+  (void)block_width;
+
+  // Initialize the weights
+  if (blk_fw) {
+    weight_first = vdupq_n_u16(blk_fw[0]);
+    weight_second = vdupq_n_u16(blk_fw[1]);
+  } else {
+    weight_first = vdupq_n_u16(top_weight);
+    weight_second = weight_first;
+  }
+
+  // First row
+  mul_first = vld1q_u16((const uint16_t *)neighbors_first[0]);
+  mul_second = vld1q_u16((const uint16_t *)neighbors_second[0]);
+
+  // Add luma values
+  get_sum_16(y_dist, &sum_row_2_first, &sum_row_2_second);
+  get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+  sum_row_first = vqaddq_u16(sum_row_2_first, sum_row_3_first);
+  sum_row_second = vqaddq_u16(sum_row_2_second, sum_row_3_second);
+
+  // Add chroma values
+  read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first,
+                          &v_second);
+
+  sum_row_first = vqaddq_u16(sum_row_first, u_first);
+  sum_row_second = vqaddq_u16(sum_row_second, u_second);
+
+  sum_row_first = vqaddq_u16(sum_row_first, v_first);
+  sum_row_second = vqaddq_u16(sum_row_second, v_second);
+
+  // Get modifier and store result
+  sum_row_first =
+      average_8(sum_row_first, &mul_first, strength, rounding, &weight_first);
+
+  sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding,
+                             &weight_second);
+
+  accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
+                          y_accum);
+
+  y_pre += y_pre_stride;
+  y_count += y_pre_stride;
+  y_accum += y_pre_stride;
+  y_dist += DIST_STRIDE;
+
+  u_dist += DIST_STRIDE;
+  v_dist += DIST_STRIDE;
+
+  // Then all the rows except the last one
+  mul_first = vld1q_u16((const uint16_t *)neighbors_first[1]);
+  mul_second = vld1q_u16((const uint16_t *)neighbors_second[1]);
+
+  for (h = 1; h < block_height - 1; ++h) {
+    // Move the weight to bottom half
+    if (!use_whole_blk && h == block_height / 2) {
+      if (blk_fw) {
+        weight_first = vdupq_n_u16(blk_fw[2]);
+        weight_second = vdupq_n_u16(blk_fw[3]);
+      } else {
+        weight_first = vdupq_n_u16(bottom_weight);
+        weight_second = weight_first;
+      }
+    }
+    // Shift the rows up
+    sum_row_1_first = sum_row_2_first;
+    sum_row_1_second = sum_row_2_second;
+    sum_row_2_first = sum_row_3_first;
+    sum_row_2_second = sum_row_3_second;
+
+    // Add luma values to the modifier
+    sum_row_first = vqaddq_u16(sum_row_1_first, sum_row_2_first);
+    sum_row_second = vqaddq_u16(sum_row_1_second, sum_row_2_second);
+
+    get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+    sum_row_first = vqaddq_u16(sum_row_first, sum_row_3_first);
+    sum_row_second = vqaddq_u16(sum_row_second, sum_row_3_second);
+
+    // Add chroma values to the modifier
+    if (ss_y == 0 || h % 2 == 0) {
+      // Only calculate the new chroma distortion if we are at a pixel that
+      // corresponds to a new chroma row
+      read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second,
+                              &v_first, &v_second);
+      u_dist += DIST_STRIDE;
+      v_dist += DIST_STRIDE;
+    }
+
+    sum_row_first = vqaddq_u16(sum_row_first, u_first);
+    sum_row_second = vqaddq_u16(sum_row_second, u_second);
+    sum_row_first = vqaddq_u16(sum_row_first, v_first);
+    sum_row_second = vqaddq_u16(sum_row_second, v_second);
+
+    // Get modifier and store result
+    sum_row_first =
+        average_8(sum_row_first, &mul_first, strength, rounding, &weight_first);
+    sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding,
+                               &weight_second);
+    accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
+                            y_accum);
+    y_pre += y_pre_stride;
+    y_count += y_pre_stride;
+    y_accum += y_pre_stride;
+    y_dist += DIST_STRIDE;
+  }
+
+  // The last row
+  mul_first = vld1q_u16((const uint16_t *)neighbors_first[0]);
+  mul_second = vld1q_u16((const uint16_t *)neighbors_second[0]);
+
+  // Shift the rows up
+  sum_row_1_first = sum_row_2_first;
+  sum_row_1_second = sum_row_2_second;
+  sum_row_2_first = sum_row_3_first;
+  sum_row_2_second = sum_row_3_second;
+
+  // Add luma values to the modifier
+  sum_row_first = vqaddq_u16(sum_row_1_first, sum_row_2_first);
+  sum_row_second = vqaddq_u16(sum_row_1_second, sum_row_2_second);
+
+  // Add chroma values to the modifier
+  if (ss_y == 0) {
+    // Only calculate the new chroma distortion if we are at a pixel that
+    // corresponds to a new chroma row
+    read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first,
+                            &v_second);
+  }
+
+  sum_row_first = vqaddq_u16(sum_row_first, u_first);
+  sum_row_second = vqaddq_u16(sum_row_second, u_second);
+  sum_row_first = vqaddq_u16(sum_row_first, v_first);
+  sum_row_second = vqaddq_u16(sum_row_second, v_second);
+
+  // Get modifier and store result
+  sum_row_first =
+      average_8(sum_row_first, &mul_first, strength, rounding, &weight_first);
+  sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding,
+                             &weight_second);
+  accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
+                          y_accum);
+}
+
+// Perform temporal filter for the luma component.
+static void apply_temporal_filter_luma(
+    const uint8_t *y_pre, int y_pre_stride, unsigned int block_width,
+    unsigned int block_height, int ss_x, int ss_y, int strength,
+    const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+    const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) {
+  unsigned int blk_col = 0, uv_blk_col = 0;
+  const unsigned int blk_col_step = 16, uv_blk_col_step = 16 >> ss_x;
+  const unsigned int mid_width = block_width >> 1,
+                     last_width = block_width - blk_col_step;
+  int top_weight = blk_fw[0],
+      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+  const int16_t *const *neighbors_first;
+  const int16_t *const *neighbors_second;
+
+  if (block_width == 16) {
+    // Special Case: The block width is 16 and we are operating on a row of 16
+    // chroma pixels. In this case, we can't use the usual left-middle-right
+    // pattern. We also don't support splitting now.
+    neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS;
+    neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS;
+    if (use_whole_blk) {
+      apply_temporal_filter_luma_16(
+          y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+          use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+          u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+          neighbors_second, top_weight, bottom_weight, NULL);
+    } else {
+      apply_temporal_filter_luma_16(
+          y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+          use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+          u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+          neighbors_second, 0, 0, blk_fw);
+    }
+
+    return;
+  }
+
+  // Left
+  neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS;
+  neighbors_second = LUMA_MIDDLE_COLUMN_NEIGHBORS;
+  apply_temporal_filter_luma_16(
+      y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+      use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+      neighbors_second, top_weight, bottom_weight, NULL);
+
+  blk_col += blk_col_step;
+  uv_blk_col += uv_blk_col_step;
+
+  // Middle First
+  neighbors_first = LUMA_MIDDLE_COLUMN_NEIGHBORS;
+  for (; blk_col < mid_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    apply_temporal_filter_luma_16(
+        y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+        use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+        u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+        neighbors_second, top_weight, bottom_weight, NULL);
+  }
+
+  if (!use_whole_blk) {
+    top_weight = blk_fw[1];
+    bottom_weight = blk_fw[3];
+  }
+
+  // Middle Second
+  for (; blk_col < last_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    apply_temporal_filter_luma_16(
+        y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+        use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+        u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+        neighbors_second, top_weight, bottom_weight, NULL);
+  }
+
+  // Right
+  neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS;
+  apply_temporal_filter_luma_16(
+      y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+      use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+      neighbors_second, top_weight, bottom_weight, NULL);
+}
+
+// Apply temporal filter to the chroma components. This performs temporal
+// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use
+// blk_fw as an array of size 4 for the weights for each of the 4 subblocks,
+// else use top_weight for top half, and bottom weight for bottom half.
+static void apply_temporal_filter_chroma_8(
+    const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride,
+    unsigned int uv_block_height, int ss_x, int ss_y, int strength,
+    uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+    const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist,
+    const int16_t *const *neighbors, int top_weight, int bottom_weight,
+    const int *blk_fw) {
+  const int rounding = (1 << strength) >> 1;
+
+  uint16x8_t weight;
+
+  uint16x8_t mul;
+
+  uint16x8_t u_sum_row_1, u_sum_row_2, u_sum_row_3;
+  uint16x8_t v_sum_row_1, v_sum_row_2, v_sum_row_3;
+
+  uint16x8_t u_sum_row, v_sum_row;
+
+  // Loop variable
+  unsigned int h;
+
+  // Initialize weight
+  if (blk_fw) {
+    weight = vcombine_u16(vdup_n_u16(blk_fw[0]), vdup_n_u16(blk_fw[1]));
+  } else {
+    weight = vdupq_n_u16(top_weight);
+  }
+
+  // First row
+  mul = vld1q_u16((const uint16_t *)neighbors[0]);
+
+  // Add chroma values
+  get_sum_8(u_dist, &u_sum_row_2);
+  get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3);
+
+  u_sum_row = vqaddq_u16(u_sum_row_2, u_sum_row_3);
+
+  get_sum_8(v_dist, &v_sum_row_2);
+  get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3);
+
+  v_sum_row = vqaddq_u16(v_sum_row_2, v_sum_row_3);
+
+  // Add luma values
+  add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
+
+  // Get modifier and store result
+  u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight);
+  v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight);
+
+  accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
+  accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
+
+  u_pre += uv_pre_stride;
+  u_dist += DIST_STRIDE;
+  v_pre += uv_pre_stride;
+  v_dist += DIST_STRIDE;
+  u_count += uv_pre_stride;
+  u_accum += uv_pre_stride;
+  v_count += uv_pre_stride;
+  v_accum += uv_pre_stride;
+
+  y_dist += DIST_STRIDE * (1 + ss_y);
+
+  // Then all the rows except the last one
+  mul = vld1q_u16((const uint16_t *)neighbors[1]);
+
+  for (h = 1; h < uv_block_height - 1; ++h) {
+    // Move the weight pointer to the bottom half of the blocks
+    if (h == uv_block_height / 2) {
+      if (blk_fw) {
+        weight = vcombine_u16(vdup_n_u16(blk_fw[2]), vdup_n_u16(blk_fw[3]));
+      } else {
+        weight = vdupq_n_u16(bottom_weight);
+      }
+    }
+
+    // Shift the rows up
+    u_sum_row_1 = u_sum_row_2;
+    u_sum_row_2 = u_sum_row_3;
+
+    v_sum_row_1 = v_sum_row_2;
+    v_sum_row_2 = v_sum_row_3;
+
+    // Add chroma values
+    u_sum_row = vqaddq_u16(u_sum_row_1, u_sum_row_2);
+    get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3);
+    u_sum_row = vqaddq_u16(u_sum_row, u_sum_row_3);
+
+    v_sum_row = vqaddq_u16(v_sum_row_1, v_sum_row_2);
+    get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3);
+    v_sum_row = vqaddq_u16(v_sum_row, v_sum_row_3);
+
+    // Add luma values
+    add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
+
+    // Get modifier and store result
+    u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight);
+    v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight);
+
+    accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
+    accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
+
+    u_pre += uv_pre_stride;
+    u_dist += DIST_STRIDE;
+    v_pre += uv_pre_stride;
+    v_dist += DIST_STRIDE;
+    u_count += uv_pre_stride;
+    u_accum += uv_pre_stride;
+    v_count += uv_pre_stride;
+    v_accum += uv_pre_stride;
+
+    y_dist += DIST_STRIDE * (1 + ss_y);
+  }
+
+  // The last row
+  mul = vld1q_u16((const uint16_t *)neighbors[0]);
+
+  // Shift the rows up
+  u_sum_row_1 = u_sum_row_2;
+  u_sum_row_2 = u_sum_row_3;
+
+  v_sum_row_1 = v_sum_row_2;
+  v_sum_row_2 = v_sum_row_3;
+
+  // Add chroma values
+  u_sum_row = vqaddq_u16(u_sum_row_1, u_sum_row_2);
+  v_sum_row = vqaddq_u16(v_sum_row_1, v_sum_row_2);
+
+  // Add luma values
+  add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
+
+  // Get modifier and store result
+  u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight);
+  v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight);
+
+  accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
+  accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
+}
+
+// Perform temporal filter for the chroma components.
+static void apply_temporal_filter_chroma(
+    const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride,
+    unsigned int block_width, unsigned int block_height, int ss_x, int ss_y,
+    int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum,
+    uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+    const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) {
+  const unsigned int uv_width = block_width >> ss_x,
+                     uv_height = block_height >> ss_y;
+
+  unsigned int blk_col = 0, uv_blk_col = 0;
+  const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x;
+  const unsigned int uv_mid_width = uv_width >> 1,
+                     uv_last_width = uv_width - uv_blk_col_step;
+  int top_weight = blk_fw[0],
+      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+  const int16_t *const *neighbors;
+
+  if (uv_width == 8) {
+    // Special Case: We are subsampling in x direction on a 16x16 block. Since
+    // we are operating on a row of 8 chroma pixels, we can't use the usual
+    // left-middle-right pattern.
+    assert(ss_x);
+
+    if (ss_y) {
+      neighbors = CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS;
+    } else {
+      neighbors = CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS;
+    }
+
+    if (use_whole_blk) {
+      apply_temporal_filter_chroma_8(
+          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height,
+          ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+          v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+          u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+          bottom_weight, NULL);
+    } else {
+      apply_temporal_filter_chroma_8(
+          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height,
+          ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+          v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+          u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, 0, 0, blk_fw);
+    }
+
+    return;
+  }
+
+  // Left
+  if (ss_x && ss_y) {
+    neighbors = CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
+  } else if (ss_x || ss_y) {
+    neighbors = CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
+  } else {
+    neighbors = CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS;
+  }
+
+  apply_temporal_filter_chroma_8(
+      u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x,
+      ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+      v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+      bottom_weight, NULL);
+
+  blk_col += blk_col_step;
+  uv_blk_col += uv_blk_col_step;
+
+  // Middle First
+  if (ss_x && ss_y) {
+    neighbors = CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+  } else if (ss_x || ss_y) {
+    neighbors = CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+  } else {
+    neighbors = CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
+  }
+
+  for (; uv_blk_col < uv_mid_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    apply_temporal_filter_chroma_8(
+        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x,
+        ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+        v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+        u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+        bottom_weight, NULL);
+  }
+
+  if (!use_whole_blk) {
+    top_weight = blk_fw[1];
+    bottom_weight = blk_fw[3];
+  }
+
+  // Middle Second
+  for (; uv_blk_col < uv_last_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    apply_temporal_filter_chroma_8(
+        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x,
+        ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+        v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+        u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+        bottom_weight, NULL);
+  }
+
+  // Right
+  if (ss_x && ss_y) {
+    neighbors = CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
+  } else if (ss_x || ss_y) {
+    neighbors = CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
+  } else {
+    neighbors = CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS;
+  }
+
+  apply_temporal_filter_chroma_8(
+      u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x,
+      ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+      v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+      bottom_weight, NULL);
+}
+
+void vp9_apply_temporal_filter_neon(
+    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
+    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
+    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
+    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+    int ss_x, int ss_y, int strength, const int *const blk_fw,
+    int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum,
+    uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) {
+  const unsigned int chroma_height = block_height >> ss_y,
+                     chroma_width = block_width >> ss_x;
+
+  DECLARE_ALIGNED(16, uint16_t, y_dist[BH * DIST_STRIDE]) = { 0 };
+  DECLARE_ALIGNED(16, uint16_t, u_dist[BH * DIST_STRIDE]) = { 0 };
+  DECLARE_ALIGNED(16, uint16_t, v_dist[BH * DIST_STRIDE]) = { 0 };
+  const int *blk_fw_ptr = blk_fw;
+
+  uint16_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1,
+           *v_dist_ptr = v_dist + 1;
+  const uint8_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src;
+  const uint8_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre;
+
+  // Loop variables
+  unsigned int row, blk_col;
+
+  assert(block_width <= BW && "block width too large");
+  assert(block_height <= BH && "block height too large");
+  assert(block_width % 16 == 0 && "block width must be multiple of 16");
+  assert(block_height % 2 == 0 && "block height must be even");
+  assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) &&
+         "invalid chroma subsampling");
+  assert(strength >= 0 && strength <= 6 && "invalid temporal filter strength");
+  assert(blk_fw[0] >= 0 && "filter weight must be positive");
+  assert(
+      (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) &&
+      "subblock filter weight must be positive");
+  assert(blk_fw[0] <= 2 && "subblock filter weight must be less than 2");
+  assert(
+      (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) &&
+      "subblock filter weight must be less than 2");
+
+  // Precompute the difference squared
+  for (row = 0; row < block_height; row++) {
+    for (blk_col = 0; blk_col < block_width; blk_col += 16) {
+      store_dist_16(y_src_ptr + blk_col, y_pre_ptr + blk_col,
+                    y_dist_ptr + blk_col);
+    }
+    y_src_ptr += y_src_stride;
+    y_pre_ptr += y_pre_stride;
+    y_dist_ptr += DIST_STRIDE;
+  }
+
+  for (row = 0; row < chroma_height; row++) {
+    for (blk_col = 0; blk_col < chroma_width; blk_col += 8) {
+      store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col,
+                   u_dist_ptr + blk_col);
+      store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col,
+                   v_dist_ptr + blk_col);
+    }
+
+    u_src_ptr += uv_src_stride;
+    u_pre_ptr += uv_pre_stride;
+    u_dist_ptr += DIST_STRIDE;
+    v_src_ptr += uv_src_stride;
+    v_pre_ptr += uv_pre_stride;
+    v_dist_ptr += DIST_STRIDE;
+  }
+
+  y_dist_ptr = y_dist + 1;
+  u_dist_ptr = u_dist + 1;
+  v_dist_ptr = v_dist + 1;
+
+  apply_temporal_filter_luma(y_pre, y_pre_stride, block_width, block_height,
+                             ss_x, ss_y, strength, blk_fw_ptr, use_whole_blk,
+                             y_accum, y_count, y_dist_ptr, u_dist_ptr,
+                             v_dist_ptr);
+
+  apply_temporal_filter_chroma(u_pre, v_pre, uv_pre_stride, block_width,
+                               block_height, ss_x, ss_y, strength, blk_fw_ptr,
+                               use_whole_blk, u_accum, u_count, v_accum,
+                               v_count, y_dist_ptr, u_dist_ptr, v_dist_ptr);
+}
diff --git a/vp9/encoder/ppc/vp9_quantize_vsx.c b/vp9/encoder/ppc/vp9_quantize_vsx.c
index 4f88b8fff6..4d31558471 100644
--- a/vp9/encoder/ppc/vp9_quantize_vsx.c
+++ b/vp9/encoder/ppc/vp9_quantize_vsx.c
@@ -39,11 +39,10 @@ static INLINE int16x8_t vec_max_across(int16x8_t a) {
 }
 
 void vp9_quantize_fp_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                         int skip_block, const int16_t *round_ptr,
-                         const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
-                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                         uint16_t *eob_ptr, const int16_t *scan,
-                         const int16_t *iscan) {
+                         const int16_t *round_ptr, const int16_t *quant_ptr,
+                         tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                         const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                         const int16_t *scan, const int16_t *iscan) {
   int16x8_t qcoeff0, qcoeff1, dqcoeff0, dqcoeff1, eob;
   bool16x8_t zero_coeff0, zero_coeff1;
 
@@ -56,8 +55,6 @@ void vp9_quantize_fp_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   int16x8_t scan1 = vec_vsx_ld(16, iscan);
 
   (void)scan;
-  (void)skip_block;
-  assert(!skip_block);
 
   // First set of 8 coeff starts with DC + 7 AC
   qcoeff0 = vec_mulhi(vec_vaddshs(vec_abs(coeff0), round), quant);
@@ -165,7 +162,7 @@ static INLINE int16x8_t dequantize_coeff_32(int16x8_t qcoeff,
 }
 
 void vp9_quantize_fp_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                               int skip_block, const int16_t *round_ptr,
+                               const int16_t *round_ptr,
                                const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
                                tran_low_t *dqcoeff_ptr,
                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
@@ -194,9 +191,7 @@ void vp9_quantize_fp_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   int16x8_t abs_coeff1 = vec_abs(coeff1);
 
   (void)scan;
-  (void)skip_block;
   (void)n_coeffs;
-  assert(!skip_block);
 
   mask0 = vec_cmpge(abs_coeff0, thres);
   round = vec_sra(vec_add(round, vec_ones_s16), vec_ones_u16);
diff --git a/vp9/encoder/vp9_aq_complexity.c b/vp9/encoder/vp9_aq_complexity.c
index bd3812036c..ef3423f8eb 100644
--- a/vp9/encoder/vp9_aq_complexity.c
+++ b/vp9/encoder/vp9_aq_complexity.c
@@ -87,7 +87,7 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) {
           &cpi->rc, cm->frame_type, cm->base_qindex,
           aq_c_q_adj_factor[aq_strength][segment], cm->bit_depth);
 
-      // For AQ complexity mode, we dont allow Q0 in a segment if the base
+      // For AQ complexity mode, we don't allow Q0 in a segment if the base
       // Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment
       // Q delta is sometimes applied without going back around the rd loop.
       // This could lead to an illegal combination of partition size and q.
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index 858a416546..28ab10a13b 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -48,6 +48,7 @@ CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
   assert(MAXQ <= 255);
   memset(cr->last_coded_q_map, MAXQ, last_coded_q_map_size);
   cr->counter_encode_maxq_scene_change = 0;
+  cr->content_mode = 1;
   return cr;
 }
 
@@ -326,7 +327,8 @@ void vp9_cyclic_refresh_set_golden_update(VP9_COMP *const cpi) {
   else
     rc->baseline_gf_interval = 40;
   if (cpi->oxcf.rc_mode == VPX_VBR) rc->baseline_gf_interval = 20;
-  if (rc->avg_frame_low_motion < 50 && rc->frames_since_key > 40)
+  if (rc->avg_frame_low_motion < 50 && rc->frames_since_key > 40 &&
+      cr->content_mode)
     rc->baseline_gf_interval = 10;
 }
 
@@ -388,7 +390,8 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) {
           ? vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2, cm->base_qindex)
           : vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST1, cm->base_qindex);
   // More aggressive settings for noisy content.
-  if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium) {
+  if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium &&
+      cr->content_mode) {
     consec_zero_mv_thresh = 60;
     qindex_thresh =
         VPXMAX(vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST1, cm->base_qindex),
@@ -409,7 +412,7 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) {
 #if CONFIG_VP9_HIGHBITDEPTH
     if (cpi->common.use_highbitdepth) compute_content = 0;
 #endif
-    if (cpi->Last_Source == NULL ||
+    if (cr->content_mode == 0 || cpi->Last_Source == NULL ||
         cpi->Last_Source->y_width != cpi->Source->y_width ||
         cpi->Last_Source->y_height != cpi->Source->y_height)
       compute_content = 0;
@@ -430,7 +433,8 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) {
         // reset to 0 later depending on the coding mode.
         if (cr->map[bl_index2] == 0) {
           count_tot++;
-          if (cr->last_coded_q_map[bl_index2] > qindex_thresh ||
+          if (cr->content_mode == 0 ||
+              cr->last_coded_q_map[bl_index2] > qindex_thresh ||
               cpi->consec_zero_mv[bl_index2] < consec_zero_mv_thresh_block) {
             sum_map++;
             count_sel++;
@@ -467,7 +471,7 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) {
   cr->sb_index = i;
   cr->reduce_refresh = 0;
   if (cpi->oxcf.content != VP9E_CONTENT_SCREEN)
-    if (count_sel<(3 * count_tot)>> 2) cr->reduce_refresh = 1;
+    if (count_sel < (3 * count_tot) >> 2) cr->reduce_refresh = 1;
 }
 
 // Set cyclic refresh parameters.
@@ -489,10 +493,13 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) {
       rc->avg_frame_qindex[INTER_FRAME] < qp_thresh ||
       (cpi->use_svc &&
        cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) ||
-      (!cpi->use_svc && rc->avg_frame_low_motion < thresh_low_motion &&
+      (!cpi->use_svc && cr->content_mode &&
+       rc->avg_frame_low_motion < thresh_low_motion &&
        rc->frames_since_key > 40) ||
       (!cpi->use_svc && rc->avg_frame_qindex[INTER_FRAME] > qp_max_thresh &&
-       rc->frames_since_key > 20)) {
+       rc->frames_since_key > 20) ||
+      (cpi->roi.enabled && cpi->roi.skip[BACKGROUND_SEG_SKIP_ID] &&
+       rc->frames_since_key > FRAMES_NO_SKIPPING_AFTER_KEY)) {
     cr->apply_cyclic_refresh = 0;
     return;
   }
@@ -511,7 +518,8 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) {
     cr->rate_ratio_qdelta = 3.0;
   } else {
     cr->rate_ratio_qdelta = 2.0;
-    if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium) {
+    if (cr->content_mode && cpi->noise_estimate.enabled &&
+        cpi->noise_estimate.level >= kMedium) {
       // Reduce the delta-qp if the estimated source noise is above threshold.
       cr->rate_ratio_qdelta = 1.7;
       cr->rate_boost_fac = 13;
@@ -528,7 +536,7 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) {
     cr->percent_refresh = (cr->skip_flat_static_blocks) ? 5 : 10;
     // Increase the amount of refresh on scene change that is encoded at max Q,
     // increase for a few cycles of the refresh period (~100 / percent_refresh).
-    if (cr->counter_encode_maxq_scene_change < 30)
+    if (cr->content_mode && cr->counter_encode_maxq_scene_change < 30)
       cr->percent_refresh = (cr->skip_flat_static_blocks) ? 10 : 15;
     cr->rate_ratio_qdelta = 2.0;
     cr->rate_boost_fac = 10;
@@ -550,7 +558,7 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) {
     cr->percent_refresh = 10;
     cr->rate_ratio_qdelta = 1.5;
     cr->rate_boost_fac = 10;
-    if (cpi->refresh_golden_frame == 1) {
+    if (cpi->refresh_golden_frame == 1 && !cpi->use_svc) {
       cr->percent_refresh = 0;
       cr->rate_ratio_qdelta = 1.0;
     }
@@ -575,6 +583,12 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) {
         (double)(cr->actual_num_seg1_blocks + cr->actual_num_seg2_blocks) /
         num8x8bl;
   cr->weight_segment = weight_segment;
+  if (cr->content_mode == 0) {
+    cr->actual_num_seg1_blocks =
+        cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100;
+    cr->actual_num_seg2_blocks = 0;
+    cr->weight_segment = (double)(cr->actual_num_seg1_blocks) / num8x8bl;
+  }
 }
 
 // Setup cyclic background refresh: set delta q and segmentation map.
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.h b/vp9/encoder/vp9_aq_cyclicrefresh.h
index b6d7fdeae7..c74cee4743 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.h
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.h
@@ -70,6 +70,7 @@ struct CYCLIC_REFRESH {
   int apply_cyclic_refresh;
   int counter_encode_maxq_scene_change;
   int skip_flat_static_blocks;
+  int content_mode;
 };
 
 struct VP9_COMP;
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 3eff4ce830..ca56d14aa1 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -134,9 +134,9 @@ static void pack_mb_tokens(vpx_writer *w, TOKENEXTRA **tp,
   const TOKENEXTRA *p;
   const vp9_extra_bit *const extra_bits =
 #if CONFIG_VP9_HIGHBITDEPTH
-      (bit_depth == VPX_BITS_12)
-          ? vp9_extra_bits_high12
-          : (bit_depth == VPX_BITS_10) ? vp9_extra_bits_high10 : vp9_extra_bits;
+      (bit_depth == VPX_BITS_12)   ? vp9_extra_bits_high12
+      : (bit_depth == VPX_BITS_10) ? vp9_extra_bits_high10
+                                   : vp9_extra_bits;
 #else
       vp9_extra_bits;
   (void)bit_depth;
@@ -169,8 +169,8 @@ static void pack_mb_tokens(vpx_writer *w, TOKENEXTRA **tp,
         vpx_write_bit(w, p->extra & 1);
       } else {  // t >= TWO_TOKEN && t < EOB_TOKEN
         const struct vp9_token *const a = &vp9_coef_encodings[t];
-        const int v = a->value;
-        const int n = a->len;
+        int v = a->value;
+        int n = a->len;
         const int e = p->extra;
         vpx_write(w, 1, context_tree[2]);
         vp9_write_tree(w, vp9_coef_con_tree,
@@ -179,8 +179,8 @@ static void pack_mb_tokens(vpx_writer *w, TOKENEXTRA **tp,
         if (t >= CATEGORY1_TOKEN) {
           const vp9_extra_bit *const b = &extra_bits[t];
           const unsigned char *pb = b->prob;
-          int v = e >> 1;
-          int n = b->len;  // number of bits in v, assumed nonzero
+          v = e >> 1;
+          n = b->len;  // number of bits in v, assumed nonzero
           do {
             const int bb = (v >> --n) & 1;
             vpx_write(w, bb, *pb++);
@@ -236,11 +236,11 @@ static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *const xd,
   }
 }
 
-static void pack_inter_mode_mvs(
-    VP9_COMP *cpi, const MACROBLOCKD *const xd,
-    const MB_MODE_INFO_EXT *const mbmi_ext, vpx_writer *w,
-    unsigned int *const max_mv_magnitude,
-    int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE]) {
+static void pack_inter_mode_mvs(VP9_COMP *cpi, const MACROBLOCKD *const xd,
+                                const MB_MODE_INFO_EXT *const mbmi_ext,
+                                vpx_writer *w,
+                                unsigned int *const max_mv_magnitude,
+                                int interp_filter_selected[][SWITCHABLE]) {
   VP9_COMMON *const cm = &cpi->common;
   const nmv_context *nmvc = &cm->fc->nmvc;
   const struct segmentation *const seg = &cm->seg;
@@ -373,11 +373,12 @@ static void write_mb_modes_kf(const VP9_COMMON *cm, const MACROBLOCKD *xd,
   write_intra_mode(w, mi->uv_mode, vp9_kf_uv_mode_prob[mi->mode]);
 }
 
-static void write_modes_b(
-    VP9_COMP *cpi, MACROBLOCKD *const xd, const TileInfo *const tile,
-    vpx_writer *w, TOKENEXTRA **tok, const TOKENEXTRA *const tok_end,
-    int mi_row, int mi_col, unsigned int *const max_mv_magnitude,
-    int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE]) {
+static void write_modes_b(VP9_COMP *cpi, MACROBLOCKD *const xd,
+                          const TileInfo *const tile, vpx_writer *w,
+                          TOKENEXTRA **tok, const TOKENEXTRA *const tok_end,
+                          int mi_row, int mi_col,
+                          unsigned int *const max_mv_magnitude,
+                          int interp_filter_selected[][SWITCHABLE]) {
   const VP9_COMMON *const cm = &cpi->common;
   const MB_MODE_INFO_EXT *const mbmi_ext =
       cpi->td.mb.mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
@@ -422,12 +423,12 @@ static void write_partition(const VP9_COMMON *const cm,
   }
 }
 
-static void write_modes_sb(
-    VP9_COMP *cpi, MACROBLOCKD *const xd, const TileInfo *const tile,
-    vpx_writer *w, TOKENEXTRA **tok, const TOKENEXTRA *const tok_end,
-    int mi_row, int mi_col, BLOCK_SIZE bsize,
-    unsigned int *const max_mv_magnitude,
-    int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE]) {
+static void write_modes_sb(VP9_COMP *cpi, MACROBLOCKD *const xd,
+                           const TileInfo *const tile, vpx_writer *w,
+                           TOKENEXTRA **tok, const TOKENEXTRA *const tok_end,
+                           int mi_row, int mi_col, BLOCK_SIZE bsize,
+                           unsigned int *const max_mv_magnitude,
+                           int interp_filter_selected[][SWITCHABLE]) {
   const VP9_COMMON *const cm = &cpi->common;
   const int bsl = b_width_log2_lookup[bsize];
   const int bs = (1 << bsl) / 4;
@@ -485,11 +486,10 @@ static void write_modes_sb(
     update_partition_context(xd, mi_row, mi_col, subsize, bsize);
 }
 
-static void write_modes(
-    VP9_COMP *cpi, MACROBLOCKD *const xd, const TileInfo *const tile,
-    vpx_writer *w, int tile_row, int tile_col,
-    unsigned int *const max_mv_magnitude,
-    int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE]) {
+static void write_modes(VP9_COMP *cpi, MACROBLOCKD *const xd,
+                        const TileInfo *const tile, vpx_writer *w, int tile_row,
+                        int tile_col, unsigned int *const max_mv_magnitude,
+                        int interp_filter_selected[][SWITCHABLE]) {
   const VP9_COMMON *const cm = &cpi->common;
   int mi_row, mi_col, tile_sb_row;
   TOKENEXTRA *tok = NULL;
@@ -554,7 +554,7 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi,
   switch (cpi->sf.use_fast_coef_updates) {
     case TWO_LOOP: {
       /* dry run to see if there is any update at all needed */
-      int savings = 0;
+      int64_t savings = 0;
       int update[2] = { 0, 0 };
       for (i = 0; i < PLANE_TYPES; ++i) {
         for (j = 0; j < REF_TYPES; ++j) {
@@ -563,7 +563,7 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi,
               for (t = 0; t < entropy_nodes_update; ++t) {
                 vpx_prob newp = new_coef_probs[i][j][k][l][t];
                 const vpx_prob oldp = old_coef_probs[i][j][k][l][t];
-                int s;
+                int64_t s;
                 int u = 0;
                 if (t == PIVOT_NODE)
                   s = vp9_prob_diff_update_savings_search_model(
@@ -599,8 +599,7 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi,
               for (t = 0; t < entropy_nodes_update; ++t) {
                 vpx_prob newp = new_coef_probs[i][j][k][l][t];
                 vpx_prob *oldp = old_coef_probs[i][j][k][l] + t;
-                const vpx_prob upd = DIFF_UPDATE_PROB;
-                int s;
+                int64_t s;
                 int u = 0;
                 if (t == PIVOT_NODE)
                   s = vp9_prob_diff_update_savings_search_model(
@@ -636,7 +635,7 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi,
               for (t = 0; t < entropy_nodes_update; ++t) {
                 vpx_prob newp = new_coef_probs[i][j][k][l][t];
                 vpx_prob *oldp = old_coef_probs[i][j][k][l] + t;
-                int s;
+                int64_t s;
                 int u = 0;
 
                 if (t == PIVOT_NODE) {
@@ -963,21 +962,20 @@ void vp9_bitstream_encode_tiles_buffer_dealloc(VP9_COMP *const cpi) {
   }
 }
 
-static int encode_tiles_buffer_alloc(VP9_COMP *const cpi) {
+static void encode_tiles_buffer_alloc(VP9_COMP *const cpi) {
+  VP9_COMMON *const cm = &cpi->common;
   int i;
   const size_t worker_data_size =
       cpi->num_workers * sizeof(*cpi->vp9_bitstream_worker_data);
-  cpi->vp9_bitstream_worker_data = vpx_memalign(16, worker_data_size);
+  CHECK_MEM_ERROR(&cm->error, cpi->vp9_bitstream_worker_data,
+                  vpx_memalign(16, worker_data_size));
   memset(cpi->vp9_bitstream_worker_data, 0, worker_data_size);
-  if (!cpi->vp9_bitstream_worker_data) return 1;
   for (i = 1; i < cpi->num_workers; ++i) {
     cpi->vp9_bitstream_worker_data[i].dest_size =
         cpi->oxcf.width * cpi->oxcf.height;
-    cpi->vp9_bitstream_worker_data[i].dest =
-        vpx_malloc(cpi->vp9_bitstream_worker_data[i].dest_size);
-    if (!cpi->vp9_bitstream_worker_data[i].dest) return 1;
+    CHECK_MEM_ERROR(&cm->error, cpi->vp9_bitstream_worker_data[i].dest,
+                    vpx_malloc(cpi->vp9_bitstream_worker_data[i].dest_size));
   }
-  return 0;
 }
 
 static size_t encode_tiles_mt(VP9_COMP *cpi, uint8_t *data_ptr) {
@@ -992,7 +990,7 @@ static size_t encode_tiles_mt(VP9_COMP *cpi, uint8_t *data_ptr) {
       cpi->vp9_bitstream_worker_data[1].dest_size >
           (cpi->oxcf.width * cpi->oxcf.height)) {
     vp9_bitstream_encode_tiles_buffer_dealloc(cpi);
-    if (encode_tiles_buffer_alloc(cpi)) return 0;
+    encode_tiles_buffer_alloc(cpi);
   }
 
   while (tile_col < tile_cols) {
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 37a4605ad8..7fa00cd194 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -13,6 +13,7 @@
 
 #include "vpx_util/vpx_thread.h"
 
+#include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_entropymv.h"
 #include "vp9/common/vp9_entropy.h"
 
@@ -24,7 +25,7 @@ typedef struct {
   unsigned int sse;
   int sum;
   unsigned int var;
-} diff;
+} Diff;
 
 struct macroblock_plane {
   DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]);
@@ -33,8 +34,8 @@ struct macroblock_plane {
   uint16_t *eobs;
   struct buf_2d src;
 
-  // Quantizer setings
-  DECLARE_ALIGNED(16, int16_t, round_fp[8]);
+  // Quantizer settings
+  int16_t *round_fp;
   int16_t *quant_fp;
   int16_t *quant;
   int16_t *quant_shift;
@@ -78,16 +79,16 @@ struct macroblock {
   int skip_recode;
   int skip_optimize;
   int q_index;
-  int block_qcoeff_opt;
+  double log_block_src_var;
   int block_tx_domain;
 
   // The equivalent error at the current rdmult of one whole bit (not one
   // bitcost unit).
   int errorperbit;
-  // The equivalend SAD error of one (whole) bit at the current quantizer
+  // The equivalent SAD error of one (whole) bit at the current quantizer
   // for large blocks.
   int sadperbit16;
-  // The equivalend SAD error of one (whole) bit at the current quantizer
+  // The equivalent SAD error of one (whole) bit at the current quantizer
   // for sub-8x8 blocks.
   int sadperbit4;
   int rddiv;
@@ -127,7 +128,7 @@ struct macroblock {
   // from extending outside the UMV borders
   MvLimits mv_limits;
 
-  // Notes transform blocks where no coefficents are coded.
+  // Notes transform blocks where no coefficients are coded.
   // Set during mode selection. Read during block encoding.
   uint8_t zcoeff_blk[TX_SIZES][256];
 
@@ -157,6 +158,9 @@ struct macroblock {
   // skip forward transform and quantization
   uint8_t skip_txfm[MAX_MB_PLANE << 2];
 #define SKIP_TXFM_NONE 0
+// TODO(chengchen): consider remove SKIP_TXFM_AC_DC from vp9 completely
+// since it increases risks of bad perceptual quality.
+// https://crbug.com/webm/1729
 #define SKIP_TXFM_AC_DC 1
 #define SKIP_TXFM_AC_ONLY 2
 
diff --git a/vp9/encoder/vp9_context_tree.c b/vp9/encoder/vp9_context_tree.c
index b74b9027ca..42073f756c 100644
--- a/vp9/encoder/vp9_context_tree.c
+++ b/vp9/encoder/vp9_context_tree.c
@@ -25,16 +25,17 @@ static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk,
   int i, k;
   ctx->num_4x4_blk = num_blk;
 
-  CHECK_MEM_ERROR(cm, ctx->zcoeff_blk, vpx_calloc(num_blk, sizeof(uint8_t)));
+  CHECK_MEM_ERROR(&cm->error, ctx->zcoeff_blk,
+                  vpx_calloc(num_blk, sizeof(uint8_t)));
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     for (k = 0; k < 3; ++k) {
-      CHECK_MEM_ERROR(cm, ctx->coeff[i][k],
+      CHECK_MEM_ERROR(&cm->error, ctx->coeff[i][k],
                       vpx_memalign(32, num_pix * sizeof(*ctx->coeff[i][k])));
-      CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k],
+      CHECK_MEM_ERROR(&cm->error, ctx->qcoeff[i][k],
                       vpx_memalign(32, num_pix * sizeof(*ctx->qcoeff[i][k])));
-      CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k],
+      CHECK_MEM_ERROR(&cm->error, ctx->dqcoeff[i][k],
                       vpx_memalign(32, num_pix * sizeof(*ctx->dqcoeff[i][k])));
-      CHECK_MEM_ERROR(cm, ctx->eobs[i][k],
+      CHECK_MEM_ERROR(&cm->error, ctx->eobs[i][k],
                       vpx_memalign(32, num_blk * sizeof(*ctx->eobs[i][k])));
       ctx->coeff_pbuf[i][k] = ctx->coeff[i][k];
       ctx->qcoeff_pbuf[i][k] = ctx->qcoeff[i][k];
@@ -100,10 +101,10 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) {
   int nodes;
 
   vpx_free(td->leaf_tree);
-  CHECK_MEM_ERROR(cm, td->leaf_tree,
+  CHECK_MEM_ERROR(&cm->error, td->leaf_tree,
                   vpx_calloc(leaf_nodes, sizeof(*td->leaf_tree)));
   vpx_free(td->pc_tree);
-  CHECK_MEM_ERROR(cm, td->pc_tree,
+  CHECK_MEM_ERROR(&cm->error, td->pc_tree,
                   vpx_calloc(tree_nodes, sizeof(*td->pc_tree)));
 
   this_pc = &td->pc_tree[0];
diff --git a/vp9/encoder/vp9_cost.h b/vp9/encoder/vp9_cost.h
index 638d72a916..ee0033fa31 100644
--- a/vp9/encoder/vp9_cost.h
+++ b/vp9/encoder/vp9_cost.h
@@ -29,9 +29,8 @@ extern const uint16_t vp9_prob_cost[256];
 
 #define vp9_cost_bit(prob, bit) vp9_cost_zero((bit) ? 256 - (prob) : (prob))
 
-static INLINE unsigned int cost_branch256(const unsigned int ct[2],
-                                          vpx_prob p) {
-  return ct[0] * vp9_cost_zero(p) + ct[1] * vp9_cost_one(p);
+static INLINE uint64_t cost_branch256(const unsigned int ct[2], vpx_prob p) {
+  return (uint64_t)ct[0] * vp9_cost_zero(p) + (uint64_t)ct[1] * vp9_cost_one(p);
 }
 
 static INLINE int treed_cost(vpx_tree tree, const vpx_prob *probs, int bits,
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index 2885223b59..e5dffa90a8 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -233,7 +233,7 @@ static VP9_DENOISER_DECISION perform_motion_compensation(
         frame == ALTREF_FRAME ||
         (frame == GOLDEN_FRAME && use_gf_temporal_ref) ||
         (frame != LAST_FRAME &&
-         ((ctx->zeromv_lastref_sse<(5 * ctx->zeromv_sse)>> 2) ||
+         ((ctx->zeromv_lastref_sse < (5 * ctx->zeromv_sse) >> 2) ||
           denoiser->denoising_level >= kDenHigh))) {
       frame = LAST_FRAME;
       ctx->newmv_sse = ctx->zeromv_lastref_sse;
@@ -319,7 +319,7 @@ static VP9_DENOISER_DECISION perform_motion_compensation(
   filter_mbd->plane[2].dst.stride =
       denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride;
 
-  set_ref_ptrs(cm, filter_mbd, saved_frame, NONE);
+  set_ref_ptrs(cm, filter_mbd, saved_frame, NO_REF_FRAME);
   vp9_build_inter_predictors_sby(filter_mbd, mi_row, mi_col, bs);
 
   // Restore everything to its original state
@@ -387,7 +387,7 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col,
           consec_zeromv = VPXMIN(cpi->consec_zero_mv[bl_index], consec_zeromv);
           // No need to keep checking 8x8 blocks if any of the sub-blocks
           // has small consec_zeromv (since threshold for no_skin based on
-          // zero/small motion in skin detection is high, i.e, > 4).
+          // zero/small motion in skin detection is high, i.e., > 4).
           if (consec_zeromv < 4) {
             i = ymis;
             break;
@@ -634,11 +634,11 @@ int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser,
   denoiser->num_ref_frames = use_svc ? SVC_REF_FRAMES : NONSVC_REF_FRAMES;
   init_num_ref_frames = use_svc ? MAX_REF_FRAMES : NONSVC_REF_FRAMES;
   denoiser->num_layers = num_layers;
-  CHECK_MEM_ERROR(cm, denoiser->running_avg_y,
+  CHECK_MEM_ERROR(&cm->error, denoiser->running_avg_y,
                   vpx_calloc(denoiser->num_ref_frames * num_layers,
                              sizeof(denoiser->running_avg_y[0])));
   CHECK_MEM_ERROR(
-      cm, denoiser->mc_running_avg_y,
+      &cm->error, denoiser->mc_running_avg_y,
       vpx_calloc(num_layers, sizeof(denoiser->mc_running_avg_y[0])));
 
   for (layer = 0; layer < num_layers; ++layer) {
@@ -764,8 +764,9 @@ int64_t vp9_scale_acskip_thresh(int64_t threshold,
                                 VP9_DENOISER_LEVEL noise_level, int abs_sumdiff,
                                 int temporal_layer_id) {
   if (noise_level >= kDenLow && abs_sumdiff < 5)
-    return threshold *=
-           (noise_level == kDenLow) ? 2 : (temporal_layer_id == 2) ? 10 : 6;
+    return threshold *= (noise_level == kDenLow)   ? 2
+                        : (temporal_layer_id == 2) ? 10
+                                                   : 6;
   else
     return threshold;
 }
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index dcd6476581..46291f4868 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -48,6 +48,7 @@
 #include "vp9/encoder/vp9_encodeframe.h"
 #include "vp9/encoder/vp9_encodemb.h"
 #include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_ethread.h"
 #include "vp9/encoder/vp9_extend.h"
 #include "vp9/encoder/vp9_multi_thread.h"
@@ -159,37 +160,6 @@ unsigned int vp9_high_get_sby_perpixel_variance(VP9_COMP *cpi,
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-#if !CONFIG_REALTIME_ONLY
-static unsigned int get_sby_perpixel_diff_variance(VP9_COMP *cpi,
-                                                   const struct buf_2d *ref,
-                                                   int mi_row, int mi_col,
-                                                   BLOCK_SIZE bs) {
-  unsigned int sse, var;
-  uint8_t *last_y;
-  const YV12_BUFFER_CONFIG *last = get_ref_frame_buffer(cpi, LAST_FRAME);
-
-  assert(last != NULL);
-  last_y =
-      &last->y_buffer[mi_row * MI_SIZE * last->y_stride + mi_col * MI_SIZE];
-  var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, last_y, last->y_stride, &sse);
-  return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
-}
-
-static BLOCK_SIZE get_rd_var_based_fixed_partition(VP9_COMP *cpi, MACROBLOCK *x,
-                                                   int mi_row, int mi_col) {
-  unsigned int var = get_sby_perpixel_diff_variance(
-      cpi, &x->plane[0].src, mi_row, mi_col, BLOCK_64X64);
-  if (var < 8)
-    return BLOCK_64X64;
-  else if (var < 128)
-    return BLOCK_32X32;
-  else if (var < 2048)
-    return BLOCK_16X16;
-  else
-    return BLOCK_8X8;
-}
-#endif  // !CONFIG_REALTIME_ONLY
-
 static void set_segment_index(VP9_COMP *cpi, MACROBLOCK *const x, int mi_row,
                               int mi_col, BLOCK_SIZE bsize, int segment_index) {
   VP9_COMMON *const cm = &cpi->common;
@@ -248,8 +218,8 @@ static void set_segment_index(VP9_COMP *cpi, MACROBLOCK *const x, int mi_row,
       break;
   }
 
-  // Set segment index from ROI map if it's enabled.
-  if (cpi->roi.enabled)
+  // Set segment index if ROI map or active_map is enabled.
+  if (cpi->roi.enabled || cpi->active_map.enabled)
     mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
 
   vp9_init_plane_quantizers(cpi, x);
@@ -380,17 +350,17 @@ typedef struct {
   int32_t sum_error;
   int log2_count;
   int variance;
-} var;
+} Var;
 
 typedef struct {
-  var none;
-  var horz[2];
-  var vert[2];
+  Var none;
+  Var horz[2];
+  Var vert[2];
 } partition_variance;
 
 typedef struct {
   partition_variance part_variances;
-  var split[4];
+  Var split[4];
 } v4x4;
 
 typedef struct {
@@ -415,7 +385,7 @@ typedef struct {
 
 typedef struct {
   partition_variance *part_variances;
-  var *split[4];
+  Var *split[4];
 } variance_node;
 
 typedef enum {
@@ -467,13 +437,13 @@ static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) {
 }
 
 // Set variance values given sum square error, sum error, count.
-static void fill_variance(uint32_t s2, int32_t s, int c, var *v) {
+static void fill_variance(uint32_t s2, int32_t s, int c, Var *v) {
   v->sum_square_error = s2;
   v->sum_error = s;
   v->log2_count = c;
 }
 
-static void get_variance(var *v) {
+static void get_variance(Var *v) {
   v->variance =
       (int)(256 * (v->sum_square_error -
                    (uint32_t)(((int64_t)v->sum_error * v->sum_error) >>
@@ -481,7 +451,7 @@ static void get_variance(var *v) {
             v->log2_count);
 }
 
-static void sum_2_variances(const var *a, const var *b, var *r) {
+static void sum_2_variances(const Var *a, const Var *b, Var *r) {
   assert(a->log2_count == b->log2_count);
   fill_variance(a->sum_square_error + b->sum_square_error,
                 a->sum_error + b->sum_error, a->log2_count + 1, r);
@@ -815,8 +785,8 @@ static void fill_variance_8x8avg(const uint8_t *s, int sp, const uint8_t *d,
 
 // Check if most of the superblock is skin content, and if so, force split to
 // 32x32, and set x->sb_is_skin for use in mode selection.
-static int skin_sb_split(VP9_COMP *cpi, MACROBLOCK *x, const int low_res,
-                         int mi_row, int mi_col, int *force_split) {
+static int skin_sb_split(VP9_COMP *cpi, const int low_res, int mi_row,
+                         int mi_col, int *force_split) {
   VP9_COMMON *const cm = &cpi->common;
 #if CONFIG_VP9_HIGHBITDEPTH
   if (cm->use_highbitdepth) return 0;
@@ -828,11 +798,6 @@ static int skin_sb_split(VP9_COMP *cpi, MACROBLOCK *x, const int low_res,
                    mi_row + 8 < cm->mi_rows)) {
     int num_16x16_skin = 0;
     int num_16x16_nonskin = 0;
-    uint8_t *ysignal = x->plane[0].src.buf;
-    uint8_t *usignal = x->plane[1].src.buf;
-    uint8_t *vsignal = x->plane[2].src.buf;
-    int sp = x->plane[0].src.stride;
-    int spuv = x->plane[1].src.stride;
     const int block_index = mi_row * cm->mi_cols + mi_col;
     const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
     const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64];
@@ -851,13 +816,7 @@ static int skin_sb_split(VP9_COMP *cpi, MACROBLOCK *x, const int low_res,
           i = ymis;
           break;
         }
-        ysignal += 16;
-        usignal += 8;
-        vsignal += 8;
       }
-      ysignal += (sp << 4) - 64;
-      usignal += (spuv << 3) - 32;
-      vsignal += (spuv << 3) - 32;
     }
     if (num_16x16_skin > 12) {
       *force_split = 1;
@@ -1341,8 +1300,15 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
   // the reference (base layer frame) is key frame (i.e., is_key_frame == 1).
   int is_key_frame =
       (frame_is_intra_only(cm) ||
-       (is_one_pass_cbr_svc(cpi) &&
+       (is_one_pass_svc(cpi) &&
         cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame));
+
+  if (!is_key_frame) {
+    if (cm->frame_refs[LAST_FRAME - 1].sf.x_scale_fp == REF_INVALID_SCALE ||
+        cm->frame_refs[LAST_FRAME - 1].sf.y_scale_fp == REF_INVALID_SCALE)
+      is_key_frame = 1;
+  }
+
   // Always use 4x4 partition for key frame.
   const int use_4x4_partition = frame_is_intra_only(cm);
   const int low_res = (cm->width <= 352 && cm->height <= 288);
@@ -1448,7 +1414,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
 
     assert(yv12 != NULL);
 
-    if (!(is_one_pass_cbr_svc(cpi) && cpi->svc.spatial_layer_id) ||
+    if (!(is_one_pass_svc(cpi) && cpi->svc.spatial_layer_id) ||
         cpi->svc.use_gf_temporal_ref_current_layer) {
       // For now, GOLDEN will not be used for non-zero spatial layers, since
       // it may not be a temporal reference.
@@ -1479,7 +1445,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
                            &cm->frame_refs[LAST_FRAME - 1].sf);
       mi->ref_frame[0] = LAST_FRAME;
     }
-    mi->ref_frame[1] = NONE;
+    mi->ref_frame[1] = NO_REF_FRAME;
     mi->sb_type = BLOCK_64X64;
     mi->mv[0].as_int = 0;
     mi->interp_filter = BILINEAR;
@@ -1534,8 +1500,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64);
 
     if (cpi->use_skin_detection)
-      x->sb_is_skin =
-          skin_sb_split(cpi, x, low_res, mi_row, mi_col, force_split);
+      x->sb_is_skin = skin_sb_split(cpi, low_res, mi_row, mi_col, force_split);
 
     d = xd->plane[0].dst.buf;
     dp = xd->plane[0].dst.stride;
@@ -1588,7 +1553,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
   }
 
   if (low_res && threshold_4x4avg < INT64_MAX)
-    CHECK_MEM_ERROR(cm, vt2, vpx_calloc(16, sizeof(*vt2)));
+    CHECK_MEM_ERROR(&cm->error, vt2, vpx_calloc(16, sizeof(*vt2)));
   // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances
   // for splits.
   for (i = 0; i < 4; i++) {
@@ -1749,7 +1714,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
           const int y16_idx = ((j >> 1) << 1);
           // For inter frames: if variance4x4downsample[] == 1 for this 16x16
           // block, then the variance is based on 4x4 down-sampling, so use vt2
-          // in set_vt_partioning(), otherwise use vt.
+          // in set_vt_partitioning(), otherwise use vt.
           v16x16 *vtemp = (!is_key_frame && variance4x4downsample[i2 + j] == 1)
                               ? &vt2[i2 + j]
                               : &vt.split[i].split[j];
@@ -1842,7 +1807,8 @@ static void update_state(VP9_COMP *cpi, ThreadData *td, PICK_MODE_CONTEXT *ctx,
     }
     // Else for cyclic refresh mode update the segment map, set the segment id
     // and then update the quantizer.
-    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+        cpi->cyclic_refresh->content_mode) {
       vp9_cyclic_refresh_update_segment(cpi, xd->mi[0], mi_row, mi_col, bsize,
                                         ctx->rate, ctx->dist, x->skip, p);
     }
@@ -1905,8 +1871,8 @@ static void update_state(VP9_COMP *cpi, ThreadData *td, PICK_MODE_CONTEXT *ctx,
       vp9_update_mv_count(td);
 
       if (cm->interp_filter == SWITCHABLE) {
-        const int ctx = get_pred_context_switchable_interp(xd);
-        ++td->counts->switchable_interp[ctx][xdmi->interp_filter];
+        const int ctx_interp = get_pred_context_switchable_interp(xd);
+        ++td->counts->switchable_interp[ctx_interp][xdmi->interp_filter];
       }
     }
 
@@ -1947,13 +1913,17 @@ void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
 }
 
 static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode,
+                                   INTERP_FILTER interp_filter,
                                    RD_COST *rd_cost, BLOCK_SIZE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MODE_INFO *const mi = xd->mi[0];
   INTERP_FILTER filter_ref;
 
   filter_ref = get_pred_context_switchable_interp(xd);
-  if (filter_ref == SWITCHABLE_FILTERS) filter_ref = EIGHTTAP;
+  if (interp_filter == BILINEAR)
+    filter_ref = BILINEAR;
+  else if (filter_ref == SWITCHABLE_FILTERS)
+    filter_ref = EIGHTTAP;
 
   mi->sb_type = bsize;
   mi->mode = ZEROMV;
@@ -1962,7 +1932,7 @@ static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode,
   mi->skip = 1;
   mi->uv_mode = DC_PRED;
   mi->ref_frame[0] = LAST_FRAME;
-  mi->ref_frame[1] = NONE;
+  mi->ref_frame[1] = NO_REF_FRAME;
   mi->mv[0].as_int = 0;
   mi->interp_filter = filter_ref;
 
@@ -2018,6 +1988,9 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data,
   int64_t best_rd = INT64_MAX;
 
   vpx_clear_system_state();
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, rd_pick_sb_modes_time);
+#endif
 
   // Use the lower precision, but faster, 32x32 fdct for mode selection.
   x->use_lp32x32fdct = 1;
@@ -2056,20 +2029,20 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data,
   // Save rdmult before it might be changed, so it can be restored later.
   orig_rdmult = x->rdmult;
 
-  if ((cpi->sf.tx_domain_thresh > 0.0) || (cpi->sf.quant_opt_thresh > 0.0)) {
+  if ((cpi->sf.tx_domain_thresh > 0.0) ||
+      (cpi->sf.trellis_opt_tx_rd.thresh > 0.0)) {
     double logvar = vp9_log_block_var(cpi, x, bsize);
-    // Check block complexity as part of descision on using pixel or transform
+    // Check block complexity as part of decision on using pixel or transform
     // domain distortion in rd tests.
     x->block_tx_domain = cpi->sf.allow_txfm_domain_distortion &&
                          (logvar >= cpi->sf.tx_domain_thresh);
 
-    // Check block complexity as part of descision on using quantized
-    // coefficient optimisation inside the rd loop.
-    x->block_qcoeff_opt =
-        cpi->sf.allow_quant_coeff_opt && (logvar <= cpi->sf.quant_opt_thresh);
+    // Store block complexity to decide on using quantized coefficient
+    // optimization inside the rd loop.
+    x->log_block_src_var = logvar;
   } else {
     x->block_tx_domain = cpi->sf.allow_txfm_domain_distortion;
-    x->block_qcoeff_opt = cpi->sf.allow_quant_coeff_opt;
+    x->log_block_src_var = 0.0;
   }
 
   set_segment_index(cpi, x, mi_row, mi_col, bsize, 0);
@@ -2085,15 +2058,27 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data,
     vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd);
   } else {
     if (bsize >= BLOCK_8X8) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      start_timing(cpi, vp9_rd_pick_inter_mode_sb_time);
+#endif
       if (segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP))
         vp9_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, rd_cost, bsize,
                                            ctx, best_rd);
       else
         vp9_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost,
                                   bsize, ctx, best_rd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      end_timing(cpi, vp9_rd_pick_inter_mode_sb_time);
+#endif
     } else {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      start_timing(cpi, vp9_rd_pick_inter_mode_sub8x8_time);
+#endif
       vp9_rd_pick_inter_mode_sub8x8(cpi, tile_data, x, mi_row, mi_col, rd_cost,
                                     bsize, ctx, best_rd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      end_timing(cpi, vp9_rd_pick_inter_mode_sub8x8_time);
+#endif
     }
   }
 
@@ -2116,6 +2101,9 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data,
 
   ctx->rate = rd_cost->rate;
   ctx->dist = rd_cost->dist;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, rd_pick_sb_modes_time);
+#endif
 }
 #endif  // !CONFIG_REALTIME_ONLY
 
@@ -2452,16 +2440,16 @@ static void set_source_var_based_partition(VP9_COMP *cpi,
       (row8x8_remaining >= MI_BLOCK_SIZE)) {
     int i, j;
     int index;
-    diff d32[4];
+    Diff d32[4];
     const int offset = (mi_row >> 1) * cm->mb_cols + (mi_col >> 1);
     int is_larger_better = 0;
     int use32x32 = 0;
     unsigned int thr = cpi->source_var_thresh;
 
-    memset(d32, 0, 4 * sizeof(diff));
+    memset(d32, 0, sizeof(d32));
 
     for (i = 0; i < 4; i++) {
-      diff *d16[4];
+      Diff *d16[4];
 
       for (j = 0; j < 4; j++) {
         int b_mi_row = coord_lookup[i * 4 + j].row;
@@ -2537,9 +2525,11 @@ static void update_state_rt(VP9_COMP *cpi, ThreadData *td,
   *(xd->mi[0]) = ctx->mic;
   *(x->mbmi_ext) = ctx->mbmi_ext;
 
-  if (seg->enabled && (cpi->oxcf.aq_mode != NO_AQ || cpi->roi.enabled)) {
+  if (seg->enabled && (cpi->oxcf.aq_mode != NO_AQ || cpi->roi.enabled ||
+                       cpi->active_map.enabled)) {
     // Setting segmentation map for cyclic_refresh.
-    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+        cpi->cyclic_refresh->content_mode) {
       vp9_cyclic_refresh_update_segment(cpi, mi, mi_row, mi_col, bsize,
                                         ctx->rate, ctx->dist, x->skip, p);
     } else {
@@ -2766,10 +2756,10 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td,
       if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
           mi_row + (mi_step >> 1) < cm->mi_rows) {
         RD_COST tmp_rdc;
-        PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0];
+        PICK_MODE_CONTEXT *hctx = &pc_tree->horizontal[0];
         vp9_rd_cost_init(&tmp_rdc);
-        update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
-        encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
+        update_state(cpi, td, hctx, mi_row, mi_col, subsize, 0);
+        encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, hctx);
         pc_tree->horizontal[1].skip_ref_frame_mask = 0;
         rd_pick_sb_modes(cpi, tile_data, x, mi_row + (mi_step >> 1), mi_col,
                          &tmp_rdc, subsize, &pc_tree->horizontal[1], INT_MAX,
@@ -2790,10 +2780,10 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td,
       if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
           mi_col + (mi_step >> 1) < cm->mi_cols) {
         RD_COST tmp_rdc;
-        PICK_MODE_CONTEXT *ctx = &pc_tree->vertical[0];
+        PICK_MODE_CONTEXT *vctx = &pc_tree->vertical[0];
         vp9_rd_cost_init(&tmp_rdc);
-        update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
-        encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
+        update_state(cpi, td, vctx, mi_row, mi_col, subsize, 0);
+        encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, vctx);
         pc_tree->vertical[bsize > BLOCK_8X8].skip_ref_frame_mask = 0;
         rd_pick_sb_modes(
             cpi, tile_data, x, mi_row, mi_col + (mi_step >> 1), &tmp_rdc,
@@ -2865,8 +2855,6 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td,
       int x_idx = (i & 1) * (mi_step >> 1);
       int y_idx = (i >> 1) * (mi_step >> 1);
       RD_COST tmp_rdc;
-      ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
-      PARTITION_CONTEXT sl[8], sa[8];
 
       if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
         continue;
@@ -3072,14 +3060,12 @@ static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd, int mi_row,
   min_size = BLOCK_64X64;
   max_size = BLOCK_4X4;
 
-  if (prev_mi) {
-    for (idy = 0; idy < mi_height; ++idy) {
-      for (idx = 0; idx < mi_width; ++idx) {
-        mi = prev_mi[idy * cm->mi_stride + idx];
-        bs = mi ? mi->sb_type : bsize;
-        min_size = VPXMIN(min_size, bs);
-        max_size = VPXMAX(max_size, bs);
-      }
+  for (idy = 0; idy < mi_height; ++idy) {
+    for (idx = 0; idx < mi_width; ++idx) {
+      mi = prev_mi[idy * cm->mi_stride + idx];
+      bs = mi ? mi->sb_type : bsize;
+      min_size = VPXMIN(min_size, bs);
+      max_size = VPXMAX(max_size, bs);
     }
   }
 
@@ -3119,54 +3105,6 @@ static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
   memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv));
 }
 
-#if CONFIG_FP_MB_STATS
-const int num_16x16_blocks_wide_lookup[BLOCK_SIZES] = { 1, 1, 1, 1, 1, 1, 1,
-                                                        1, 2, 2, 2, 4, 4 };
-const int num_16x16_blocks_high_lookup[BLOCK_SIZES] = { 1, 1, 1, 1, 1, 1, 1,
-                                                        2, 1, 2, 4, 2, 4 };
-const int qindex_skip_threshold_lookup[BLOCK_SIZES] = { 0,   10,  10, 30, 40,
-                                                        40,  60,  80, 80, 90,
-                                                        100, 100, 120 };
-const int qindex_split_threshold_lookup[BLOCK_SIZES] = { 0,  3,  3,  7,  15,
-                                                         15, 30, 40, 40, 60,
-                                                         80, 80, 120 };
-const int complexity_16x16_blocks_threshold[BLOCK_SIZES] = { 1, 1, 1, 1, 1,
-                                                             1, 1, 1, 1, 1,
-                                                             4, 4, 6 };
-
-typedef enum {
-  MV_ZERO = 0,
-  MV_LEFT = 1,
-  MV_UP = 2,
-  MV_RIGHT = 3,
-  MV_DOWN = 4,
-  MV_INVALID
-} MOTION_DIRECTION;
-
-static INLINE MOTION_DIRECTION get_motion_direction_fp(uint8_t fp_byte) {
-  if (fp_byte & FPMB_MOTION_ZERO_MASK) {
-    return MV_ZERO;
-  } else if (fp_byte & FPMB_MOTION_LEFT_MASK) {
-    return MV_LEFT;
-  } else if (fp_byte & FPMB_MOTION_RIGHT_MASK) {
-    return MV_RIGHT;
-  } else if (fp_byte & FPMB_MOTION_UP_MASK) {
-    return MV_UP;
-  } else {
-    return MV_DOWN;
-  }
-}
-
-static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv,
-                                           MOTION_DIRECTION that_mv) {
-  if (this_mv == that_mv) {
-    return 0;
-  } else {
-    return abs(this_mv - that_mv) == 2 ? 2 : 1;
-  }
-}
-#endif
-
 // Calculate prediction based on the given input features and neural net config.
 // Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden
 // layer.
@@ -3273,7 +3211,7 @@ static int ml_pruning_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd,
       left_par = 1;
   }
 
-  if (prev_mi) {
+  if (prev_mi[0]) {
     context_size = prev_mi[0]->sb_type;
     if (context_size < bsize)
       last_par = 2;
@@ -3497,7 +3435,8 @@ static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x,
   const VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MODE_INFO *const mi = xd->mi[0];
-  const YV12_BUFFER_CONFIG *const yv12 = get_ref_frame_buffer(cpi, ref);
+  YV12_BUFFER_CONFIG *yv12;
+  YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, ref);
   const int step_param = 1;
   const MvLimits tmp_mv_limits = x->mv_limits;
   const SEARCH_METHODS search_method = NSTEP;
@@ -3505,13 +3444,23 @@ static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x,
   MV ref_mv_full = { ref_mv.row >> 3, ref_mv.col >> 3 };
   MV best_mv = { 0, 0 };
   int cost_list[5];
+  struct buf_2d backup_pre[MAX_MB_PLANE] = { { 0, 0 } };
+
+  if (scaled_ref_frame) {
+    yv12 = scaled_ref_frame;
+    // As reported in b/311294795, the reference buffer pointer needs to be
+    // saved and restored after the search. Otherwise, it causes problems while
+    // the reference frame scaling happens.
+    for (int i = 0; i < MAX_MB_PLANE; i++) backup_pre[i] = xd->plane[i].pre[0];
+  } else {
+    yv12 = get_ref_frame_buffer(cpi, ref);
+  }
 
   assert(yv12 != NULL);
   if (!yv12) return;
-  vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
-                       &cm->frame_refs[ref - 1].sf);
+  vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, NULL);
   mi->ref_frame[0] = ref;
-  mi->ref_frame[1] = NONE;
+  mi->ref_frame[1] = NO_REF_FRAME;
   mi->sb_type = bsize;
   vp9_set_mv_search_range(&x->mv_limits, &ref_mv);
   vp9_full_pixel_search(cpi, x, bsize, &ref_mv_full, step_param, search_method,
@@ -3522,6 +3471,11 @@ static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x,
   x->mv_limits = tmp_mv_limits;
   mi->mv[0].as_mv = best_mv;
 
+  // Restore reference buffer pointer.
+  if (scaled_ref_frame) {
+    for (int i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_pre[i];
+  }
+
   set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
   xd->plane[0].dst.buf = pred_buf;
   xd->plane[0].dst.stride = 64;
@@ -3532,15 +3486,15 @@ static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x,
 // Features used: QP; spatial block size contexts; variance of prediction
 // residue after simple_motion_search.
 #define FEATURES 12
-static void ml_predict_var_rd_paritioning(const VP9_COMP *const cpi,
-                                          MACROBLOCK *const x,
-                                          PC_TREE *const pc_tree,
-                                          BLOCK_SIZE bsize, int mi_row,
-                                          int mi_col, int *none, int *split) {
+static void ml_predict_var_rd_partitioning(const VP9_COMP *const cpi,
+                                           MACROBLOCK *const x,
+                                           PC_TREE *const pc_tree,
+                                           BLOCK_SIZE bsize, int mi_row,
+                                           int mi_col, int *none, int *split) {
   const VP9_COMMON *const cm = &cpi->common;
   const NN_CONFIG *nn_config = NULL;
+  const MACROBLOCKD *const xd = &x->e_mbd;
 #if CONFIG_VP9_HIGHBITDEPTH
-  MACROBLOCKD *xd = &x->e_mbd;
   DECLARE_ALIGNED(16, uint8_t, pred_buffer[64 * 64 * 2]);
   uint8_t *const pred_buf = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
                                 ? (CONVERT_TO_BYTEPTR(pred_buffer))
@@ -3623,7 +3577,6 @@ static void ml_predict_var_rd_paritioning(const VP9_COMP *const cpi,
       const unsigned int var =
           cpi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse);
       const float factor = (var == 0) ? 1.0f : (1.0f / (float)var);
-      const MACROBLOCKD *const xd = &x->e_mbd;
       const int has_above = !!xd->above_mi;
       const int has_left = !!xd->left_mi;
       const BLOCK_SIZE above_bsize = has_above ? xd->above_mi->sb_type : bsize;
@@ -3773,7 +3726,6 @@ static int get_rdmult_delta(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
   int row, col;
 
   int dr = 0;
-  int count = 0;
   double r0, rk, beta;
 
   TplDepFrame *tpl_frame;
@@ -3797,8 +3749,6 @@ static int get_rdmult_delta(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
 
       intra_cost += this_stats->intra_cost;
       mc_dep_cost += this_stats->mc_dep_cost;
-
-      ++count;
     }
   }
 
@@ -3855,7 +3805,7 @@ static void assign_motion_vector_info(const int block_width_4x4,
       const int col_4x4 = col_start_4x4 + j;
       const int unit_index = row_4x4 * num_unit_cols + col_4x4;
       if (row_4x4 >= num_unit_rows || col_4x4 >= num_unit_cols) continue;
-      if (source_ref_frame[1] == NONE) {
+      if (source_ref_frame[1] == NO_REF_FRAME) {
         assert(source_mv[1]->row == 0 && source_mv[1]->col == 0);
       }
       motion_vector_info[unit_index].ref_frame[0] = source_ref_frame[0];
@@ -4064,11 +4014,6 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
   BLOCK_SIZE min_size = x->min_partition_size;
   BLOCK_SIZE max_size = x->max_partition_size;
 
-#if CONFIG_FP_MB_STATS
-  unsigned int src_diff_var = UINT_MAX;
-  int none_complexity = 0;
-#endif
-
   int partition_none_allowed = !force_horz_split && !force_vert_split;
   int partition_horz_allowed =
       !force_vert_split && yss <= xss && bsize >= BLOCK_8X8;
@@ -4155,65 +4100,6 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
 
   save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
 
-#if CONFIG_FP_MB_STATS
-  if (cpi->use_fp_mb_stats) {
-    set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
-    src_diff_var = get_sby_perpixel_diff_variance(cpi, &x->plane[0].src, mi_row,
-                                                  mi_col, bsize);
-  }
-#endif
-
-#if CONFIG_FP_MB_STATS
-  // Decide whether we shall split directly and skip searching NONE by using
-  // the first pass block statistics
-  if (cpi->use_fp_mb_stats && bsize >= BLOCK_32X32 && do_split &&
-      partition_none_allowed && src_diff_var > 4 &&
-      cm->base_qindex < qindex_split_threshold_lookup[bsize]) {
-    int mb_row = mi_row >> 1;
-    int mb_col = mi_col >> 1;
-    int mb_row_end =
-        VPXMIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows);
-    int mb_col_end =
-        VPXMIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols);
-    int r, c;
-
-    // compute a complexity measure, basically measure inconsistency of motion
-    // vectors obtained from the first pass in the current block
-    for (r = mb_row; r < mb_row_end; r++) {
-      for (c = mb_col; c < mb_col_end; c++) {
-        const int mb_index = r * cm->mb_cols + c;
-
-        MOTION_DIRECTION this_mv;
-        MOTION_DIRECTION right_mv;
-        MOTION_DIRECTION bottom_mv;
-
-        this_mv =
-            get_motion_direction_fp(cpi->twopass.this_frame_mb_stats[mb_index]);
-
-        // to its right
-        if (c != mb_col_end - 1) {
-          right_mv = get_motion_direction_fp(
-              cpi->twopass.this_frame_mb_stats[mb_index + 1]);
-          none_complexity += get_motion_inconsistency(this_mv, right_mv);
-        }
-
-        // to its bottom
-        if (r != mb_row_end - 1) {
-          bottom_mv = get_motion_direction_fp(
-              cpi->twopass.this_frame_mb_stats[mb_index + cm->mb_cols]);
-          none_complexity += get_motion_inconsistency(this_mv, bottom_mv);
-        }
-
-        // do not count its left and top neighbors to avoid double counting
-      }
-    }
-
-    if (none_complexity > complexity_16x16_blocks_threshold[bsize]) {
-      partition_none_allowed = 0;
-    }
-  }
-#endif
-
   pc_tree->partitioning = PARTITION_NONE;
 
   if (cpi->sf.rd_ml_partition.var_pruning && !frame_is_intra_only(cm)) {
@@ -4222,8 +4108,8 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
         mi_row + num_8x8_blocks_high_lookup[bsize] <= cm->mi_rows &&
         mi_col + num_8x8_blocks_wide_lookup[bsize] <= cm->mi_cols;
     if (do_rd_ml_partition_var_pruning) {
-      ml_predict_var_rd_paritioning(cpi, x, pc_tree, bsize, mi_row, mi_col,
-                                    &partition_none_allowed, &do_split);
+      ml_predict_var_rd_partitioning(cpi, x, pc_tree, bsize, mi_row, mi_col,
+                                     &partition_none_allowed, &do_split);
     } else {
       vp9_zero(pc_tree->mv);
     }
@@ -4291,53 +4177,6 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
             }
           }
         }
-
-#if CONFIG_FP_MB_STATS
-        // Check if every 16x16 first pass block statistics has zero
-        // motion and the corresponding first pass residue is small enough.
-        // If that is the case, check the difference variance between the
-        // current frame and the last frame. If the variance is small enough,
-        // stop further splitting in RD optimization
-        if (cpi->use_fp_mb_stats && do_split != 0 &&
-            cm->base_qindex > qindex_skip_threshold_lookup[bsize]) {
-          int mb_row = mi_row >> 1;
-          int mb_col = mi_col >> 1;
-          int mb_row_end =
-              VPXMIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows);
-          int mb_col_end =
-              VPXMIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols);
-          int r, c;
-
-          int skip = 1;
-          for (r = mb_row; r < mb_row_end; r++) {
-            for (c = mb_col; c < mb_col_end; c++) {
-              const int mb_index = r * cm->mb_cols + c;
-              if (!(cpi->twopass.this_frame_mb_stats[mb_index] &
-                    FPMB_MOTION_ZERO_MASK) ||
-                  !(cpi->twopass.this_frame_mb_stats[mb_index] &
-                    FPMB_ERROR_SMALL_MASK)) {
-                skip = 0;
-                break;
-              }
-            }
-            if (skip == 0) {
-              break;
-            }
-          }
-
-          if (skip) {
-            if (src_diff_var == UINT_MAX) {
-              set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
-              src_diff_var = get_sby_perpixel_diff_variance(
-                  cpi, &x->plane[0].src, mi_row, mi_col, bsize);
-            }
-            if (src_diff_var < 8) {
-              do_split = 0;
-              do_rect = 0;
-            }
-          }
-        }
-#endif
       }
     }
     restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
@@ -4519,9 +4358,9 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
 
     if (sum_rdc.rdcost < best_rdc.rdcost && mi_row + mi_step < cm->mi_rows &&
         bsize > BLOCK_8X8) {
-      PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0];
-      update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
-      encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
+      PICK_MODE_CONTEXT *hctx = &pc_tree->horizontal[0];
+      update_state(cpi, td, hctx, mi_row, mi_col, subsize, 0);
+      encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, hctx);
       if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
           partition_none_allowed)
         pc_tree->horizontal[1].pred_interp_filter = pred_interp_filter;
@@ -4596,22 +4435,44 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
     restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
   }
 
+  if (bsize == BLOCK_64X64 && best_rdc.rdcost == INT64_MAX) {
+    vp9_rd_cost_reset(&this_rdc);
+    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, BLOCK_64X64,
+                     ctx, INT_MAX, INT64_MAX);
+    ctx->rdcost = this_rdc.rdcost;
+    vp9_rd_cost_update(partition_mul, x->rddiv, &this_rdc);
+    if (this_rdc.rdcost < best_rdc.rdcost) {
+      best_rdc = this_rdc;
+      should_encode_sb = 1;
+      pc_tree->partitioning = PARTITION_NONE;
+    }
+  }
+
   *rd_cost = best_rdc;
 
   if (should_encode_sb && pc_tree->index != 3) {
     int output_enabled = (bsize == BLOCK_64X64);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, encode_sb_time);
+#endif
     encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize,
               pc_tree);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, encode_sb_time);
+#endif
 #if CONFIG_RATE_CTRL
-    // Store partition, motion vector of the superblock.
-    if (output_enabled) {
-      const int num_unit_rows = get_num_unit_4x4(cpi->frame_info.frame_height);
-      const int num_unit_cols = get_num_unit_4x4(cpi->frame_info.frame_width);
-      store_superblock_info(pc_tree, cm->mi_grid_visible, cm->mi_stride,
-                            num_4x4_blocks_wide_lookup[BLOCK_64X64],
-                            num_unit_rows, num_unit_cols, mi_row << 1,
-                            mi_col << 1, cpi->partition_info,
-                            cpi->motion_vector_info);
+    if (oxcf->use_simple_encode_api) {
+      // Store partition, motion vector of the superblock.
+      if (output_enabled) {
+        const int num_unit_rows =
+            get_num_unit_4x4(cpi->frame_info.frame_height);
+        const int num_unit_cols = get_num_unit_4x4(cpi->frame_info.frame_width);
+        store_superblock_info(pc_tree, cm->mi_grid_visible, cm->mi_stride,
+                              num_4x4_blocks_wide_lookup[BLOCK_64X64],
+                              num_unit_rows, num_unit_cols, mi_row << 1,
+                              mi_col << 1, cpi->partition_info,
+                              cpi->motion_vector_info);
+      }
     }
 #endif  // CONFIG_RATE_CTRL
   }
@@ -4700,13 +4561,6 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td,
       set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
       rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64,
                        &dummy_rate, &dummy_dist, 1, td->pc_root);
-    } else if (cpi->partition_search_skippable_frame) {
-      BLOCK_SIZE bsize;
-      set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
-      bsize = get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col);
-      set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
-      rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64,
-                       &dummy_rate, &dummy_dist, 1, td->pc_root);
     } else if (sf->partition_search_type == VAR_BASED_PARTITION &&
                cm->frame_type != KEY_FRAME) {
       choose_partitioning(cpi, tile_info, x, mi_row, mi_col);
@@ -4732,8 +4586,15 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td,
                                 &x->min_partition_size, &x->max_partition_size);
       }
       td->pc_root->none.rdcost = 0;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      start_timing(cpi, rd_pick_partition_time);
+#endif
       rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, BLOCK_64X64,
                         &dummy_rdc, dummy_rdc, td->pc_root);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      end_timing(cpi, rd_pick_partition_time);
+#endif
     }
     (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, sb_row,
                                     sb_col_in_tile, num_sb_cols);
@@ -4865,6 +4726,8 @@ static void nonrd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data,
 
   set_segment_index(cpi, x, mi_row, mi_col, bsize, 0);
 
+  x->skip_recode = 0;
+
   mi = xd->mi[0];
   mi->sb_type = bsize;
 
@@ -4886,7 +4749,7 @@ static void nonrd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data,
     hybrid_search_svc_baseiskey(cpi, x, rd_cost, bsize, ctx, tile_data, mi_row,
                                 mi_col);
   else if (segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP))
-    set_mode_info_seg_skip(x, cm->tx_mode, rd_cost, bsize);
+    set_mode_info_seg_skip(x, cm->tx_mode, cm->interp_filter, rd_cost, bsize);
   else if (bsize >= BLOCK_8X8) {
     if (cpi->rc.hybrid_intra_scene_change)
       hybrid_search_scene_change(cpi, x, rd_cost, bsize, ctx, tile_data, mi_row,
@@ -4988,9 +4851,9 @@ static void pred_pixel_ready_reset(PC_TREE *pc_tree, BLOCK_SIZE bsize) {
 
 #define FEATURES 6
 #define LABELS 2
-static int ml_predict_var_paritioning(VP9_COMP *cpi, MACROBLOCK *x,
-                                      BLOCK_SIZE bsize, int mi_row,
-                                      int mi_col) {
+static int ml_predict_var_partitioning(VP9_COMP *cpi, MACROBLOCK *x,
+                                       BLOCK_SIZE bsize, int mi_row,
+                                       int mi_col) {
   VP9_COMMON *const cm = &cpi->common;
   const NN_CONFIG *nn_config = NULL;
 
@@ -5122,7 +4985,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td,
     if (partition_none_allowed || do_split) do_rect = 0;
     if (partition_none_allowed && do_split) {
       const int ml_predicted_partition =
-          ml_predict_var_paritioning(cpi, x, bsize, mi_row, mi_col);
+          ml_predict_var_partitioning(cpi, x, bsize, mi_row, mi_col);
       if (ml_predicted_partition == PARTITION_NONE) do_split = 0;
       if (ml_predicted_partition == PARTITION_SPLIT) partition_none_allowed = 0;
     }
@@ -5580,7 +5443,7 @@ static void get_estimated_pred(VP9_COMP *cpi, const TileInfo *const tile,
 
     assert(yv12 != NULL);
 
-    if (!(is_one_pass_cbr_svc(cpi) && cpi->svc.spatial_layer_id) ||
+    if (!(is_one_pass_svc(cpi) && cpi->svc.spatial_layer_id) ||
         cpi->svc.use_gf_temporal_ref_current_layer) {
       // For now, GOLDEN will not be used for non-zero spatial layers, since
       // it may not be a temporal reference.
@@ -5611,7 +5474,7 @@ static void get_estimated_pred(VP9_COMP *cpi, const TileInfo *const tile,
                            &cm->frame_refs[LAST_FRAME - 1].sf);
       mi->ref_frame[0] = LAST_FRAME;
     }
-    mi->ref_frame[1] = NONE;
+    mi->ref_frame[1] = NO_REF_FRAME;
     mi->sb_type = BLOCK_64X64;
     mi->mv[0].as_int = 0;
     mi->interp_filter = BILINEAR;
@@ -5717,16 +5580,6 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td,
     x->arf_frame_usage = 0;
     x->lastgolden_frame_usage = 0;
 
-    if (seg->enabled) {
-      const uint8_t *const map =
-          seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
-      int segment_id = get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
-      seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP);
-      if (seg_skip) {
-        partition_search_type = FIXED_PARTITION;
-      }
-    }
-
     if (cpi->compute_source_sad_onepass && cpi->sf.use_source_sad) {
       int shift = cpi->Source->y_stride * (mi_row << 3) + (mi_col << 3);
       int sb_offset2 = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3);
@@ -5738,6 +5591,38 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td,
         partition_search_type = REFERENCE_PARTITION;
     }
 
+    if (seg->enabled) {
+      const uint8_t *const map =
+          seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
+      int segment_id = get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
+      seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP);
+
+      if (cpi->roi.enabled && cpi->roi.skip[BACKGROUND_SEG_SKIP_ID] &&
+          cpi->rc.frames_since_key > FRAMES_NO_SKIPPING_AFTER_KEY &&
+          x->content_state_sb > kLowSadLowSumdiff) {
+        // For ROI with skip, force segment = 0 (no skip) over whole
+        // superblock to avoid artifacts if temporal change in source_sad is
+        // not 0.
+        int xi, yi;
+        const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
+        const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64];
+        const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
+        const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
+        const int block_index = mi_row * cm->mi_cols + mi_col;
+        set_mode_info_offsets(cm, x, xd, mi_row, mi_col);
+        for (yi = 0; yi < ymis; yi++)
+          for (xi = 0; xi < xmis; xi++) {
+            int map_offset = block_index + yi * cm->mi_cols + xi;
+            cpi->segmentation_map[map_offset] = 0;
+          }
+        set_segment_index(cpi, x, mi_row, mi_col, BLOCK_64X64, 0);
+        seg_skip = 0;
+      }
+      if (seg_skip) {
+        partition_search_type = FIXED_PARTITION;
+      }
+    }
+
     // Set the partition type of the 64X64 block
     switch (partition_search_type) {
       case VAR_BASED_PARTITION:
@@ -5779,7 +5664,7 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td,
         if ((cpi->oxcf.rc_mode == VPX_VBR && cpi->rc.high_source_sad &&
              cpi->oxcf.speed < 6 && !frame_is_intra_only(cm) &&
              (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) {
-          // Use lower max_partition_size for low resoultions.
+          // Use lower max_partition_size for low resolutions.
           if (cm->width <= 352 && cm->height <= 288)
             x->max_partition_size = BLOCK_32X32;
           else
@@ -5821,12 +5706,12 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td,
 }
 // end RTC play code
 
-static INLINE uint32_t variance(const diff *const d) {
+static INLINE uint32_t variance(const Diff *const d) {
   return d->sse - (uint32_t)(((int64_t)d->sum * d->sum) >> 8);
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static INLINE uint32_t variance_highbd(diff *const d) {
+static INLINE uint32_t variance_highbd(Diff *const d) {
   const int64_t var = (int64_t)d->sse - (((int64_t)d->sum * d->sum) >> 8);
   return (var >= 0) ? (uint32_t)var : 0;
 }
@@ -5846,7 +5731,7 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) {
                          ? (cm->MBs * VAR_HIST_LARGE_CUT_OFF / 100)
                          : (cm->MBs * VAR_HIST_SMALL_CUT_OFF / 100);
   DECLARE_ALIGNED(16, int, hist[VAR_HIST_BINS]);
-  diff *var16 = cpi->source_diff_var;
+  Diff *var16 = cpi->source_diff_var;
 
   int sum = 0;
   int i, j;
@@ -5929,8 +5814,8 @@ static void source_var_based_partition_search_method(VP9_COMP *cpi) {
     if (cm->last_width != cm->width || cm->last_height != cm->height) {
       if (cpi->source_diff_var) vpx_free(cpi->source_diff_var);
 
-      CHECK_MEM_ERROR(cm, cpi->source_diff_var,
-                      vpx_calloc(cm->MBs, sizeof(diff)));
+      CHECK_MEM_ERROR(&cm->error, cpi->source_diff_var,
+                      vpx_calloc(cm->MBs, sizeof(cpi->source_diff_var)));
     }
 
     if (!cpi->frames_till_next_var_check)
@@ -5969,7 +5854,7 @@ void vp9_init_tile_data(VP9_COMP *cpi) {
   if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) {
     if (cpi->tile_data != NULL) vpx_free(cpi->tile_data);
     CHECK_MEM_ERROR(
-        cm, cpi->tile_data,
+        &cm->error, cpi->tile_data,
         vpx_malloc(tile_cols * tile_rows * sizeof(*cpi->tile_data)));
     cpi->allocated_tiles = tile_cols * tile_rows;
 
@@ -5978,15 +5863,15 @@ void vp9_init_tile_data(VP9_COMP *cpi) {
         TileDataEnc *tile_data =
             &cpi->tile_data[tile_row * tile_cols + tile_col];
         int i, j;
+        const MV zero_mv = { 0, 0 };
         for (i = 0; i < BLOCK_SIZES; ++i) {
           for (j = 0; j < MAX_MODES; ++j) {
             tile_data->thresh_freq_fact[i][j] = RD_THRESH_INIT_FACT;
-#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
             tile_data->thresh_freq_fact_prev[i][j] = RD_THRESH_INIT_FACT;
-#endif  // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
             tile_data->mode_map[i][j] = j;
           }
         }
+        tile_data->firstpass_top_mv = zero_mv;
 #if CONFIG_MULTITHREAD
         tile_data->row_base_thresh_freq_fact = NULL;
 #endif
@@ -6028,9 +5913,12 @@ void vp9_encode_sb_row(VP9_COMP *cpi, ThreadData *td, int tile_row,
   get_start_tok(cpi, tile_row, tile_col, mi_row, &tok);
   cpi->tplist[tile_row][tile_col][tile_sb_row].start = tok;
 
+#if CONFIG_REALTIME_ONLY
+  assert(cpi->sf.use_nonrd_pick_mode);
+  encode_nonrd_sb_row(cpi, td, this_tile, mi_row, &tok);
+#else
   if (cpi->sf.use_nonrd_pick_mode)
     encode_nonrd_sb_row(cpi, td, this_tile, mi_row, &tok);
-#if !CONFIG_REALTIME_ONLY
   else
     encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok);
 #endif
@@ -6072,20 +5960,6 @@ static void encode_tiles(VP9_COMP *cpi) {
       vp9_encode_tile(cpi, &cpi->td, tile_row, tile_col);
 }
 
-#if CONFIG_FP_MB_STATS
-static int input_fpmb_stats(FIRSTPASS_MB_STATS *firstpass_mb_stats,
-                            VP9_COMMON *cm, uint8_t **this_frame_mb_stats) {
-  uint8_t *mb_stats_in = firstpass_mb_stats->mb_stats_start +
-                         cm->current_video_frame * cm->MBs * sizeof(uint8_t);
-
-  if (mb_stats_in > firstpass_mb_stats->mb_stats_end) return EOF;
-
-  *this_frame_mb_stats = mb_stats_in;
-
-  return 1;
-}
-#endif
-
 static int compare_kmeans_data(const void *a, const void *b) {
   if (((const KMEANS_DATA *)a)->value > ((const KMEANS_DATA *)b)->value) {
     return 1;
@@ -6214,9 +6088,7 @@ static void encode_frame_internal(VP9_COMP *cpi) {
   x->fwd_txfm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
   x->inv_txfm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
-#if CONFIG_CONSISTENT_RECODE
   x->optimize = sf->optimize_coefficients == 1 && cpi->oxcf.pass != 1;
-#endif
   if (xd->lossless) x->optimize = 0;
   x->sharpness = cpi->oxcf.sharpness;
   x->adjust_rdmult_by_segment = (cpi->oxcf.aq_mode == VARIANCE_AQ);
@@ -6285,6 +6157,15 @@ static void encode_frame_internal(VP9_COMP *cpi) {
       cpi->rd.r0 = (double)intra_cost_base / mc_dep_cost_base;
   }
 
+  for (MV_REFERENCE_FRAME ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME;
+       ++ref_frame) {
+    if (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) {
+      if (cm->frame_refs[ref_frame - 1].sf.x_scale_fp == REF_INVALID_SCALE ||
+          cm->frame_refs[ref_frame - 1].sf.y_scale_fp == REF_INVALID_SCALE)
+        cpi->ref_frame_flags &= ~ref_frame_to_flag(ref_frame);
+    }
+  }
+
   // Frame segmentation
   if (cpi->oxcf.aq_mode == PERCEPTUAL_AQ) build_kmeans_segmentation(cpi);
 
@@ -6292,13 +6173,6 @@ static void encode_frame_internal(VP9_COMP *cpi) {
     struct vpx_usec_timer emr_timer;
     vpx_usec_timer_start(&emr_timer);
 
-#if CONFIG_FP_MB_STATS
-    if (cpi->use_fp_mb_stats) {
-      input_fpmb_stats(&cpi->twopass.firstpass_mb_stats, cm,
-                       &cpi->twopass.this_frame_mb_stats);
-    }
-#endif
-
     if (!cpi->row_mt) {
       cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read_dummy;
       cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write_dummy;
@@ -6350,7 +6224,6 @@ static int compute_frame_aq_offset(struct VP9_COMP *cpi) {
 
   int mi_row, mi_col;
   int sum_delta = 0;
-  int map_index = 0;
   int qdelta_index;
   int segment_id;
 
@@ -6360,7 +6233,6 @@ static int compute_frame_aq_offset(struct VP9_COMP *cpi) {
       segment_id = mi_8x8[0]->segment_id;
       qdelta_index = get_segdata(seg, segment_id, SEG_LVL_ALT_Q);
       sum_delta += qdelta_index;
-      map_index++;
     }
     mi_8x8_ptr += cm->mi_stride;
   }
@@ -6368,13 +6240,11 @@ static int compute_frame_aq_offset(struct VP9_COMP *cpi) {
   return sum_delta / (cm->mi_rows * cm->mi_cols);
 }
 
-#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
 static void restore_encode_params(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
-  const int tile_cols = 1 << cm->log2_tile_cols;
-  const int tile_rows = 1 << cm->log2_tile_rows;
-  int tile_col, tile_row;
+  int tile_idx;
   int i, j;
+  TileDataEnc *tile_data;
   RD_OPT *rd_opt = &cpi->rd;
   for (i = 0; i < MAX_REF_FRAMES; i++) {
     for (j = 0; j < REFERENCE_MODES; j++)
@@ -6385,30 +6255,19 @@ static void restore_encode_params(VP9_COMP *cpi) {
       rd_opt->filter_threshes[i][j] = rd_opt->filter_threshes_prev[i][j];
   }
 
-  if (cpi->tile_data != NULL) {
-    for (tile_row = 0; tile_row < tile_rows; ++tile_row)
-      for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
-        TileDataEnc *tile_data =
-            &cpi->tile_data[tile_row * tile_cols + tile_col];
-        for (i = 0; i < BLOCK_SIZES; ++i) {
-          for (j = 0; j < MAX_MODES; ++j) {
-            tile_data->thresh_freq_fact[i][j] =
-                tile_data->thresh_freq_fact_prev[i][j];
-          }
-        }
-      }
+  for (tile_idx = 0; tile_idx < cpi->allocated_tiles; tile_idx++) {
+    assert(cpi->tile_data);
+    tile_data = &cpi->tile_data[tile_idx];
+    vp9_copy(tile_data->thresh_freq_fact, tile_data->thresh_freq_fact_prev);
   }
 
   cm->interp_filter = cpi->sf.default_interp_filter;
 }
-#endif  // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
 
 void vp9_encode_frame(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
 
-#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
   restore_encode_params(cpi);
-#endif
 
 #if CONFIG_MISMATCH_DEBUG
   mismatch_reset_frame(MAX_MB_PLANE);
@@ -6462,7 +6321,13 @@ void vp9_encode_frame(VP9_COMP *cpi) {
     if (cm->interp_filter == SWITCHABLE)
       cm->interp_filter = get_interp_filter(filter_thrs, is_alt_ref);
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, encode_frame_internal_time);
+#endif
     encode_frame_internal(cpi);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, encode_frame_internal_time);
+#endif
 
     for (i = 0; i < REFERENCE_MODES; ++i)
       mode_thrs[i] = (mode_thrs[i] + rdc->comp_pred_diff[i] / cm->MBs) / 2;
@@ -6703,7 +6568,8 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
 
     ++td->counts->tx.tx_totals[mi->tx_size];
     ++td->counts->tx.tx_totals[get_uv_tx_size(mi, &xd->plane[1])];
-    if (cm->seg.enabled && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+    if (cm->seg.enabled && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+        cpi->cyclic_refresh->content_mode)
       vp9_cyclic_refresh_update_sb_postencode(cpi, mi, mi_row, mi_col, bsize);
     if (cpi->oxcf.pass == 0 && cpi->svc.temporal_layer_id == 0 &&
         (!cpi->use_svc ||
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 7630a81103..eded9f5c42 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -26,6 +26,7 @@
 #include "vp9/common/vp9_scan.h"
 
 #include "vp9/encoder/vp9_encodemb.h"
+#include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_rd.h"
 #include "vp9/encoder/vp9_tokenize.h"
 
@@ -78,7 +79,7 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
   const int shift = (tx_size == TX_32X32);
   const int16_t *const dequant_ptr = pd->dequant;
   const uint8_t *const band_translate = get_band_translate(tx_size);
-  const scan_order *const so = get_scan(xd, tx_size, plane_type, block);
+  const ScanOrder *const so = get_scan(xd, tx_size, plane_type, block);
   const int16_t *const scan = so->scan;
   const int16_t *const nb = so->neighbors;
   const MODE_INFO *mbmi = xd->mi[0];
@@ -350,7 +351,7 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col,
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &xd->plane[plane];
-  const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
+  const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size];
   tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
   tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
@@ -366,28 +367,24 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col,
     switch (tx_size) {
       case TX_32X32:
         highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
-        vp9_highbd_quantize_fp_32x32(coeff, 1024, x->skip_block, p->round_fp,
-                                     p->quant_fp, qcoeff, dqcoeff, pd->dequant,
-                                     eob, scan_order->scan, scan_order->iscan);
+        vp9_highbd_quantize_fp_32x32(coeff, 1024, p, qcoeff, dqcoeff,
+                                     pd->dequant, eob, scan_order);
         break;
       case TX_16X16:
         vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
-        vp9_highbd_quantize_fp(coeff, 256, x->skip_block, p->round_fp,
-                               p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob,
-                               scan_order->scan, scan_order->iscan);
+        vp9_highbd_quantize_fp(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob,
+                               scan_order);
         break;
       case TX_8X8:
         vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
-        vp9_highbd_quantize_fp(coeff, 64, x->skip_block, p->round_fp,
-                               p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob,
-                               scan_order->scan, scan_order->iscan);
+        vp9_highbd_quantize_fp(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob,
+                               scan_order);
         break;
       default:
         assert(tx_size == TX_4X4);
         x->fwd_txfm4x4(src_diff, coeff, diff_stride);
-        vp9_highbd_quantize_fp(coeff, 16, x->skip_block, p->round_fp,
-                               p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob,
-                               scan_order->scan, scan_order->iscan);
+        vp9_highbd_quantize_fp(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob,
+                               scan_order);
         break;
     }
     return;
@@ -397,29 +394,25 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col,
   switch (tx_size) {
     case TX_32X32:
       fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
-      vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->round_fp,
-                            p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob,
-                            scan_order->scan, scan_order->iscan);
+      vp9_quantize_fp_32x32(coeff, 1024, p, qcoeff, dqcoeff, pd->dequant, eob,
+                            scan_order);
       break;
     case TX_16X16:
       vpx_fdct16x16(src_diff, coeff, diff_stride);
-      vp9_quantize_fp(coeff, 256, x->skip_block, p->round_fp, p->quant_fp,
-                      qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
-                      scan_order->iscan);
+      vp9_quantize_fp(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob,
+                      scan_order);
       break;
     case TX_8X8:
       vpx_fdct8x8(src_diff, coeff, diff_stride);
-      vp9_quantize_fp(coeff, 64, x->skip_block, p->round_fp, p->quant_fp,
-                      qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
-                      scan_order->iscan);
+      vp9_quantize_fp(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob,
+                      scan_order);
 
       break;
     default:
       assert(tx_size == TX_4X4);
       x->fwd_txfm4x4(src_diff, coeff, diff_stride);
-      vp9_quantize_fp(coeff, 16, x->skip_block, p->round_fp, p->quant_fp,
-                      qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
-                      scan_order->iscan);
+      vp9_quantize_fp(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob,
+                      scan_order);
       break;
   }
 }
@@ -444,28 +437,24 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col,
     switch (tx_size) {
       case TX_32X32:
         vpx_highbd_fdct32x32_1(src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_dc_32x32(coeff, x->skip_block, p->round,
-                                     p->quant_fp[0], qcoeff, dqcoeff,
-                                     pd->dequant[0], eob);
+        vpx_highbd_quantize_dc_32x32(coeff, p->round, p->quant_fp[0], qcoeff,
+                                     dqcoeff, pd->dequant[0], eob);
         break;
       case TX_16X16:
         vpx_highbd_fdct16x16_1(src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_dc(coeff, 256, x->skip_block, p->round,
-                               p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0],
-                               eob);
+        vpx_highbd_quantize_dc(coeff, 256, p->round, p->quant_fp[0], qcoeff,
+                               dqcoeff, pd->dequant[0], eob);
         break;
       case TX_8X8:
         vpx_highbd_fdct8x8_1(src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_dc(coeff, 64, x->skip_block, p->round,
-                               p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0],
-                               eob);
+        vpx_highbd_quantize_dc(coeff, 64, p->round, p->quant_fp[0], qcoeff,
+                               dqcoeff, pd->dequant[0], eob);
         break;
       default:
         assert(tx_size == TX_4X4);
         x->fwd_txfm4x4(src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_dc(coeff, 16, x->skip_block, p->round,
-                               p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0],
-                               eob);
+        vpx_highbd_quantize_dc(coeff, 16, p->round, p->quant_fp[0], qcoeff,
+                               dqcoeff, pd->dequant[0], eob);
         break;
     }
     return;
@@ -475,24 +464,24 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col,
   switch (tx_size) {
     case TX_32X32:
       vpx_fdct32x32_1(src_diff, coeff, diff_stride);
-      vpx_quantize_dc_32x32(coeff, x->skip_block, p->round, p->quant_fp[0],
-                            qcoeff, dqcoeff, pd->dequant[0], eob);
+      vpx_quantize_dc_32x32(coeff, p->round, p->quant_fp[0], qcoeff, dqcoeff,
+                            pd->dequant[0], eob);
       break;
     case TX_16X16:
       vpx_fdct16x16_1(src_diff, coeff, diff_stride);
-      vpx_quantize_dc(coeff, 256, x->skip_block, p->round, p->quant_fp[0],
-                      qcoeff, dqcoeff, pd->dequant[0], eob);
+      vpx_quantize_dc(coeff, 256, p->round, p->quant_fp[0], qcoeff, dqcoeff,
+                      pd->dequant[0], eob);
       break;
     case TX_8X8:
       vpx_fdct8x8_1(src_diff, coeff, diff_stride);
-      vpx_quantize_dc(coeff, 64, x->skip_block, p->round, p->quant_fp[0],
-                      qcoeff, dqcoeff, pd->dequant[0], eob);
+      vpx_quantize_dc(coeff, 64, p->round, p->quant_fp[0], qcoeff, dqcoeff,
+                      pd->dequant[0], eob);
       break;
     default:
       assert(tx_size == TX_4X4);
       x->fwd_txfm4x4(src_diff, coeff, diff_stride);
-      vpx_quantize_dc(coeff, 16, x->skip_block, p->round, p->quant_fp[0],
-                      qcoeff, dqcoeff, pd->dequant[0], eob);
+      vpx_quantize_dc(coeff, 16, p->round, p->quant_fp[0], qcoeff, dqcoeff,
+                      pd->dequant[0], eob);
       break;
   }
 }
@@ -502,7 +491,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col,
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &xd->plane[plane];
-  const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
+  const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size];
   tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
   tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
@@ -518,32 +507,24 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col,
     switch (tx_size) {
       case TX_32X32:
         highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
-                                    p->round, p->quant, p->quant_shift, qcoeff,
-                                    dqcoeff, pd->dequant, eob, scan_order->scan,
-                                    scan_order->iscan);
+        vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob,
+                                    scan_order);
         break;
       case TX_16X16:
         vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
-                              p->quant, p->quant_shift, qcoeff, dqcoeff,
-                              pd->dequant, eob, scan_order->scan,
-                              scan_order->iscan);
+        vpx_highbd_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob,
+                              scan_order);
         break;
       case TX_8X8:
         vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
-                              p->quant, p->quant_shift, qcoeff, dqcoeff,
-                              pd->dequant, eob, scan_order->scan,
-                              scan_order->iscan);
+        vpx_highbd_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob,
+                              scan_order);
         break;
       default:
         assert(tx_size == TX_4X4);
         x->fwd_txfm4x4(src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
-                              p->quant, p->quant_shift, qcoeff, dqcoeff,
-                              pd->dequant, eob, scan_order->scan,
-                              scan_order->iscan);
+        vpx_highbd_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob,
+                              scan_order);
         break;
     }
     return;
@@ -553,29 +534,24 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col,
   switch (tx_size) {
     case TX_32X32:
       fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
-      vpx_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
-                           p->quant, p->quant_shift, qcoeff, dqcoeff,
-                           pd->dequant, eob, scan_order->scan,
-                           scan_order->iscan);
+      vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob,
+                           scan_order);
       break;
     case TX_16X16:
       vpx_fdct16x16(src_diff, coeff, diff_stride);
-      vpx_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant,
-                     p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
-                     scan_order->scan, scan_order->iscan);
+      vpx_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob,
+                     scan_order);
       break;
     case TX_8X8:
       vpx_fdct8x8(src_diff, coeff, diff_stride);
-      vpx_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
-                     p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
-                     scan_order->scan, scan_order->iscan);
+      vpx_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob,
+                     scan_order);
       break;
     default:
       assert(tx_size == TX_4X4);
       x->fwd_txfm4x4(src_diff, coeff, diff_stride);
-      vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
-                     p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
-                     scan_order->scan, scan_order->iscan);
+      vpx_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob,
+                     scan_order);
       break;
   }
 }
@@ -771,10 +747,23 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col,
   MODE_INFO *mi = xd->mi[0];
   int plane;
 #if CONFIG_MISMATCH_DEBUG
-  struct encode_b_args arg = { x,         1,      NULL,   NULL,
+  struct encode_b_args arg = { x,
+                               1,     // enable_trellis_opt
+                               0.0,   // trellis_opt_thresh
+                               NULL,  // &sse_calc_done
+                               NULL,  // &sse
+                               NULL,  // above entropy context
+                               NULL,  // left entropy context
                                &mi->skip, mi_row, mi_col, output_enabled };
 #else
-  struct encode_b_args arg = { x, 1, NULL, NULL, &mi->skip };
+  struct encode_b_args arg = { x,
+                               1,     // enable_trellis_opt
+                               0.0,   // trellis_opt_thresh
+                               NULL,  // &sse_calc_done
+                               NULL,  // &sse
+                               NULL,  // above entropy context
+                               NULL,  // left entropy context
+                               &mi->skip };
   (void)mi_row;
   (void)mi_col;
   (void)output_enabled;
@@ -792,9 +781,9 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col,
       const TX_SIZE tx_size = plane ? get_uv_tx_size(mi, pd) : mi->tx_size;
       vp9_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane],
                                ctx.tl[plane]);
-      arg.enable_coeff_opt = 1;
+      arg.enable_trellis_opt = 1;
     } else {
-      arg.enable_coeff_opt = 0;
+      arg.enable_trellis_opt = 0;
     }
     arg.ta = ctx.ta[plane];
     arg.tl = ctx.tl[plane];
@@ -816,7 +805,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
   tran_low_t *coeff = BLOCK_OFFSET(p->coeff, block);
   tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  const scan_order *scan_order;
+  const ScanOrder *scan_order;
   TX_TYPE tx_type = DCT_DCT;
   PREDICTION_MODE mode;
   const int bwl = b_width_log2_lookup[plane_bsize];
@@ -826,17 +815,13 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
   uint16_t *eob = &p->eobs[block];
   const int src_stride = p->src.stride;
   const int dst_stride = pd->dst.stride;
+  int enable_trellis_opt = !x->skip_recode;
   ENTROPY_CONTEXT *a = NULL;
   ENTROPY_CONTEXT *l = NULL;
   int entropy_ctx = 0;
   dst = &pd->dst.buf[4 * (row * dst_stride + col)];
   src = &p->src.buf[4 * (row * src_stride + col)];
   src_diff = &p->src_diff[4 * (row * diff_stride + col)];
-  if (args->enable_coeff_opt) {
-    a = &args->ta[col];
-    l = &args->tl[row];
-    entropy_ctx = combine_entropy_contexts(*a, *l);
-  }
 
   if (tx_size == TX_4X4) {
     tx_type = get_tx_type_4x4(get_plane_type(plane), xd, block);
@@ -860,21 +845,42 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
   // skip block condition should be handled before this is called.
   assert(!x->skip_block);
 
+  if (!x->skip_recode) {
+    const int tx_size_in_pixels = (1 << tx_size) << 2;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      vpx_highbd_subtract_block(tx_size_in_pixels, tx_size_in_pixels, src_diff,
+                                diff_stride, src, src_stride, dst, dst_stride,
+                                xd->bd);
+    } else {
+      vpx_subtract_block(tx_size_in_pixels, tx_size_in_pixels, src_diff,
+                         diff_stride, src, src_stride, dst, dst_stride);
+    }
+#else
+    vpx_subtract_block(tx_size_in_pixels, tx_size_in_pixels, src_diff,
+                       diff_stride, src, src_stride, dst, dst_stride);
+#endif
+    enable_trellis_opt = do_trellis_opt(pd, src_diff, diff_stride, row, col,
+                                        plane_bsize, tx_size, args);
+  }
+
+  if (enable_trellis_opt) {
+    a = &args->ta[col];
+    l = &args->tl[row];
+    entropy_ctx = combine_entropy_contexts(*a, *l);
+  }
+
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
     switch (tx_size) {
       case TX_32X32:
         if (!x->skip_recode) {
-          vpx_highbd_subtract_block(32, 32, src_diff, diff_stride, src,
-                                    src_stride, dst, dst_stride, xd->bd);
           highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
-          vpx_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
-                                      p->round, p->quant, p->quant_shift,
-                                      qcoeff, dqcoeff, pd->dequant, eob,
-                                      scan_order->scan, scan_order->iscan);
+          vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant,
+                                      eob, scan_order);
         }
-        if (args->enable_coeff_opt && !x->skip_recode) {
+        if (enable_trellis_opt) {
           *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
         }
         if (!x->skip_encode && *eob) {
@@ -883,18 +889,14 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
         break;
       case TX_16X16:
         if (!x->skip_recode) {
-          vpx_highbd_subtract_block(16, 16, src_diff, diff_stride, src,
-                                    src_stride, dst, dst_stride, xd->bd);
           if (tx_type == DCT_DCT)
             vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
           else
             vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
-          vpx_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
-                                p->quant, p->quant_shift, qcoeff, dqcoeff,
-                                pd->dequant, eob, scan_order->scan,
-                                scan_order->iscan);
+          vpx_highbd_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant,
+                                eob, scan_order);
         }
-        if (args->enable_coeff_opt && !x->skip_recode) {
+        if (enable_trellis_opt) {
           *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
         }
         if (!x->skip_encode && *eob) {
@@ -904,18 +906,14 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
         break;
       case TX_8X8:
         if (!x->skip_recode) {
-          vpx_highbd_subtract_block(8, 8, src_diff, diff_stride, src,
-                                    src_stride, dst, dst_stride, xd->bd);
           if (tx_type == DCT_DCT)
             vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
           else
             vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
-          vpx_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
-                                p->quant, p->quant_shift, qcoeff, dqcoeff,
-                                pd->dequant, eob, scan_order->scan,
-                                scan_order->iscan);
+          vpx_highbd_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob,
+                                scan_order);
         }
-        if (args->enable_coeff_opt && !x->skip_recode) {
+        if (enable_trellis_opt) {
           *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
         }
         if (!x->skip_encode && *eob) {
@@ -926,18 +924,14 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
       default:
         assert(tx_size == TX_4X4);
         if (!x->skip_recode) {
-          vpx_highbd_subtract_block(4, 4, src_diff, diff_stride, src,
-                                    src_stride, dst, dst_stride, xd->bd);
           if (tx_type != DCT_DCT)
             vp9_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type);
           else
             x->fwd_txfm4x4(src_diff, coeff, diff_stride);
-          vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
-                                p->quant, p->quant_shift, qcoeff, dqcoeff,
-                                pd->dequant, eob, scan_order->scan,
-                                scan_order->iscan);
+          vpx_highbd_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob,
+                                scan_order);
         }
-        if (args->enable_coeff_opt && !x->skip_recode) {
+        if (enable_trellis_opt) {
           *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
         }
         if (!x->skip_encode && *eob) {
@@ -961,15 +955,11 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
   switch (tx_size) {
     case TX_32X32:
       if (!x->skip_recode) {
-        vpx_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst,
-                           dst_stride);
         fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
-        vpx_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
-                             p->quant, p->quant_shift, qcoeff, dqcoeff,
-                             pd->dequant, eob, scan_order->scan,
-                             scan_order->iscan);
+        vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob,
+                             scan_order);
       }
-      if (args->enable_coeff_opt && !x->skip_recode) {
+      if (enable_trellis_opt) {
         *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
       }
       if (!x->skip_encode && *eob)
@@ -977,14 +967,11 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
       break;
     case TX_16X16:
       if (!x->skip_recode) {
-        vpx_subtract_block(16, 16, src_diff, diff_stride, src, src_stride, dst,
-                           dst_stride);
         vp9_fht16x16(src_diff, coeff, diff_stride, tx_type);
-        vpx_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant,
-                       p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
-                       scan_order->scan, scan_order->iscan);
+        vpx_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob,
+                       scan_order);
       }
-      if (args->enable_coeff_opt && !x->skip_recode) {
+      if (enable_trellis_opt) {
         *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
       }
       if (!x->skip_encode && *eob)
@@ -992,14 +979,11 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
       break;
     case TX_8X8:
       if (!x->skip_recode) {
-        vpx_subtract_block(8, 8, src_diff, diff_stride, src, src_stride, dst,
-                           dst_stride);
         vp9_fht8x8(src_diff, coeff, diff_stride, tx_type);
-        vpx_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
-                       p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
-                       scan_order->scan, scan_order->iscan);
+        vpx_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob,
+                       scan_order);
       }
-      if (args->enable_coeff_opt && !x->skip_recode) {
+      if (enable_trellis_opt) {
         *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
       }
       if (!x->skip_encode && *eob)
@@ -1008,17 +992,14 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
     default:
       assert(tx_size == TX_4X4);
       if (!x->skip_recode) {
-        vpx_subtract_block(4, 4, src_diff, diff_stride, src, src_stride, dst,
-                           dst_stride);
         if (tx_type != DCT_DCT)
           vp9_fht4x4(src_diff, coeff, diff_stride, tx_type);
         else
           x->fwd_txfm4x4(src_diff, coeff, diff_stride);
-        vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
-                       p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
-                       scan_order->scan, scan_order->iscan);
+        vpx_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob,
+                       scan_order);
       }
-      if (args->enable_coeff_opt && !x->skip_recode) {
+      if (enable_trellis_opt) {
         *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
       }
       if (!x->skip_encode && *eob) {
@@ -1036,28 +1017,43 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
 }
 
 void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane,
-                                  int enable_optimize_b) {
+                                  int enable_trellis_opt) {
   const MACROBLOCKD *const xd = &x->e_mbd;
   struct optimize_ctx ctx;
 #if CONFIG_MISMATCH_DEBUG
   // TODO(angiebird): make mismatch_debug support intra mode
   struct encode_b_args arg = {
-    x, enable_optimize_b, ctx.ta[plane], ctx.tl[plane], &xd->mi[0]->skip, 0, 0,
-    0
+    x,
+    enable_trellis_opt,
+    0.0,   // trellis_opt_thresh
+    NULL,  // &sse_calc_done
+    NULL,  // &sse
+    ctx.ta[plane],
+    ctx.tl[plane],
+    &xd->mi[0]->skip,
+    0,  // mi_row
+    0,  // mi_col
+    0   // output_enabled
   };
 #else
-  struct encode_b_args arg = { x, enable_optimize_b, ctx.ta[plane],
-                               ctx.tl[plane], &xd->mi[0]->skip };
+  struct encode_b_args arg = { x,
+                               enable_trellis_opt,
+                               0.0,   // trellis_opt_thresh
+                               NULL,  // &sse_calc_done
+                               NULL,  // &sse
+                               ctx.ta[plane],
+                               ctx.tl[plane],
+                               &xd->mi[0]->skip };
 #endif
 
-  if (enable_optimize_b && x->optimize &&
+  if (enable_trellis_opt && x->optimize &&
       (!x->skip_recode || !x->skip_optimize)) {
     const struct macroblockd_plane *const pd = &xd->plane[plane];
     const TX_SIZE tx_size =
         plane ? get_uv_tx_size(xd->mi[0], pd) : xd->mi[0]->tx_size;
     vp9_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane], ctx.tl[plane]);
   } else {
-    arg.enable_coeff_opt = 0;
+    arg.enable_trellis_opt = 0;
   }
 
   vp9_foreach_transformed_block_in_plane(xd, bsize, plane,
diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index 1975ee73ac..1391446bed 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -20,7 +20,10 @@ extern "C" {
 
 struct encode_b_args {
   MACROBLOCK *x;
-  int enable_coeff_opt;
+  int enable_trellis_opt;
+  double trellis_opt_thresh;
+  int *sse_calc_done;
+  int64_t *sse;
   ENTROPY_CONTEXT *ta;
   ENTROPY_CONTEXT *tl;
   int8_t *skip;
@@ -48,7 +51,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
                             BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg);
 
 void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane,
-                                  int enable_optimize_b);
+                                  int enable_trellis_opt);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 20e8d494a5..fd213f1e6b 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -12,37 +12,40 @@
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 
 #include "./vp9_rtcd.h"
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "./vpx_scale_rtcd.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_ext_ratectrl.h"
 #include "vpx_dsp/psnr.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_dsp/vpx_filter.h"
 #if CONFIG_INTERNAL_STATS
 #include "vpx_dsp/ssim.h"
 #endif
+#include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 #include "vpx_ports/system_state.h"
+#include "vpx_ports/vpx_once.h"
 #include "vpx_ports/vpx_timer.h"
 #if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
 #include "vpx_util/vpx_debug_util.h"
 #endif  // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
 
 #include "vp9/common/vp9_alloccommon.h"
+#include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_filter.h"
 #include "vp9/common/vp9_idct.h"
-#if CONFIG_NON_GREEDY_MV
-#include "vp9/common/vp9_mvref_common.h"
-#endif
 #if CONFIG_VP9_POSTPROC
 #include "vp9/common/vp9_postproc.h"
 #endif
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_scale.h"
 #include "vp9/common/vp9_tile_common.h"
-#include "vp9/common/vp9_scan.h"
 
 #if !CONFIG_REALTIME_ONLY
 #include "vp9/encoder/vp9_alt_ref_aq.h"
@@ -80,6 +83,7 @@
 #include "vp9/encoder/vp9_speed_features.h"
 #include "vp9/encoder/vp9_svc_layercontext.h"
 #include "vp9/encoder/vp9_temporal_filter.h"
+#include "vp9/encoder/vp9_tpl_model.h"
 #include "vp9/vp9_cx_iface.h"
 
 #define AM_SEGMENT_ID_INACTIVE 7
@@ -125,13 +129,6 @@ static int is_spatial_denoise_enabled(VP9_COMP *cpi) {
 }
 #endif
 
-#if CONFIG_VP9_HIGHBITDEPTH
-void highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
-                         TX_SIZE tx_size);
-#endif
-void wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
-                  TX_SIZE tx_size);
-
 #if !CONFIG_REALTIME_ONLY
 // compute adaptive threshold for skip recoding
 static int compute_context_model_thresh(const VP9_COMP *const cpi) {
@@ -147,7 +144,7 @@ static int compute_context_model_thresh(const VP9_COMP *const cpi) {
   // frame context probability model is less than a certain threshold.
   // The first component is the most critical part to guarantee adaptivity.
   // Other parameters are estimated based on normal setting of hd resolution
-  // parameters. e.g frame_size = 1920x1080, bitrate = 8000, qindex_factor < 50
+  // parameters. e.g. frame_size = 1920x1080, bitrate = 8000, qindex_factor < 50
   const int thresh =
       ((FRAME_SIZE_FACTOR * frame_size - FRAME_RATE_FACTOR * bitrate) *
        qindex_factor) >>
@@ -501,22 +498,22 @@ static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] = {
   "Too many reference buffers are used."
 };
 
-static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) {
+static INLINE void Scale2Ratio(VPX_SCALING_MODE mode, int *hr, int *hs) {
   switch (mode) {
-    case NORMAL:
+    case VP8E_NORMAL:
       *hr = 1;
       *hs = 1;
       break;
-    case FOURFIVE:
+    case VP8E_FOURFIVE:
       *hr = 4;
       *hs = 5;
       break;
-    case THREEFIVE:
+    case VP8E_THREEFIVE:
       *hr = 3;
       *hs = 5;
       break;
     default:
-      assert(mode == ONETWO);
+      assert(mode == VP8E_ONETWO);
       *hr = 1;
       *hs = 2;
       break;
@@ -585,8 +582,6 @@ static void apply_roi_map(VP9_COMP *cpi) {
   int ref_frame[8];
   int internal_delta_q[MAX_SEGMENTS];
   int i;
-  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
-                                    VP9_ALT_FLAG };
 
   // TODO(jianj): Investigate why ROI not working in speed < 5 or in non
   // realtime mode.
@@ -618,7 +613,7 @@ static void apply_roi_map(VP9_COMP *cpi) {
     }
     if (skip[i] != 0) {
       vp9_enable_segfeature(seg, i, SEG_LVL_SKIP);
-      vp9_set_segdata(seg, i, SEG_LVL_SKIP, skip[i]);
+      vp9_set_segdata(seg, i, SEG_LVL_SKIP, 0);
     }
     if (ref_frame[i] >= 0) {
       int valid_ref = 1;
@@ -627,7 +622,7 @@ static void apply_roi_map(VP9_COMP *cpi) {
         valid_ref = 0;
       // If GOLDEN is selected, make sure it's set as reference.
       if (ref_frame[i] == GOLDEN_FRAME &&
-          !(cpi->ref_frame_flags & flag_list[ref_frame[i]])) {
+          !(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame[i]))) {
         valid_ref = 0;
       }
       // GOLDEN was updated in previous encoded frame, so GOLDEN and LAST are
@@ -654,10 +649,15 @@ static void init_level_info(Vp9LevelInfo *level_info) {
 }
 
 static int check_seg_range(int seg_data[8], int range) {
-  return !(abs(seg_data[0]) > range || abs(seg_data[1]) > range ||
-           abs(seg_data[2]) > range || abs(seg_data[3]) > range ||
-           abs(seg_data[4]) > range || abs(seg_data[5]) > range ||
-           abs(seg_data[6]) > range || abs(seg_data[7]) > range);
+  int i;
+  for (i = 0; i < 8; ++i) {
+    // Note abs() alone can't be used as the behavior of abs(INT_MIN) is
+    // undefined.
+    if (seg_data[i] > range || seg_data[i] < -range) {
+      return 0;
+    }
+  }
+  return 1;
 }
 
 VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) {
@@ -686,9 +686,10 @@ VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) {
   return (i == VP9_LEVELS) ? LEVEL_UNKNOWN : vp9_level_defs[i].level;
 }
 
-int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows,
-                    unsigned int cols, int delta_q[8], int delta_lf[8],
-                    int skip[8], int ref_frame[8]) {
+vpx_codec_err_t vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map,
+                                unsigned int rows, unsigned int cols,
+                                int delta_q[8], int delta_lf[8], int skip[8],
+                                int ref_frame[8]) {
   VP9_COMMON *cm = &cpi->common;
   vpx_roi_map_t *roi = &cpi->roi;
   const int range = 63;
@@ -699,13 +700,13 @@ int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows,
 
   // Check number of rows and columns match
   if (frame_rows != (int)rows || frame_cols != (int)cols) {
-    return -1;
+    return VPX_CODEC_INVALID_PARAM;
   }
 
   if (!check_seg_range(delta_q, range) || !check_seg_range(delta_lf, range) ||
       !check_seg_range(ref_frame, ref_frame_range) ||
       !check_seg_range(skip, skip_range))
-    return -1;
+    return VPX_CODEC_INVALID_PARAM;
 
   // Also disable segmentation if no deltas are specified.
   if (!map ||
@@ -719,14 +720,15 @@ int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows,
         ref_frame[6] == -1 && ref_frame[7] == -1))) {
     vp9_disable_segmentation(&cm->seg);
     cpi->roi.enabled = 0;
-    return 0;
+    return VPX_CODEC_OK;
   }
 
   if (roi->roi_map) {
     vpx_free(roi->roi_map);
     roi->roi_map = NULL;
   }
-  CHECK_MEM_ERROR(cm, roi->roi_map, vpx_malloc(rows * cols));
+  roi->roi_map = vpx_malloc(rows * cols);
+  if (!roi->roi_map) return VPX_CODEC_MEM_ERROR;
 
   // Copy to ROI structure in the compressor.
   memcpy(roi->roi_map, map, rows * cols);
@@ -738,7 +740,7 @@ int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows,
   roi->rows = rows;
   roi->cols = cols;
 
-  return 0;
+  return VPX_CODEC_OK;
 }
 
 int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows,
@@ -882,10 +884,11 @@ static int vp9_enc_alloc_mi(VP9_COMMON *cm, int mi_size) {
   if (!cm->prev_mip) return 1;
   cm->mi_alloc_size = mi_size;
 
-  cm->mi_grid_base = (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO *));
+  cm->mi_grid_base =
+      (MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->mi_grid_base));
   if (!cm->mi_grid_base) return 1;
   cm->prev_mi_grid_base =
-      (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO *));
+      (MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->prev_mi_grid_base));
   if (!cm->prev_mi_grid_base) return 1;
 
   return 0;
@@ -924,24 +927,21 @@ static void vp9_swap_mi_and_prev_mi(VP9_COMMON *cm) {
   cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1;
 }
 
-void vp9_initialize_enc(void) {
-  static volatile int init_done = 0;
-
-  if (!init_done) {
-    vp9_rtcd();
-    vpx_dsp_rtcd();
-    vpx_scale_rtcd();
-    vp9_init_intra_predictors();
-    vp9_init_me_luts();
-    vp9_rc_init_minq_luts();
-    vp9_entropy_mv_init();
+static void initialize_enc(void) {
+  vp9_rtcd();
+  vpx_dsp_rtcd();
+  vpx_scale_rtcd();
+  vp9_init_intra_predictors();
+  vp9_init_me_luts();
+  vp9_rc_init_minq_luts();
+  vp9_entropy_mv_init();
 #if !CONFIG_REALTIME_ONLY
-    vp9_temporal_filter_init();
+  vp9_temporal_filter_init();
 #endif
-    init_done = 1;
-  }
 }
 
+void vp9_initialize_enc(void) { once(initialize_enc); }
+
 static void dealloc_compressor_data(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   int i;
@@ -1022,8 +1022,12 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
   cpi->mi_ssim_rdmult_scaling_factors = NULL;
 
 #if CONFIG_RATE_CTRL
-  free_partition_info(cpi);
-  free_motion_vector_info(cpi);
+  if (cpi->oxcf.use_simple_encode_api) {
+    free_partition_info(cpi);
+    free_motion_vector_info(cpi);
+    free_fp_motion_vector_info(cpi);
+    free_tpl_stats_info(cpi);
+  }
 #endif
 
   vp9_free_ref_frame_buffers(cm->buffer_pool);
@@ -1328,7 +1332,7 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) {
   // For 1 pass cbr: allocate scaled_frame that may be used as an intermediate
   // buffer for a 2 stage down-sampling: two stages of 1:2 down-sampling for a
   // target of 1/4x1/4. number_spatial_layers must be greater than 2.
-  if (is_one_pass_cbr_svc(cpi) && !cpi->svc.scaled_temp_is_alloc &&
+  if (is_one_pass_svc(cpi) && !cpi->svc.scaled_temp_is_alloc &&
       cpi->svc.number_spatial_layers > 2) {
     cpi->svc.scaled_temp_is_alloc = 1;
     if (vpx_realloc_frame_buffer(
@@ -1374,21 +1378,22 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) {
 #endif
 }
 
-static int alloc_context_buffers_ext(VP9_COMP *cpi) {
+static void alloc_context_buffers_ext(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
   int mi_size = cm->mi_cols * cm->mi_rows;
 
-  cpi->mbmi_ext_base = vpx_calloc(mi_size, sizeof(*cpi->mbmi_ext_base));
-  if (!cpi->mbmi_ext_base) return 1;
-
-  return 0;
+  CHECK_MEM_ERROR(&cm->error, cpi->mbmi_ext_base,
+                  vpx_calloc(mi_size, sizeof(*cpi->mbmi_ext_base)));
 }
 
 static void alloc_compressor_data(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
   int sb_rows;
 
-  vp9_alloc_context_buffers(cm, cm->width, cm->height);
+  if (vp9_alloc_context_buffers(cm, cm->width, cm->height)) {
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate context buffers");
+  }
 
   alloc_context_buffers_ext(cpi);
 
@@ -1396,14 +1401,14 @@ static void alloc_compressor_data(VP9_COMP *cpi) {
 
   {
     unsigned int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols);
-    CHECK_MEM_ERROR(cm, cpi->tile_tok[0][0],
+    CHECK_MEM_ERROR(&cm->error, cpi->tile_tok[0][0],
                     vpx_calloc(tokens, sizeof(*cpi->tile_tok[0][0])));
   }
 
   sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
   vpx_free(cpi->tplist[0][0]);
   CHECK_MEM_ERROR(
-      cm, cpi->tplist[0][0],
+      &cm->error, cpi->tplist[0][0],
       vpx_calloc(sb_rows * 4 * (1 << 6), sizeof(*cpi->tplist[0][0])));
 
   vp9_setup_pc_tree(&cpi->common, &cpi->td);
@@ -1505,7 +1510,7 @@ static void init_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
   // Temporal scalability.
   cpi->svc.number_temporal_layers = oxcf->ts_number_layers;
 
-  if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) ||
+  if ((cpi->svc.number_temporal_layers > 1) ||
       ((cpi->svc.number_temporal_layers > 1 ||
         cpi->svc.number_spatial_layers > 1) &&
        cpi->oxcf.pass != 1)) {
@@ -1521,6 +1526,7 @@ static void init_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
   init_buffer_indices(cpi);
 
   vp9_noise_estimate_init(&cpi->noise_estimate, cm->width, cm->height);
+  cpi->fixed_qp_onepass = 0;
 }
 
 void vp9_check_reset_rc_flag(VP9_COMP *cpi) {
@@ -1564,15 +1570,15 @@ void vp9_set_rc_buffer_sizes(VP9_COMP *cpi) {
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-// TODO(angiebird): make sdx8f available for highbitdepth if needed
-#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \
-  cpi->fn_ptr[BT].sdf = SDF;                             \
-  cpi->fn_ptr[BT].sdaf = SDAF;                           \
-  cpi->fn_ptr[BT].vf = VF;                               \
-  cpi->fn_ptr[BT].svf = SVF;                             \
-  cpi->fn_ptr[BT].svaf = SVAF;                           \
-  cpi->fn_ptr[BT].sdx4df = SDX4DF;                       \
-  cpi->fn_ptr[BT].sdx8f = NULL;
+#define HIGHBD_BFP(BT, SDF, SDSF, SDAF, VF, SVF, SVAF, SDX4DF, SDSX4DF) \
+  cpi->fn_ptr[BT].sdf = SDF;                                            \
+  cpi->fn_ptr[BT].sdsf = SDSF;                                          \
+  cpi->fn_ptr[BT].sdaf = SDAF;                                          \
+  cpi->fn_ptr[BT].vf = VF;                                              \
+  cpi->fn_ptr[BT].svf = SVF;                                            \
+  cpi->fn_ptr[BT].svaf = SVAF;                                          \
+  cpi->fn_ptr[BT].sdx4df = SDX4DF;                                      \
+  cpi->fn_ptr[BT].sdsx4df = SDSX4DF;
 
 #define MAKE_BFP_SAD_WRAPPER(fnname)                                           \
   static unsigned int fnname##_bits8(const uint8_t *src_ptr,                   \
@@ -1632,284 +1638,361 @@ void vp9_set_rc_buffer_sizes(VP9_COMP *cpi) {
   }
 
 MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x16)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x16)
 MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x16_avg)
 MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x16x4d)
+
 MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x32)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x32)
 MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x32_avg)
 MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x32x4d)
+
 MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x32)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_64x32)
 MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x32_avg)
 MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_64x32x4d)
+
 MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x64)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x64)
 MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x64_avg)
 MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x64x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x64x4d)
+
 MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x32)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x32)
 MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x32_avg)
 MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x32x4d)
+
 MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x64)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_64x64)
 MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x64_avg)
 MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x64x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_64x64x4d)
+
 MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x16)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x16)
 MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x16_avg)
 MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x16x4d)
+
 MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x8)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x8)
 MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x8_avg)
 MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x8x4d)
+
 MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x16)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x16)
 MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x16_avg)
 MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x16x4d)
+
 MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x8)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x8)
 MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x8_avg)
 MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x8x4d)
+
 MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x4)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x4)
 MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x4_avg)
 MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x4x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x4x4d)
+
 MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x8)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_4x8)
 MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x8_avg)
 MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_4x8x4d)
+
 MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x4)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_4x4)
 MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x4_avg)
 MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x4x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_4x4x4d)
 
 static void highbd_set_var_fns(VP9_COMP *const cpi) {
   VP9_COMMON *const cm = &cpi->common;
   if (cm->use_highbitdepth) {
     switch (cm->bit_depth) {
       case VPX_BITS_8:
-        HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits8,
-                   vpx_highbd_sad32x16_avg_bits8, vpx_highbd_8_variance32x16,
-                   vpx_highbd_8_sub_pixel_variance32x16,
-                   vpx_highbd_8_sub_pixel_avg_variance32x16,
-                   vpx_highbd_sad32x16x4d_bits8)
-
-        HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits8,
-                   vpx_highbd_sad16x32_avg_bits8, vpx_highbd_8_variance16x32,
-                   vpx_highbd_8_sub_pixel_variance16x32,
-                   vpx_highbd_8_sub_pixel_avg_variance16x32,
-                   vpx_highbd_sad16x32x4d_bits8)
-
-        HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits8,
-                   vpx_highbd_sad64x32_avg_bits8, vpx_highbd_8_variance64x32,
-                   vpx_highbd_8_sub_pixel_variance64x32,
-                   vpx_highbd_8_sub_pixel_avg_variance64x32,
-                   vpx_highbd_sad64x32x4d_bits8)
-
-        HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits8,
-                   vpx_highbd_sad32x64_avg_bits8, vpx_highbd_8_variance32x64,
-                   vpx_highbd_8_sub_pixel_variance32x64,
-                   vpx_highbd_8_sub_pixel_avg_variance32x64,
-                   vpx_highbd_sad32x64x4d_bits8)
-
-        HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits8,
-                   vpx_highbd_sad32x32_avg_bits8, vpx_highbd_8_variance32x32,
-                   vpx_highbd_8_sub_pixel_variance32x32,
-                   vpx_highbd_8_sub_pixel_avg_variance32x32,
-                   vpx_highbd_sad32x32x4d_bits8)
-
-        HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits8,
-                   vpx_highbd_sad64x64_avg_bits8, vpx_highbd_8_variance64x64,
-                   vpx_highbd_8_sub_pixel_variance64x64,
-                   vpx_highbd_8_sub_pixel_avg_variance64x64,
-                   vpx_highbd_sad64x64x4d_bits8)
-
-        HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits8,
-                   vpx_highbd_sad16x16_avg_bits8, vpx_highbd_8_variance16x16,
-                   vpx_highbd_8_sub_pixel_variance16x16,
-                   vpx_highbd_8_sub_pixel_avg_variance16x16,
-                   vpx_highbd_sad16x16x4d_bits8)
-
-        HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits8,
-                   vpx_highbd_sad16x8_avg_bits8, vpx_highbd_8_variance16x8,
-                   vpx_highbd_8_sub_pixel_variance16x8,
-                   vpx_highbd_8_sub_pixel_avg_variance16x8,
-                   vpx_highbd_sad16x8x4d_bits8)
-
-        HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits8,
-                   vpx_highbd_sad8x16_avg_bits8, vpx_highbd_8_variance8x16,
-                   vpx_highbd_8_sub_pixel_variance8x16,
-                   vpx_highbd_8_sub_pixel_avg_variance8x16,
-                   vpx_highbd_sad8x16x4d_bits8)
+        HIGHBD_BFP(
+            BLOCK_32X16, vpx_highbd_sad32x16_bits8,
+            vpx_highbd_sad_skip_32x16_bits8, vpx_highbd_sad32x16_avg_bits8,
+            vpx_highbd_8_variance32x16, vpx_highbd_8_sub_pixel_variance32x16,
+            vpx_highbd_8_sub_pixel_avg_variance32x16,
+            vpx_highbd_sad32x16x4d_bits8, vpx_highbd_sad_skip_32x16x4d_bits8)
+
+        HIGHBD_BFP(
+            BLOCK_16X32, vpx_highbd_sad16x32_bits8,
+            vpx_highbd_sad_skip_16x32_bits8, vpx_highbd_sad16x32_avg_bits8,
+            vpx_highbd_8_variance16x32, vpx_highbd_8_sub_pixel_variance16x32,
+            vpx_highbd_8_sub_pixel_avg_variance16x32,
+            vpx_highbd_sad16x32x4d_bits8, vpx_highbd_sad_skip_16x32x4d_bits8)
+
+        HIGHBD_BFP(
+            BLOCK_64X32, vpx_highbd_sad64x32_bits8,
+            vpx_highbd_sad_skip_64x32_bits8, vpx_highbd_sad64x32_avg_bits8,
+            vpx_highbd_8_variance64x32, vpx_highbd_8_sub_pixel_variance64x32,
+            vpx_highbd_8_sub_pixel_avg_variance64x32,
+            vpx_highbd_sad64x32x4d_bits8, vpx_highbd_sad_skip_64x32x4d_bits8)
+
+        HIGHBD_BFP(
+            BLOCK_32X64, vpx_highbd_sad32x64_bits8,
+            vpx_highbd_sad_skip_32x64_bits8, vpx_highbd_sad32x64_avg_bits8,
+            vpx_highbd_8_variance32x64, vpx_highbd_8_sub_pixel_variance32x64,
+            vpx_highbd_8_sub_pixel_avg_variance32x64,
+            vpx_highbd_sad32x64x4d_bits8, vpx_highbd_sad_skip_32x64x4d_bits8)
 
         HIGHBD_BFP(
-            BLOCK_8X8, vpx_highbd_sad8x8_bits8, vpx_highbd_sad8x8_avg_bits8,
-            vpx_highbd_8_variance8x8, vpx_highbd_8_sub_pixel_variance8x8,
-            vpx_highbd_8_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x4d_bits8)
+            BLOCK_32X32, vpx_highbd_sad32x32_bits8,
+            vpx_highbd_sad_skip_32x32_bits8, vpx_highbd_sad32x32_avg_bits8,
+            vpx_highbd_8_variance32x32, vpx_highbd_8_sub_pixel_variance32x32,
+            vpx_highbd_8_sub_pixel_avg_variance32x32,
+            vpx_highbd_sad32x32x4d_bits8, vpx_highbd_sad_skip_32x32x4d_bits8)
 
         HIGHBD_BFP(
-            BLOCK_8X4, vpx_highbd_sad8x4_bits8, vpx_highbd_sad8x4_avg_bits8,
-            vpx_highbd_8_variance8x4, vpx_highbd_8_sub_pixel_variance8x4,
-            vpx_highbd_8_sub_pixel_avg_variance8x4, vpx_highbd_sad8x4x4d_bits8)
+            BLOCK_64X64, vpx_highbd_sad64x64_bits8,
+            vpx_highbd_sad_skip_64x64_bits8, vpx_highbd_sad64x64_avg_bits8,
+            vpx_highbd_8_variance64x64, vpx_highbd_8_sub_pixel_variance64x64,
+            vpx_highbd_8_sub_pixel_avg_variance64x64,
+            vpx_highbd_sad64x64x4d_bits8, vpx_highbd_sad_skip_64x64x4d_bits8)
 
         HIGHBD_BFP(
-            BLOCK_4X8, vpx_highbd_sad4x8_bits8, vpx_highbd_sad4x8_avg_bits8,
-            vpx_highbd_8_variance4x8, vpx_highbd_8_sub_pixel_variance4x8,
-            vpx_highbd_8_sub_pixel_avg_variance4x8, vpx_highbd_sad4x8x4d_bits8)
+            BLOCK_16X16, vpx_highbd_sad16x16_bits8,
+            vpx_highbd_sad_skip_16x16_bits8, vpx_highbd_sad16x16_avg_bits8,
+            vpx_highbd_8_variance16x16, vpx_highbd_8_sub_pixel_variance16x16,
+            vpx_highbd_8_sub_pixel_avg_variance16x16,
+            vpx_highbd_sad16x16x4d_bits8, vpx_highbd_sad_skip_16x16x4d_bits8)
 
         HIGHBD_BFP(
-            BLOCK_4X4, vpx_highbd_sad4x4_bits8, vpx_highbd_sad4x4_avg_bits8,
-            vpx_highbd_8_variance4x4, vpx_highbd_8_sub_pixel_variance4x4,
-            vpx_highbd_8_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x4d_bits8)
+            BLOCK_16X8, vpx_highbd_sad16x8_bits8,
+            vpx_highbd_sad_skip_16x8_bits8, vpx_highbd_sad16x8_avg_bits8,
+            vpx_highbd_8_variance16x8, vpx_highbd_8_sub_pixel_variance16x8,
+            vpx_highbd_8_sub_pixel_avg_variance16x8,
+            vpx_highbd_sad16x8x4d_bits8, vpx_highbd_sad_skip_16x8x4d_bits8)
+
+        HIGHBD_BFP(
+            BLOCK_8X16, vpx_highbd_sad8x16_bits8,
+            vpx_highbd_sad_skip_8x16_bits8, vpx_highbd_sad8x16_avg_bits8,
+            vpx_highbd_8_variance8x16, vpx_highbd_8_sub_pixel_variance8x16,
+            vpx_highbd_8_sub_pixel_avg_variance8x16,
+            vpx_highbd_sad8x16x4d_bits8, vpx_highbd_sad_skip_8x16x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits8,
+                   vpx_highbd_sad_skip_8x8_bits8, vpx_highbd_sad8x8_avg_bits8,
+                   vpx_highbd_8_variance8x8, vpx_highbd_8_sub_pixel_variance8x8,
+                   vpx_highbd_8_sub_pixel_avg_variance8x8,
+                   vpx_highbd_sad8x8x4d_bits8, vpx_highbd_sad_skip_8x8x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits8,
+                   vpx_highbd_sad_skip_8x4_bits8, vpx_highbd_sad8x4_avg_bits8,
+                   vpx_highbd_8_variance8x4, vpx_highbd_8_sub_pixel_variance8x4,
+                   vpx_highbd_8_sub_pixel_avg_variance8x4,
+                   vpx_highbd_sad8x4x4d_bits8, vpx_highbd_sad_skip_8x4x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits8,
+                   vpx_highbd_sad_skip_4x8_bits8, vpx_highbd_sad4x8_avg_bits8,
+                   vpx_highbd_8_variance4x8, vpx_highbd_8_sub_pixel_variance4x8,
+                   vpx_highbd_8_sub_pixel_avg_variance4x8,
+                   vpx_highbd_sad4x8x4d_bits8, vpx_highbd_sad_skip_4x8x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits8,
+                   vpx_highbd_sad_skip_4x4_bits8, vpx_highbd_sad4x4_avg_bits8,
+                   vpx_highbd_8_variance4x4, vpx_highbd_8_sub_pixel_variance4x4,
+                   vpx_highbd_8_sub_pixel_avg_variance4x4,
+                   vpx_highbd_sad4x4x4d_bits8, vpx_highbd_sad_skip_4x4x4d_bits8)
         break;
 
       case VPX_BITS_10:
-        HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits10,
-                   vpx_highbd_sad32x16_avg_bits10, vpx_highbd_10_variance32x16,
-                   vpx_highbd_10_sub_pixel_variance32x16,
-                   vpx_highbd_10_sub_pixel_avg_variance32x16,
-                   vpx_highbd_sad32x16x4d_bits10)
-
-        HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits10,
-                   vpx_highbd_sad16x32_avg_bits10, vpx_highbd_10_variance16x32,
-                   vpx_highbd_10_sub_pixel_variance16x32,
-                   vpx_highbd_10_sub_pixel_avg_variance16x32,
-                   vpx_highbd_sad16x32x4d_bits10)
-
-        HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits10,
-                   vpx_highbd_sad64x32_avg_bits10, vpx_highbd_10_variance64x32,
-                   vpx_highbd_10_sub_pixel_variance64x32,
-                   vpx_highbd_10_sub_pixel_avg_variance64x32,
-                   vpx_highbd_sad64x32x4d_bits10)
-
-        HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits10,
-                   vpx_highbd_sad32x64_avg_bits10, vpx_highbd_10_variance32x64,
-                   vpx_highbd_10_sub_pixel_variance32x64,
-                   vpx_highbd_10_sub_pixel_avg_variance32x64,
-                   vpx_highbd_sad32x64x4d_bits10)
-
-        HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits10,
-                   vpx_highbd_sad32x32_avg_bits10, vpx_highbd_10_variance32x32,
-                   vpx_highbd_10_sub_pixel_variance32x32,
-                   vpx_highbd_10_sub_pixel_avg_variance32x32,
-                   vpx_highbd_sad32x32x4d_bits10)
-
-        HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits10,
-                   vpx_highbd_sad64x64_avg_bits10, vpx_highbd_10_variance64x64,
-                   vpx_highbd_10_sub_pixel_variance64x64,
-                   vpx_highbd_10_sub_pixel_avg_variance64x64,
-                   vpx_highbd_sad64x64x4d_bits10)
-
-        HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits10,
-                   vpx_highbd_sad16x16_avg_bits10, vpx_highbd_10_variance16x16,
-                   vpx_highbd_10_sub_pixel_variance16x16,
-                   vpx_highbd_10_sub_pixel_avg_variance16x16,
-                   vpx_highbd_sad16x16x4d_bits10)
-
-        HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits10,
-                   vpx_highbd_sad16x8_avg_bits10, vpx_highbd_10_variance16x8,
-                   vpx_highbd_10_sub_pixel_variance16x8,
-                   vpx_highbd_10_sub_pixel_avg_variance16x8,
-                   vpx_highbd_sad16x8x4d_bits10)
-
-        HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits10,
-                   vpx_highbd_sad8x16_avg_bits10, vpx_highbd_10_variance8x16,
-                   vpx_highbd_10_sub_pixel_variance8x16,
-                   vpx_highbd_10_sub_pixel_avg_variance8x16,
-                   vpx_highbd_sad8x16x4d_bits10)
-
-        HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits10,
-                   vpx_highbd_sad8x8_avg_bits10, vpx_highbd_10_variance8x8,
-                   vpx_highbd_10_sub_pixel_variance8x8,
-                   vpx_highbd_10_sub_pixel_avg_variance8x8,
-                   vpx_highbd_sad8x8x4d_bits10)
-
-        HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits10,
-                   vpx_highbd_sad8x4_avg_bits10, vpx_highbd_10_variance8x4,
-                   vpx_highbd_10_sub_pixel_variance8x4,
-                   vpx_highbd_10_sub_pixel_avg_variance8x4,
-                   vpx_highbd_sad8x4x4d_bits10)
-
-        HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits10,
-                   vpx_highbd_sad4x8_avg_bits10, vpx_highbd_10_variance4x8,
-                   vpx_highbd_10_sub_pixel_variance4x8,
-                   vpx_highbd_10_sub_pixel_avg_variance4x8,
-                   vpx_highbd_sad4x8x4d_bits10)
-
-        HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits10,
-                   vpx_highbd_sad4x4_avg_bits10, vpx_highbd_10_variance4x4,
-                   vpx_highbd_10_sub_pixel_variance4x4,
-                   vpx_highbd_10_sub_pixel_avg_variance4x4,
-                   vpx_highbd_sad4x4x4d_bits10)
+        HIGHBD_BFP(
+            BLOCK_32X16, vpx_highbd_sad32x16_bits10,
+            vpx_highbd_sad_skip_32x16_bits10, vpx_highbd_sad32x16_avg_bits10,
+            vpx_highbd_10_variance32x16, vpx_highbd_10_sub_pixel_variance32x16,
+            vpx_highbd_10_sub_pixel_avg_variance32x16,
+            vpx_highbd_sad32x16x4d_bits10, vpx_highbd_sad_skip_32x16x4d_bits10)
+
+        HIGHBD_BFP(
+            BLOCK_16X32, vpx_highbd_sad16x32_bits10,
+            vpx_highbd_sad_skip_16x32_bits10, vpx_highbd_sad16x32_avg_bits10,
+            vpx_highbd_10_variance16x32, vpx_highbd_10_sub_pixel_variance16x32,
+            vpx_highbd_10_sub_pixel_avg_variance16x32,
+            vpx_highbd_sad16x32x4d_bits10, vpx_highbd_sad_skip_16x32x4d_bits10)
+
+        HIGHBD_BFP(
+            BLOCK_64X32, vpx_highbd_sad64x32_bits10,
+            vpx_highbd_sad_skip_64x32_bits10, vpx_highbd_sad64x32_avg_bits10,
+            vpx_highbd_10_variance64x32, vpx_highbd_10_sub_pixel_variance64x32,
+            vpx_highbd_10_sub_pixel_avg_variance64x32,
+            vpx_highbd_sad64x32x4d_bits10, vpx_highbd_sad_skip_64x32x4d_bits10)
+
+        HIGHBD_BFP(
+            BLOCK_32X64, vpx_highbd_sad32x64_bits10,
+            vpx_highbd_sad_skip_32x64_bits10, vpx_highbd_sad32x64_avg_bits10,
+            vpx_highbd_10_variance32x64, vpx_highbd_10_sub_pixel_variance32x64,
+            vpx_highbd_10_sub_pixel_avg_variance32x64,
+            vpx_highbd_sad32x64x4d_bits10, vpx_highbd_sad_skip_32x64x4d_bits10)
+
+        HIGHBD_BFP(
+            BLOCK_32X32, vpx_highbd_sad32x32_bits10,
+            vpx_highbd_sad_skip_32x32_bits10, vpx_highbd_sad32x32_avg_bits10,
+            vpx_highbd_10_variance32x32, vpx_highbd_10_sub_pixel_variance32x32,
+            vpx_highbd_10_sub_pixel_avg_variance32x32,
+            vpx_highbd_sad32x32x4d_bits10, vpx_highbd_sad_skip_32x32x4d_bits10)
+
+        HIGHBD_BFP(
+            BLOCK_64X64, vpx_highbd_sad64x64_bits10,
+            vpx_highbd_sad_skip_64x64_bits10, vpx_highbd_sad64x64_avg_bits10,
+            vpx_highbd_10_variance64x64, vpx_highbd_10_sub_pixel_variance64x64,
+            vpx_highbd_10_sub_pixel_avg_variance64x64,
+            vpx_highbd_sad64x64x4d_bits10, vpx_highbd_sad_skip_64x64x4d_bits10)
+
+        HIGHBD_BFP(
+            BLOCK_16X16, vpx_highbd_sad16x16_bits10,
+            vpx_highbd_sad_skip_16x16_bits10, vpx_highbd_sad16x16_avg_bits10,
+            vpx_highbd_10_variance16x16, vpx_highbd_10_sub_pixel_variance16x16,
+            vpx_highbd_10_sub_pixel_avg_variance16x16,
+            vpx_highbd_sad16x16x4d_bits10, vpx_highbd_sad_skip_16x16x4d_bits10)
+
+        HIGHBD_BFP(
+            BLOCK_16X8, vpx_highbd_sad16x8_bits10,
+            vpx_highbd_sad_skip_16x8_bits10, vpx_highbd_sad16x8_avg_bits10,
+            vpx_highbd_10_variance16x8, vpx_highbd_10_sub_pixel_variance16x8,
+            vpx_highbd_10_sub_pixel_avg_variance16x8,
+            vpx_highbd_sad16x8x4d_bits10, vpx_highbd_sad_skip_16x8x4d_bits10)
+
+        HIGHBD_BFP(
+            BLOCK_8X16, vpx_highbd_sad8x16_bits10,
+            vpx_highbd_sad_skip_8x16_bits10, vpx_highbd_sad8x16_avg_bits10,
+            vpx_highbd_10_variance8x16, vpx_highbd_10_sub_pixel_variance8x16,
+            vpx_highbd_10_sub_pixel_avg_variance8x16,
+            vpx_highbd_sad8x16x4d_bits10, vpx_highbd_sad_skip_8x16x4d_bits10)
+
+        HIGHBD_BFP(
+            BLOCK_8X8, vpx_highbd_sad8x8_bits10, vpx_highbd_sad_skip_8x8_bits10,
+            vpx_highbd_sad8x8_avg_bits10, vpx_highbd_10_variance8x8,
+            vpx_highbd_10_sub_pixel_variance8x8,
+            vpx_highbd_10_sub_pixel_avg_variance8x8,
+            vpx_highbd_sad8x8x4d_bits10, vpx_highbd_sad_skip_8x8x4d_bits10)
+
+        HIGHBD_BFP(
+            BLOCK_8X4, vpx_highbd_sad8x4_bits10, vpx_highbd_sad_skip_8x4_bits10,
+            vpx_highbd_sad8x4_avg_bits10, vpx_highbd_10_variance8x4,
+            vpx_highbd_10_sub_pixel_variance8x4,
+            vpx_highbd_10_sub_pixel_avg_variance8x4,
+            vpx_highbd_sad8x4x4d_bits10, vpx_highbd_sad_skip_8x4x4d_bits10)
+
+        HIGHBD_BFP(
+            BLOCK_4X8, vpx_highbd_sad4x8_bits10, vpx_highbd_sad_skip_4x8_bits10,
+            vpx_highbd_sad4x8_avg_bits10, vpx_highbd_10_variance4x8,
+            vpx_highbd_10_sub_pixel_variance4x8,
+            vpx_highbd_10_sub_pixel_avg_variance4x8,
+            vpx_highbd_sad4x8x4d_bits10, vpx_highbd_sad_skip_4x8x4d_bits10)
+
+        HIGHBD_BFP(
+            BLOCK_4X4, vpx_highbd_sad4x4_bits10, vpx_highbd_sad_skip_4x4_bits10,
+            vpx_highbd_sad4x4_avg_bits10, vpx_highbd_10_variance4x4,
+            vpx_highbd_10_sub_pixel_variance4x4,
+            vpx_highbd_10_sub_pixel_avg_variance4x4,
+            vpx_highbd_sad4x4x4d_bits10, vpx_highbd_sad_skip_4x4x4d_bits10)
         break;
 
       default:
         assert(cm->bit_depth == VPX_BITS_12);
-        HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits12,
-                   vpx_highbd_sad32x16_avg_bits12, vpx_highbd_12_variance32x16,
-                   vpx_highbd_12_sub_pixel_variance32x16,
-                   vpx_highbd_12_sub_pixel_avg_variance32x16,
-                   vpx_highbd_sad32x16x4d_bits12)
-
-        HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits12,
-                   vpx_highbd_sad16x32_avg_bits12, vpx_highbd_12_variance16x32,
-                   vpx_highbd_12_sub_pixel_variance16x32,
-                   vpx_highbd_12_sub_pixel_avg_variance16x32,
-                   vpx_highbd_sad16x32x4d_bits12)
-
-        HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits12,
-                   vpx_highbd_sad64x32_avg_bits12, vpx_highbd_12_variance64x32,
-                   vpx_highbd_12_sub_pixel_variance64x32,
-                   vpx_highbd_12_sub_pixel_avg_variance64x32,
-                   vpx_highbd_sad64x32x4d_bits12)
-
-        HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits12,
-                   vpx_highbd_sad32x64_avg_bits12, vpx_highbd_12_variance32x64,
-                   vpx_highbd_12_sub_pixel_variance32x64,
-                   vpx_highbd_12_sub_pixel_avg_variance32x64,
-                   vpx_highbd_sad32x64x4d_bits12)
-
-        HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits12,
-                   vpx_highbd_sad32x32_avg_bits12, vpx_highbd_12_variance32x32,
-                   vpx_highbd_12_sub_pixel_variance32x32,
-                   vpx_highbd_12_sub_pixel_avg_variance32x32,
-                   vpx_highbd_sad32x32x4d_bits12)
-
-        HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits12,
-                   vpx_highbd_sad64x64_avg_bits12, vpx_highbd_12_variance64x64,
-                   vpx_highbd_12_sub_pixel_variance64x64,
-                   vpx_highbd_12_sub_pixel_avg_variance64x64,
-                   vpx_highbd_sad64x64x4d_bits12)
-
-        HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits12,
-                   vpx_highbd_sad16x16_avg_bits12, vpx_highbd_12_variance16x16,
-                   vpx_highbd_12_sub_pixel_variance16x16,
-                   vpx_highbd_12_sub_pixel_avg_variance16x16,
-                   vpx_highbd_sad16x16x4d_bits12)
-
-        HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits12,
-                   vpx_highbd_sad16x8_avg_bits12, vpx_highbd_12_variance16x8,
-                   vpx_highbd_12_sub_pixel_variance16x8,
-                   vpx_highbd_12_sub_pixel_avg_variance16x8,
-                   vpx_highbd_sad16x8x4d_bits12)
-
-        HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits12,
-                   vpx_highbd_sad8x16_avg_bits12, vpx_highbd_12_variance8x16,
-                   vpx_highbd_12_sub_pixel_variance8x16,
-                   vpx_highbd_12_sub_pixel_avg_variance8x16,
-                   vpx_highbd_sad8x16x4d_bits12)
-
-        HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits12,
-                   vpx_highbd_sad8x8_avg_bits12, vpx_highbd_12_variance8x8,
-                   vpx_highbd_12_sub_pixel_variance8x8,
-                   vpx_highbd_12_sub_pixel_avg_variance8x8,
-                   vpx_highbd_sad8x8x4d_bits12)
-
-        HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits12,
-                   vpx_highbd_sad8x4_avg_bits12, vpx_highbd_12_variance8x4,
-                   vpx_highbd_12_sub_pixel_variance8x4,
-                   vpx_highbd_12_sub_pixel_avg_variance8x4,
-                   vpx_highbd_sad8x4x4d_bits12)
-
-        HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits12,
-                   vpx_highbd_sad4x8_avg_bits12, vpx_highbd_12_variance4x8,
-                   vpx_highbd_12_sub_pixel_variance4x8,
-                   vpx_highbd_12_sub_pixel_avg_variance4x8,
-                   vpx_highbd_sad4x8x4d_bits12)
-
-        HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits12,
-                   vpx_highbd_sad4x4_avg_bits12, vpx_highbd_12_variance4x4,
-                   vpx_highbd_12_sub_pixel_variance4x4,
-                   vpx_highbd_12_sub_pixel_avg_variance4x4,
-                   vpx_highbd_sad4x4x4d_bits12)
+        HIGHBD_BFP(
+            BLOCK_32X16, vpx_highbd_sad32x16_bits12,
+            vpx_highbd_sad_skip_32x16_bits12, vpx_highbd_sad32x16_avg_bits12,
+            vpx_highbd_12_variance32x16, vpx_highbd_12_sub_pixel_variance32x16,
+            vpx_highbd_12_sub_pixel_avg_variance32x16,
+            vpx_highbd_sad32x16x4d_bits12, vpx_highbd_sad_skip_32x16x4d_bits12)
+
+        HIGHBD_BFP(
+            BLOCK_16X32, vpx_highbd_sad16x32_bits12,
+            vpx_highbd_sad_skip_16x32_bits12, vpx_highbd_sad16x32_avg_bits12,
+            vpx_highbd_12_variance16x32, vpx_highbd_12_sub_pixel_variance16x32,
+            vpx_highbd_12_sub_pixel_avg_variance16x32,
+            vpx_highbd_sad16x32x4d_bits12, vpx_highbd_sad_skip_16x32x4d_bits12)
+
+        HIGHBD_BFP(
+            BLOCK_64X32, vpx_highbd_sad64x32_bits12,
+            vpx_highbd_sad_skip_64x32_bits12, vpx_highbd_sad64x32_avg_bits12,
+            vpx_highbd_12_variance64x32, vpx_highbd_12_sub_pixel_variance64x32,
+            vpx_highbd_12_sub_pixel_avg_variance64x32,
+            vpx_highbd_sad64x32x4d_bits12, vpx_highbd_sad_skip_64x32x4d_bits12)
+
+        HIGHBD_BFP(
+            BLOCK_32X64, vpx_highbd_sad32x64_bits12,
+            vpx_highbd_sad_skip_32x64_bits12, vpx_highbd_sad32x64_avg_bits12,
+            vpx_highbd_12_variance32x64, vpx_highbd_12_sub_pixel_variance32x64,
+            vpx_highbd_12_sub_pixel_avg_variance32x64,
+            vpx_highbd_sad32x64x4d_bits12, vpx_highbd_sad_skip_32x64x4d_bits12)
+
+        HIGHBD_BFP(
+            BLOCK_32X32, vpx_highbd_sad32x32_bits12,
+            vpx_highbd_sad_skip_32x32_bits12, vpx_highbd_sad32x32_avg_bits12,
+            vpx_highbd_12_variance32x32, vpx_highbd_12_sub_pixel_variance32x32,
+            vpx_highbd_12_sub_pixel_avg_variance32x32,
+            vpx_highbd_sad32x32x4d_bits12, vpx_highbd_sad_skip_32x32x4d_bits12)
+
+        HIGHBD_BFP(
+            BLOCK_64X64, vpx_highbd_sad64x64_bits12,
+            vpx_highbd_sad_skip_64x64_bits12, vpx_highbd_sad64x64_avg_bits12,
+            vpx_highbd_12_variance64x64, vpx_highbd_12_sub_pixel_variance64x64,
+            vpx_highbd_12_sub_pixel_avg_variance64x64,
+            vpx_highbd_sad64x64x4d_bits12, vpx_highbd_sad_skip_64x64x4d_bits12)
+
+        HIGHBD_BFP(
+            BLOCK_16X16, vpx_highbd_sad16x16_bits12,
+            vpx_highbd_sad_skip_16x16_bits12, vpx_highbd_sad16x16_avg_bits12,
+            vpx_highbd_12_variance16x16, vpx_highbd_12_sub_pixel_variance16x16,
+            vpx_highbd_12_sub_pixel_avg_variance16x16,
+            vpx_highbd_sad16x16x4d_bits12, vpx_highbd_sad_skip_16x16x4d_bits12)
+
+        HIGHBD_BFP(
+            BLOCK_16X8, vpx_highbd_sad16x8_bits12,
+            vpx_highbd_sad_skip_16x8_bits12, vpx_highbd_sad16x8_avg_bits12,
+            vpx_highbd_12_variance16x8, vpx_highbd_12_sub_pixel_variance16x8,
+            vpx_highbd_12_sub_pixel_avg_variance16x8,
+            vpx_highbd_sad16x8x4d_bits12, vpx_highbd_sad_skip_16x8x4d_bits12)
+
+        HIGHBD_BFP(
+            BLOCK_8X16, vpx_highbd_sad8x16_bits12,
+            vpx_highbd_sad_skip_8x16_bits12, vpx_highbd_sad8x16_avg_bits12,
+            vpx_highbd_12_variance8x16, vpx_highbd_12_sub_pixel_variance8x16,
+            vpx_highbd_12_sub_pixel_avg_variance8x16,
+            vpx_highbd_sad8x16x4d_bits12, vpx_highbd_sad_skip_8x16x4d_bits12)
+
+        HIGHBD_BFP(
+            BLOCK_8X8, vpx_highbd_sad8x8_bits12, vpx_highbd_sad_skip_8x8_bits12,
+            vpx_highbd_sad8x8_avg_bits12, vpx_highbd_12_variance8x8,
+            vpx_highbd_12_sub_pixel_variance8x8,
+            vpx_highbd_12_sub_pixel_avg_variance8x8,
+            vpx_highbd_sad8x8x4d_bits12, vpx_highbd_sad_skip_8x8x4d_bits12)
+
+        HIGHBD_BFP(
+            BLOCK_8X4, vpx_highbd_sad8x4_bits12, vpx_highbd_sad_skip_8x4_bits12,
+            vpx_highbd_sad8x4_avg_bits12, vpx_highbd_12_variance8x4,
+            vpx_highbd_12_sub_pixel_variance8x4,
+            vpx_highbd_12_sub_pixel_avg_variance8x4,
+            vpx_highbd_sad8x4x4d_bits12, vpx_highbd_sad_skip_8x4x4d_bits12)
+
+        HIGHBD_BFP(
+            BLOCK_4X8, vpx_highbd_sad4x8_bits12, vpx_highbd_sad_skip_4x8_bits12,
+            vpx_highbd_sad4x8_avg_bits12, vpx_highbd_12_variance4x8,
+            vpx_highbd_12_sub_pixel_variance4x8,
+            vpx_highbd_12_sub_pixel_avg_variance4x8,
+            vpx_highbd_sad4x8x4d_bits12, vpx_highbd_sad_skip_4x8x4d_bits12)
+
+        HIGHBD_BFP(
+            BLOCK_4X4, vpx_highbd_sad4x4_bits12, vpx_highbd_sad_skip_4x4_bits12,
+            vpx_highbd_sad4x4_avg_bits12, vpx_highbd_12_variance4x4,
+            vpx_highbd_12_sub_pixel_variance4x4,
+            vpx_highbd_12_sub_pixel_avg_variance4x4,
+            vpx_highbd_sad4x4x4d_bits12, vpx_highbd_sad_skip_4x4x4d_bits12)
         break;
     }
   }
@@ -1921,53 +2004,64 @@ static void realloc_segmentation_maps(VP9_COMP *cpi) {
 
   // Create the encoder segmentation map and set all entries to 0
   vpx_free(cpi->segmentation_map);
-  CHECK_MEM_ERROR(cm, cpi->segmentation_map,
+  CHECK_MEM_ERROR(&cm->error, cpi->segmentation_map,
                   vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
 
   // Create a map used for cyclic background refresh.
   if (cpi->cyclic_refresh) vp9_cyclic_refresh_free(cpi->cyclic_refresh);
-  CHECK_MEM_ERROR(cm, cpi->cyclic_refresh,
+  CHECK_MEM_ERROR(&cm->error, cpi->cyclic_refresh,
                   vp9_cyclic_refresh_alloc(cm->mi_rows, cm->mi_cols));
 
   // Create a map used to mark inactive areas.
   vpx_free(cpi->active_map.map);
-  CHECK_MEM_ERROR(cm, cpi->active_map.map,
+  CHECK_MEM_ERROR(&cm->error, cpi->active_map.map,
                   vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
 
   // And a place holder structure is the coding context
   // for use if we want to save and restore it
   vpx_free(cpi->coding_context.last_frame_seg_map_copy);
-  CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy,
+  CHECK_MEM_ERROR(&cm->error, cpi->coding_context.last_frame_seg_map_copy,
                   vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
 }
 
 static void alloc_copy_partition_data(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   if (cpi->prev_partition == NULL) {
-    CHECK_MEM_ERROR(cm, cpi->prev_partition,
+    CHECK_MEM_ERROR(&cm->error, cpi->prev_partition,
                     (BLOCK_SIZE *)vpx_calloc(cm->mi_stride * cm->mi_rows,
                                              sizeof(*cpi->prev_partition)));
   }
   if (cpi->prev_segment_id == NULL) {
     CHECK_MEM_ERROR(
-        cm, cpi->prev_segment_id,
+        &cm->error, cpi->prev_segment_id,
         (int8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
                              sizeof(*cpi->prev_segment_id)));
   }
   if (cpi->prev_variance_low == NULL) {
-    CHECK_MEM_ERROR(cm, cpi->prev_variance_low,
+    CHECK_MEM_ERROR(&cm->error, cpi->prev_variance_low,
                     (uint8_t *)vpx_calloc(
                         (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1) * 25,
                         sizeof(*cpi->prev_variance_low)));
   }
   if (cpi->copied_frame_cnt == NULL) {
     CHECK_MEM_ERROR(
-        cm, cpi->copied_frame_cnt,
+        &cm->error, cpi->copied_frame_cnt,
         (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
                               sizeof(*cpi->copied_frame_cnt)));
   }
 }
 
+static void free_copy_partition_data(VP9_COMP *cpi) {
+  vpx_free(cpi->prev_partition);
+  cpi->prev_partition = NULL;
+  vpx_free(cpi->prev_segment_id);
+  cpi->prev_segment_id = NULL;
+  vpx_free(cpi->prev_variance_low);
+  cpi->prev_variance_low = NULL;
+  vpx_free(cpi->copied_frame_cnt);
+  cpi->copied_frame_cnt = NULL;
+}
+
 void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
@@ -2047,13 +2141,18 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
     new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows);
     if (cm->mi_alloc_size < new_mi_size) {
       vp9_free_context_buffers(cm);
+      vp9_free_pc_tree(&cpi->td);
+      vpx_free(cpi->mbmi_ext_base);
       alloc_compressor_data(cpi);
       realloc_segmentation_maps(cpi);
       cpi->initial_width = cpi->initial_height = 0;
       cpi->external_resize = 0;
     } else if (cm->mi_alloc_size == new_mi_size &&
                (cpi->oxcf.width > last_w || cpi->oxcf.height > last_h)) {
-      vp9_alloc_loop_filter(cm);
+      if (vp9_alloc_loop_filter(cm)) {
+        vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                           "Failed to allocate loop filter data");
+      }
     }
   }
 
@@ -2062,15 +2161,25 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
     update_frame_size(cpi);
 
   if (last_w != cpi->oxcf.width || last_h != cpi->oxcf.height) {
-    memset(cpi->consec_zero_mv, 0,
-           cm->mi_rows * cm->mi_cols * sizeof(*cpi->consec_zero_mv));
+    vpx_free(cpi->consec_zero_mv);
+    CHECK_MEM_ERROR(
+        &cm->error, cpi->consec_zero_mv,
+        vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->consec_zero_mv)));
+
+    vpx_free(cpi->skin_map);
+    CHECK_MEM_ERROR(
+        &cm->error, cpi->skin_map,
+        vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->skin_map)));
+
+    free_copy_partition_data(cpi);
+    alloc_copy_partition_data(cpi);
     if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
       vp9_cyclic_refresh_reset_resize(cpi);
     rc->rc_1_frame = 0;
     rc->rc_2_frame = 0;
   }
 
-  if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) ||
+  if ((cpi->svc.number_temporal_layers > 1) ||
       ((cpi->svc.number_temporal_layers > 1 ||
         cpi->svc.number_spatial_layers > 1) &&
        cpi->oxcf.pass != 1)) {
@@ -2101,18 +2210,13 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
   vp9_set_row_mt(cpi);
 }
 
-#ifndef M_LOG2_E
-#define M_LOG2_E 0.693147180559945309417
-#endif
-#define log2f(x) (log(x) / (float)M_LOG2_E)
-
 /***********************************************************************
  * Read before modifying 'cal_nmvjointsadcost' or 'cal_nmvsadcosts'    *
  ***********************************************************************
  * The following 2 functions ('cal_nmvjointsadcost' and                *
  * 'cal_nmvsadcosts') are used to calculate cost lookup tables         *
  * used by 'vp9_diamond_search_sad'. The C implementation of the       *
- * function is generic, but the AVX intrinsics optimised version       *
+ * function is generic, but the NEON intrinsics optimised version      *
  * relies on the following properties of the computed tables:          *
  * For cal_nmvjointsadcost:                                            *
  *   - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3]     *
@@ -2121,7 +2225,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
  *         (Equal costs for both components)                           *
  *   - For all i: mvsadcost[0][i] == mvsadcost[0][-i]                  *
  *         (Cost function is even)                                     *
- * If these do not hold, then the AVX optimised version of the         *
+ * If these do not hold, then the NEON optimised version of the        *
  * 'vp9_diamond_search_sad' function cannot be used as it is, in which *
  * case you can revert to using the C function instead.                *
  ***********************************************************************/
@@ -2214,6 +2318,7 @@ static INLINE void vpx_img_chroma_subsampling(vpx_img_fmt_t fmt,
   switch (fmt) {
     case VPX_IMG_FMT_I420:
     case VPX_IMG_FMT_YV12:
+    case VPX_IMG_FMT_NV12:
     case VPX_IMG_FMT_I422:
     case VPX_IMG_FMT_I42016:
     case VPX_IMG_FMT_I42216: *subsampling_x = 1; break;
@@ -2224,6 +2329,7 @@ static INLINE void vpx_img_chroma_subsampling(vpx_img_fmt_t fmt,
     case VPX_IMG_FMT_I420:
     case VPX_IMG_FMT_I440:
     case VPX_IMG_FMT_YV12:
+    case VPX_IMG_FMT_NV12:
     case VPX_IMG_FMT_I42016:
     case VPX_IMG_FMT_I44016: *subsampling_y = 1; break;
     default: *subsampling_y = 0; break;
@@ -2277,7 +2383,7 @@ void vp9_update_compressor_with_img_fmt(VP9_COMP *cpi, vpx_img_fmt_t img_fmt) {
 VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf,
                                 BufferPool *const pool) {
   unsigned int i;
-  VP9_COMP *volatile const cpi = vpx_memalign(32, sizeof(VP9_COMP));
+  VP9_COMP *volatile const cpi = vpx_memalign(32, sizeof(*cpi));
   VP9_COMMON *volatile const cm = cpi != NULL ? &cpi->common : NULL;
 
   if (!cm) return NULL;
@@ -2295,11 +2401,13 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf,
   cm->free_mi = vp9_enc_free_mi;
   cm->setup_mi = vp9_enc_setup_mi;
 
-  CHECK_MEM_ERROR(cm, cm->fc, (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc)));
+  CHECK_MEM_ERROR(&cm->error, cm->fc,
+                  (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc)));
   CHECK_MEM_ERROR(
-      cm, cm->frame_contexts,
+      &cm->error, cm->frame_contexts,
       (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS, sizeof(*cm->frame_contexts)));
 
+  cpi->compute_frame_low_motion_onepass = 1;
   cpi->use_svc = 0;
   cpi->resize_state = ORIG;
   cpi->external_resize = 0;
@@ -2315,60 +2423,49 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf,
   cpi->frame_info = vp9_get_frame_info(oxcf);
 
   vp9_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc);
+  vp9_init_rd_parameters(cpi);
 
   init_frame_indexes(cm);
-  cpi->partition_search_skippable_frame = 0;
   cpi->tile_data = NULL;
 
   realloc_segmentation_maps(cpi);
 
   CHECK_MEM_ERROR(
-      cm, cpi->skin_map,
-      vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(cpi->skin_map[0])));
+      &cm->error, cpi->skin_map,
+      vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->skin_map)));
 
 #if !CONFIG_REALTIME_ONLY
-  CHECK_MEM_ERROR(cm, cpi->alt_ref_aq, vp9_alt_ref_aq_create());
+  CHECK_MEM_ERROR(&cm->error, cpi->alt_ref_aq, vp9_alt_ref_aq_create());
 #endif
 
   CHECK_MEM_ERROR(
-      cm, cpi->consec_zero_mv,
+      &cm->error, cpi->consec_zero_mv,
       vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->consec_zero_mv)));
 
-  CHECK_MEM_ERROR(cm, cpi->nmvcosts[0],
+  CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts[0],
                   vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[0])));
-  CHECK_MEM_ERROR(cm, cpi->nmvcosts[1],
+  CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts[1],
                   vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[1])));
-  CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[0],
+  CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts_hp[0],
                   vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[0])));
-  CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[1],
+  CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts_hp[1],
                   vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[1])));
-  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[0],
+  CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts[0],
                   vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[0])));
-  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[1],
+  CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts[1],
                   vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[1])));
-  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[0],
+  CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts_hp[0],
                   vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[0])));
-  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[1],
+  CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts_hp[1],
                   vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[1])));
 
   for (i = 0; i < (sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0]));
        i++) {
     CHECK_MEM_ERROR(
-        cm, cpi->mbgraph_stats[i].mb_stats,
+        &cm->error, cpi->mbgraph_stats[i].mb_stats,
         vpx_calloc(cm->MBs * sizeof(*cpi->mbgraph_stats[i].mb_stats), 1));
   }
 
-#if CONFIG_FP_MB_STATS
-  cpi->use_fp_mb_stats = 0;
-  if (cpi->use_fp_mb_stats) {
-    // a place holder used to store the first pass mb stats in the first pass
-    CHECK_MEM_ERROR(cm, cpi->twopass.frame_mb_stats_buf,
-                    vpx_calloc(cm->MBs * sizeof(uint8_t), 1));
-  } else {
-    cpi->twopass.frame_mb_stats_buf = NULL;
-  }
-#endif
-
   cpi->refresh_alt_ref_frame = 0;
   cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
 
@@ -2409,7 +2506,7 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf,
   }
 
   if (cpi->b_calculate_consistency) {
-    CHECK_MEM_ERROR(cm, cpi->ssim_vars,
+    CHECK_MEM_ERROR(&cm->error, cpi->ssim_vars,
                     vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols,
                                sizeof(*cpi->ssim_vars) * 4));
     cpi->worst_consistency = 100.0;
@@ -2462,6 +2559,13 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf,
 
   cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED;
 
+  {
+    vpx_codec_err_t codec_status = vp9_extrc_init(&cpi->ext_ratectrl);
+    if (codec_status != VPX_CODEC_OK) {
+      vpx_internal_error(&cm->error, codec_status, "vp9_extrc_init() failed");
+    }
+  }
+
 #if !CONFIG_REALTIME_ONLY
   if (oxcf->pass == 1) {
     vp9_init_first_pass(cpi);
@@ -2473,11 +2577,11 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf,
         cpi->svc.number_temporal_layers > 1) {
       FIRSTPASS_STATS *const stats = oxcf->two_pass_stats_in.buf;
       FIRSTPASS_STATS *stats_copy[VPX_SS_MAX_LAYERS] = { 0 };
-      int i;
+      int n;
 
-      for (i = 0; i < oxcf->ss_number_layers; ++i) {
+      for (n = 0; n < oxcf->ss_number_layers; ++n) {
         FIRSTPASS_STATS *const last_packet_for_layer =
-            &stats[packets - oxcf->ss_number_layers + i];
+            &stats[packets - oxcf->ss_number_layers + n];
         const int layer_id = (int)last_packet_for_layer->spatial_layer_id;
         const int packets_in_layer = (int)last_packet_for_layer->count + 1;
         if (layer_id >= 0 && layer_id < oxcf->ss_number_layers) {
@@ -2487,7 +2591,7 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf,
           vpx_free(lc->rc_twopass_stats_in.buf);
 
           lc->rc_twopass_stats_in.sz = packets_in_layer * packet_sz;
-          CHECK_MEM_ERROR(cm, lc->rc_twopass_stats_in.buf,
+          CHECK_MEM_ERROR(&cm->error, lc->rc_twopass_stats_in.buf,
                           vpx_malloc(lc->rc_twopass_stats_in.sz));
           lc->twopass.stats_in_start = lc->rc_twopass_stats_in.buf;
           lc->twopass.stats_in = lc->twopass.stats_in_start;
@@ -2502,11 +2606,11 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf,
         }
       }
 
-      for (i = 0; i < packets; ++i) {
-        const int layer_id = (int)stats[i].spatial_layer_id;
+      for (n = 0; n < packets; ++n) {
+        const int layer_id = (int)stats[n].spatial_layer_id;
         if (layer_id >= 0 && layer_id < oxcf->ss_number_layers &&
             stats_copy[layer_id] != NULL) {
-          *stats_copy[layer_id] = stats[i];
+          *stats_copy[layer_id] = stats[n];
           ++stats_copy[layer_id];
         }
       }
@@ -2514,18 +2618,6 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf,
       vp9_init_second_pass_spatial_svc(cpi);
     } else {
       int num_frames;
-#if CONFIG_FP_MB_STATS
-      if (cpi->use_fp_mb_stats) {
-        const size_t psz = cpi->common.MBs * sizeof(uint8_t);
-        const int ps = (int)(oxcf->firstpass_mb_stats_in.sz / psz);
-
-        cpi->twopass.firstpass_mb_stats.mb_stats_start =
-            oxcf->firstpass_mb_stats_in.buf;
-        cpi->twopass.firstpass_mb_stats.mb_stats_end =
-            cpi->twopass.firstpass_mb_stats.mb_stats_start +
-            (ps - 1) * cpi->common.MBs * sizeof(uint8_t);
-      }
-#endif
 
       cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf;
       cpi->twopass.stats_in = cpi->twopass.stats_in_start;
@@ -2554,7 +2646,7 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf,
     const int h = num_8x8_blocks_high_lookup[bsize];
     const int num_cols = (cm->mi_cols + w - 1) / w;
     const int num_rows = (cm->mi_rows + h - 1) / h;
-    CHECK_MEM_ERROR(cm, cpi->mi_ssim_rdmult_scaling_factors,
+    CHECK_MEM_ERROR(&cm->error, cpi->mi_ssim_rdmult_scaling_factors,
                     vpx_calloc(num_rows * num_cols,
                                sizeof(*cpi->mi_ssim_rdmult_scaling_factors)));
   }
@@ -2563,73 +2655,76 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf,
 #if CONFIG_NON_GREEDY_MV
   cpi->tpl_ready = 0;
 #endif  // CONFIG_NON_GREEDY_MV
-  for (i = 0; i < MAX_ARF_GOP_SIZE; ++i) cpi->tpl_stats[i].tpl_stats_ptr = NULL;
+  for (i = 0; i < MAX_ARF_GOP_SIZE; ++i) {
+    cpi->tpl_stats[i].tpl_stats_ptr = NULL;
+  }
 
   // Allocate memory to store variances for a frame.
-  CHECK_MEM_ERROR(cm, cpi->source_diff_var, vpx_calloc(cm->MBs, sizeof(diff)));
+  CHECK_MEM_ERROR(&cm->error, cpi->source_diff_var,
+                  vpx_calloc(cm->MBs, sizeof(cpi->source_diff_var)));
   cpi->source_var_thresh = 0;
   cpi->frames_till_next_var_check = 0;
-#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, SDX8F) \
-  cpi->fn_ptr[BT].sdf = SDF;                             \
-  cpi->fn_ptr[BT].sdaf = SDAF;                           \
-  cpi->fn_ptr[BT].vf = VF;                               \
-  cpi->fn_ptr[BT].svf = SVF;                             \
-  cpi->fn_ptr[BT].svaf = SVAF;                           \
-  cpi->fn_ptr[BT].sdx4df = SDX4DF;                       \
-  cpi->fn_ptr[BT].sdx8f = SDX8F;
-
-  // TODO(angiebird): make sdx8f available for every block size
-  BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg, vpx_variance32x16,
-      vpx_sub_pixel_variance32x16, vpx_sub_pixel_avg_variance32x16,
-      vpx_sad32x16x4d, NULL)
-
-  BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg, vpx_variance16x32,
-      vpx_sub_pixel_variance16x32, vpx_sub_pixel_avg_variance16x32,
-      vpx_sad16x32x4d, NULL)
-
-  BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg, vpx_variance64x32,
-      vpx_sub_pixel_variance64x32, vpx_sub_pixel_avg_variance64x32,
-      vpx_sad64x32x4d, NULL)
-
-  BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg, vpx_variance32x64,
-      vpx_sub_pixel_variance32x64, vpx_sub_pixel_avg_variance32x64,
-      vpx_sad32x64x4d, NULL)
-
-  BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg, vpx_variance32x32,
-      vpx_sub_pixel_variance32x32, vpx_sub_pixel_avg_variance32x32,
-      vpx_sad32x32x4d, vpx_sad32x32x8)
-
-  BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg, vpx_variance64x64,
-      vpx_sub_pixel_variance64x64, vpx_sub_pixel_avg_variance64x64,
-      vpx_sad64x64x4d, NULL)
-
-  BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg, vpx_variance16x16,
-      vpx_sub_pixel_variance16x16, vpx_sub_pixel_avg_variance16x16,
-      vpx_sad16x16x4d, vpx_sad16x16x8)
-
-  BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg, vpx_variance16x8,
-      vpx_sub_pixel_variance16x8, vpx_sub_pixel_avg_variance16x8,
-      vpx_sad16x8x4d, vpx_sad16x8x8)
-
-  BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg, vpx_variance8x16,
-      vpx_sub_pixel_variance8x16, vpx_sub_pixel_avg_variance8x16,
-      vpx_sad8x16x4d, vpx_sad8x16x8)
-
-  BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg, vpx_variance8x8,
+#define BFP(BT, SDF, SDSF, SDAF, VF, SVF, SVAF, SDX4DF, SDSX4DF) \
+  cpi->fn_ptr[BT].sdf = SDF;                                     \
+  cpi->fn_ptr[BT].sdsf = SDSF;                                   \
+  cpi->fn_ptr[BT].sdaf = SDAF;                                   \
+  cpi->fn_ptr[BT].vf = VF;                                       \
+  cpi->fn_ptr[BT].svf = SVF;                                     \
+  cpi->fn_ptr[BT].svaf = SVAF;                                   \
+  cpi->fn_ptr[BT].sdx4df = SDX4DF;                               \
+  cpi->fn_ptr[BT].sdsx4df = SDSX4DF;
+
+  BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad_skip_32x16, vpx_sad32x16_avg,
+      vpx_variance32x16, vpx_sub_pixel_variance32x16,
+      vpx_sub_pixel_avg_variance32x16, vpx_sad32x16x4d, vpx_sad_skip_32x16x4d)
+
+  BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad_skip_16x32, vpx_sad16x32_avg,
+      vpx_variance16x32, vpx_sub_pixel_variance16x32,
+      vpx_sub_pixel_avg_variance16x32, vpx_sad16x32x4d, vpx_sad_skip_16x32x4d)
+
+  BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad_skip_64x32, vpx_sad64x32_avg,
+      vpx_variance64x32, vpx_sub_pixel_variance64x32,
+      vpx_sub_pixel_avg_variance64x32, vpx_sad64x32x4d, vpx_sad_skip_64x32x4d)
+
+  BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad_skip_32x64, vpx_sad32x64_avg,
+      vpx_variance32x64, vpx_sub_pixel_variance32x64,
+      vpx_sub_pixel_avg_variance32x64, vpx_sad32x64x4d, vpx_sad_skip_32x64x4d)
+
+  BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad_skip_32x32, vpx_sad32x32_avg,
+      vpx_variance32x32, vpx_sub_pixel_variance32x32,
+      vpx_sub_pixel_avg_variance32x32, vpx_sad32x32x4d, vpx_sad_skip_32x32x4d)
+
+  BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad_skip_64x64, vpx_sad64x64_avg,
+      vpx_variance64x64, vpx_sub_pixel_variance64x64,
+      vpx_sub_pixel_avg_variance64x64, vpx_sad64x64x4d, vpx_sad_skip_64x64x4d)
+
+  BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad_skip_16x16, vpx_sad16x16_avg,
+      vpx_variance16x16, vpx_sub_pixel_variance16x16,
+      vpx_sub_pixel_avg_variance16x16, vpx_sad16x16x4d, vpx_sad_skip_16x16x4d)
+
+  BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad_skip_16x8, vpx_sad16x8_avg,
+      vpx_variance16x8, vpx_sub_pixel_variance16x8,
+      vpx_sub_pixel_avg_variance16x8, vpx_sad16x8x4d, vpx_sad_skip_16x8x4d)
+
+  BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad_skip_8x16, vpx_sad8x16_avg,
+      vpx_variance8x16, vpx_sub_pixel_variance8x16,
+      vpx_sub_pixel_avg_variance8x16, vpx_sad8x16x4d, vpx_sad_skip_8x16x4d)
+
+  BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad_skip_8x8, vpx_sad8x8_avg, vpx_variance8x8,
       vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d,
-      vpx_sad8x8x8)
+      vpx_sad_skip_8x8x4d)
 
-  BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg, vpx_variance8x4,
+  BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad_skip_8x4, vpx_sad8x4_avg, vpx_variance8x4,
       vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d,
-      NULL)
+      vpx_sad_skip_8x4x4d)
 
-  BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg, vpx_variance4x8,
+  BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad_skip_4x8, vpx_sad4x8_avg, vpx_variance4x8,
       vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d,
-      NULL)
+      vpx_sad_skip_4x8x4d)
 
-  BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg, vpx_variance4x4,
+  BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad_skip_4x4, vpx_sad4x4_avg, vpx_variance4x4,
       vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d,
-      vpx_sad4x4x8)
+      vpx_sad_skip_4x4x4d)
 
 #if CONFIG_VP9_HIGHBITDEPTH
   highbd_set_var_fns(cpi);
@@ -2659,8 +2754,12 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf,
 
 #if CONFIG_RATE_CTRL
   encode_command_init(&cpi->encode_command);
-  partition_info_init(cpi);
-  motion_vector_info_init(cpi);
+  if (oxcf->use_simple_encode_api) {
+    partition_info_init(cpi);
+    motion_vector_info_init(cpi);
+    fp_motion_vector_info_init(cpi);
+    tpl_stats_info_init(cpi);
+  }
 #endif
 
   return cpi;
@@ -2673,12 +2772,9 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf,
   snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T), (V))
 #endif  // CONFIG_INTERNAL_STATS
 
-static void free_tpl_buffer(VP9_COMP *cpi);
-
 void vp9_remove_compressor(VP9_COMP *cpi) {
   VP9_COMMON *cm;
   unsigned int i;
-  int t;
 
   if (!cpi) return;
 
@@ -2769,7 +2865,7 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
 #if 0
     {
       printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000);
-      printf("\n_frames recive_data encod_mb_row compress_frame  Total\n");
+      printf("\n_frames receive_data encod_mb_row compress_frame  Total\n");
       printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame,
              cpi->time_receive_data / 1000, cpi->time_encode_sb_row / 1000,
              cpi->time_compress_data / 1000,
@@ -2789,30 +2885,12 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
     vpx_free(cpi->kmeans_data_arr);
   }
 
-  free_tpl_buffer(cpi);
-
-  for (t = 0; t < cpi->num_workers; ++t) {
-    VPxWorker *const worker = &cpi->workers[t];
-    EncWorkerData *const thread_data = &cpi->tile_thr_data[t];
+  vp9_free_tpl_buffer(cpi);
 
-    // Deallocate allocated threads.
-    vpx_get_worker_interface()->end(worker);
-
-    // Deallocate allocated thread data.
-    if (t < cpi->num_workers - 1) {
-      vpx_free(thread_data->td->counts);
-      vp9_free_pc_tree(thread_data->td);
-      vpx_free(thread_data->td);
-    }
-  }
-  vpx_free(cpi->tile_thr_data);
-  vpx_free(cpi->workers);
+  vp9_loop_filter_dealloc(&cpi->lf_row_sync);
+  vp9_bitstream_encode_tiles_buffer_dealloc(cpi);
   vp9_row_mt_mem_dealloc(cpi);
-
-  if (cpi->num_workers > 1) {
-    vp9_loop_filter_dealloc(&cpi->lf_row_sync);
-    vp9_bitstream_encode_tiles_buffer_dealloc(cpi);
-  }
+  vp9_encode_free_mt_data(cpi);
 
 #if !CONFIG_REALTIME_ONLY
   vp9_alt_ref_aq_destroy(cpi->alt_ref_aq);
@@ -2825,12 +2903,11 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
     vpx_free(cpi->mbgraph_stats[i].mb_stats);
   }
 
-#if CONFIG_FP_MB_STATS
-  if (cpi->use_fp_mb_stats) {
-    vpx_free(cpi->twopass.frame_mb_stats_buf);
-    cpi->twopass.frame_mb_stats_buf = NULL;
-  }
-#endif
+  vp9_extrc_delete(&cpi->ext_ratectrl);
+
+  // Help detect use after free of the error detail string.
+  memset(cm->error.detail, 'A', sizeof(cm->error.detail) - 1);
+  cm->error.detail[sizeof(cm->error.detail) - 1] = '\0';
 
   vp9_remove_common(cm);
   vp9_free_ref_frame_buffers(cm->buffer_pool);
@@ -2901,7 +2978,7 @@ void vp9_update_reference(VP9_COMP *cpi, int ref_frame_flags) {
 
 static YV12_BUFFER_CONFIG *get_vp9_ref_frame_buffer(
     VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag) {
-  MV_REFERENCE_FRAME ref_frame = NONE;
+  MV_REFERENCE_FRAME ref_frame = NO_REF_FRAME;
   if (ref_frame_flag == VP9_LAST_FLAG)
     ref_frame = LAST_FRAME;
   else if (ref_frame_flag == VP9_GOLD_FLAG)
@@ -2909,7 +2986,8 @@ static YV12_BUFFER_CONFIG *get_vp9_ref_frame_buffer(
   else if (ref_frame_flag == VP9_ALT_FLAG)
     ref_frame = ALTREF_FRAME;
 
-  return ref_frame == NONE ? NULL : get_ref_frame_buffer(cpi, ref_frame);
+  return ref_frame == NO_REF_FRAME ? NULL
+                                   : get_ref_frame_buffer(cpi, ref_frame);
 }
 
 int vp9_copy_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag,
@@ -3002,12 +3080,11 @@ void vp9_write_yuv_rec_frame(VP9_COMMON *cm) {
 #endif
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
-                                                YV12_BUFFER_CONFIG *dst,
-                                                int bd) {
+void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+                                             YV12_BUFFER_CONFIG *dst, int bd) {
 #else
-static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
-                                                YV12_BUFFER_CONFIG *dst) {
+void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+                                             YV12_BUFFER_CONFIG *dst) {
 #endif  // CONFIG_VP9_HIGHBITDEPTH
   // TODO(dkovalev): replace YV12_BUFFER_CONFIG with vpx_image_t
   int i;
@@ -3052,6 +3129,23 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
   const int src_h = src->y_crop_height;
   const int dst_w = dst->y_crop_width;
   const int dst_h = dst->y_crop_height;
+
+  // The issue b/311394513 reveals a corner case bug.
+  // For bd = 8, vpx_scaled_2d() requires both x_step_q4 and y_step_q4 are less
+  // than or equal to 64. For bd >= 10, vpx_highbd_convolve8() requires both
+  // x_step_q4 and y_step_q4 are less than or equal to 32. If this condition
+  // isn't met, it needs to call vp9_scale_and_extend_frame_nonnormative() that
+  // supports arbitrary scaling.
+  const int x_step_q4 = 16 * src_w / dst_w;
+  const int y_step_q4 = 16 * src_h / dst_h;
+  const int is_arbitrary_scaling =
+      (bd == 8 && (x_step_q4 > 64 || y_step_q4 > 64)) ||
+      (bd >= 10 && (x_step_q4 > 32 || y_step_q4 > 32));
+  if (is_arbitrary_scaling) {
+    vp9_scale_and_extend_frame_nonnormative(src, dst, bd);
+    return;
+  }
+
   const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer,
                                    src->v_buffer };
   const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride };
@@ -3295,7 +3389,7 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
   vp9_denoiser_update_ref_frame(cpi);
 #endif
 
-  if (is_one_pass_cbr_svc(cpi)) vp9_svc_update_ref_frame(cpi);
+  if (is_one_pass_svc(cpi)) vp9_svc_update_ref_frame(cpi);
 }
 
 static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
@@ -3314,6 +3408,13 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
     return;
   }
 
+  if (cpi->loopfilter_ctrl == NO_LOOPFILTER ||
+      (!is_reference_frame && cpi->loopfilter_ctrl == LOOPFILTER_REFERENCE)) {
+    lf->filter_level = 0;
+    vpx_extend_frame_inner_borders(cm->frame_to_show);
+    return;
+  }
+
   if (xd->lossless) {
     lf->filter_level = 0;
     lf->last_filt_level = 0;
@@ -3353,19 +3454,6 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
   vpx_extend_frame_inner_borders(cm->frame_to_show);
 }
 
-static INLINE void alloc_frame_mvs(VP9_COMMON *const cm, int buffer_idx) {
-  RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx];
-  if (new_fb_ptr->mvs == NULL || new_fb_ptr->mi_rows < cm->mi_rows ||
-      new_fb_ptr->mi_cols < cm->mi_cols) {
-    vpx_free(new_fb_ptr->mvs);
-    CHECK_MEM_ERROR(cm, new_fb_ptr->mvs,
-                    (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
-                                         sizeof(*new_fb_ptr->mvs)));
-    new_fb_ptr->mi_rows = cm->mi_rows;
-    new_fb_ptr->mi_cols = cm->mi_cols;
-  }
-}
-
 void vp9_scale_references(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
   MV_REFERENCE_FRAME ref_frame;
@@ -3683,6 +3771,10 @@ static void set_size_dependent_vars(VP9_COMP *cpi, int *q, int *bottom_index,
     cpi->rc.force_max_q = 0;
   }
 
+  if (cpi->use_svc) {
+    cpi->svc.base_qindex[cpi->svc.spatial_layer_id] = *q;
+  }
+
   if (!frame_is_intra_only(cm)) {
     vp9_set_high_precision_mv(cpi, (*q) < HIGH_PRECISION_MV_QTHRESH);
   }
@@ -3708,9 +3800,9 @@ static void set_size_dependent_vars(VP9_COMP *cpi, int *q, int *bottom_index,
       case 6: l = 150; break;
     }
     if (!cpi->common.postproc_state.limits) {
-      cpi->common.postproc_state.limits =
-          vpx_calloc(cpi->un_scaled_source->y_width,
-                     sizeof(*cpi->common.postproc_state.limits));
+      CHECK_MEM_ERROR(&cm->error, cpi->common.postproc_state.limits,
+                      vpx_calloc(cpi->un_scaled_source->y_width,
+                                 sizeof(*cpi->common.postproc_state.limits)));
     }
     vp9_denoise(&cpi->common, cpi->Source, cpi->Source, l,
                 cpi->common.postproc_state.limits);
@@ -3797,6 +3889,7 @@ static void set_frame_size(VP9_COMP *cpi) {
   alloc_util_frame_buffers(cpi);
   init_motion_estimation(cpi);
 
+  int has_valid_ref_frame = 0;
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - 1];
     const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
@@ -3815,22 +3908,25 @@ static void set_frame_size(VP9_COMP *cpi) {
                                         buf->y_crop_height, cm->width,
                                         cm->height);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+      has_valid_ref_frame |= vp9_is_valid_scale(&ref_buf->sf);
       if (vp9_is_scaled(&ref_buf->sf)) vpx_extend_frame_borders(buf);
     } else {
       ref_buf->buf = NULL;
     }
   }
+  if (!frame_is_intra_only(cm) && !has_valid_ref_frame) {
+    vpx_internal_error(
+        &cm->error, VPX_CODEC_CORRUPT_FRAME,
+        "Can't find at least one reference frame with valid size");
+  }
 
   set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
 }
 
-#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
 static void save_encode_params(VP9_COMP *cpi) {
-  VP9_COMMON *const cm = &cpi->common;
-  const int tile_cols = 1 << cm->log2_tile_cols;
-  const int tile_rows = 1 << cm->log2_tile_rows;
-  int tile_col, tile_row;
+  int tile_idx;
   int i, j;
+  TileDataEnc *tile_data;
   RD_OPT *rd_opt = &cpi->rd;
   for (i = 0; i < MAX_REF_FRAMES; i++) {
     for (j = 0; j < REFERENCE_MODES; j++)
@@ -3841,21 +3937,12 @@ static void save_encode_params(VP9_COMP *cpi) {
       rd_opt->filter_threshes_prev[i][j] = rd_opt->filter_threshes[i][j];
   }
 
-  if (cpi->tile_data != NULL) {
-    for (tile_row = 0; tile_row < tile_rows; ++tile_row)
-      for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
-        TileDataEnc *tile_data =
-            &cpi->tile_data[tile_row * tile_cols + tile_col];
-        for (i = 0; i < BLOCK_SIZES; ++i) {
-          for (j = 0; j < MAX_MODES; ++j) {
-            tile_data->thresh_freq_fact_prev[i][j] =
-                tile_data->thresh_freq_fact[i][j];
-          }
-        }
-      }
+  for (tile_idx = 0; tile_idx < cpi->allocated_tiles; tile_idx++) {
+    assert(cpi->tile_data);
+    tile_data = &cpi->tile_data[tile_idx];
+    vp9_copy(tile_data->thresh_freq_fact_prev, tile_data->thresh_freq_fact);
   }
 }
-#endif  // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
 
 static INLINE void set_raw_source_frame(VP9_COMP *cpi) {
 #ifdef ENABLE_KF_DENOISE
@@ -3878,11 +3965,11 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
   int q = 0, bottom_index = 0, top_index = 0;
   int no_drop_scene_change = 0;
   const INTERP_FILTER filter_scaler =
-      (is_one_pass_cbr_svc(cpi))
+      (is_one_pass_svc(cpi))
           ? svc->downsample_filter_type[svc->spatial_layer_id]
           : EIGHTTAP;
   const int phase_scaler =
-      (is_one_pass_cbr_svc(cpi))
+      (is_one_pass_svc(cpi))
           ? svc->downsample_filter_phase[svc->spatial_layer_id]
           : 0;
 
@@ -3903,7 +3990,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
 
   set_frame_size(cpi);
 
-  if (is_one_pass_cbr_svc(cpi) &&
+  if (is_one_pass_svc(cpi) &&
       cpi->un_scaled_source->y_width == cm->width << 2 &&
       cpi->un_scaled_source->y_height == cm->height << 2 &&
       svc->scaled_temp.y_width == cm->width << 1 &&
@@ -3917,7 +4004,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
         cm, cpi->un_scaled_source, &cpi->scaled_source, &svc->scaled_temp,
         filter_scaler, phase_scaler, filter_scaler2, phase_scaler2);
     svc->scaled_one_half = 1;
-  } else if (is_one_pass_cbr_svc(cpi) &&
+  } else if (is_one_pass_svc(cpi) &&
              cpi->un_scaled_source->y_width == cm->width << 1 &&
              cpi->un_scaled_source->y_height == cm->height << 1 &&
              svc->scaled_one_half) {
@@ -3932,7 +4019,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
   }
 #ifdef OUTPUT_YUV_SVC_SRC
   // Write out at most 3 spatial layers.
-  if (is_one_pass_cbr_svc(cpi) && svc->spatial_layer_id < 3) {
+  if (is_one_pass_svc(cpi) && svc->spatial_layer_id < 3) {
     vpx_write_yuv_frame(yuv_svc_src[svc->spatial_layer_id], cpi->Source);
   }
 #endif
@@ -4002,6 +4089,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
   cpi->rc.hybrid_intra_scene_change = 0;
   cpi->rc.re_encode_maxq_scene_change = 0;
   if (cm->show_frame && cpi->oxcf.mode == REALTIME &&
+      !cpi->disable_scene_detection_rtc_ratectrl &&
       (cpi->oxcf.rc_mode == VPX_VBR ||
        cpi->oxcf.content == VP9E_CONTENT_SCREEN ||
        (cpi->oxcf.speed >= 5 && cpi->oxcf.speed < 8)))
@@ -4041,11 +4129,14 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
     if (vp9_rc_drop_frame(cpi)) return 0;
   }
 
-  // For 1 pass CBR SVC, only ZEROMV is allowed for spatial reference frame
+  // For 1 pass SVC, only ZEROMV is allowed for spatial reference frame
   // when svc->force_zero_mode_spatial_ref = 1. Under those conditions we can
   // avoid this frame-level upsampling (for non intra_only frames).
+  // For SVC single_layer mode, dynamic resize is allowed and we need to
+  // scale references for this case.
   if (frame_is_intra_only(cm) == 0 &&
-      !(is_one_pass_cbr_svc(cpi) && svc->force_zero_mode_spatial_ref)) {
+      ((svc->single_layer_svc && cpi->oxcf.resize_mode == RESIZE_DYNAMIC) ||
+       !(is_one_pass_svc(cpi) && svc->force_zero_mode_spatial_ref))) {
     vp9_scale_references(cpi);
   }
 
@@ -4061,7 +4152,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
       svc->spatial_layer_id == svc->number_spatial_layers - 2) {
     if (svc->prev_partition_svc == NULL) {
       CHECK_MEM_ERROR(
-          cm, svc->prev_partition_svc,
+          &cm->error, svc->prev_partition_svc,
           (BLOCK_SIZE *)vpx_calloc(cm->mi_stride * cm->mi_rows,
                                    sizeof(*svc->prev_partition_svc)));
     }
@@ -4130,11 +4221,22 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
     vp9_alt_ref_aq_setup_map(cpi->alt_ref_aq, cpi);
   } else {
 #endif
-    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+    // If ROI is enabled and skip feature is used for segmentation, apply cyclic
+    // refresh but not apply ROI for skip for the first 20 frames (defined by
+    // FRAMES_NO_SKIPPING_AFTER_KEY) after key frame to improve quality.
+    if (cpi->roi.enabled && !frame_is_intra_only(cm)) {
+      if (cpi->roi.skip[BACKGROUND_SEG_SKIP_ID]) {
+        if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+          vp9_cyclic_refresh_setup(cpi);
+        if (cpi->rc.frames_since_key > FRAMES_NO_SKIPPING_AFTER_KEY)
+          apply_roi_map(cpi);
+      } else {
+        apply_roi_map(cpi);
+      }
+    } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
       vp9_cyclic_refresh_setup(cpi);
-    } else if (cpi->roi.enabled && !frame_is_intra_only(cm)) {
-      apply_roi_map(cpi);
     }
+
 #if !CONFIG_REALTIME_ONLY
   }
 #endif
@@ -4180,7 +4282,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
 
   // Update some stats from cyclic refresh, and check for golden frame update.
   if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
-      !frame_is_intra_only(cm))
+      !frame_is_intra_only(cm) && cpi->cyclic_refresh->content_mode)
     vp9_cyclic_refresh_postencode(cpi);
 
   // Update the skip mb flag probabilities based on the distribution
@@ -4190,6 +4292,27 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
   return 1;
 }
 
+static int get_ref_frame_flags(const VP9_COMP *cpi) {
+  const int *const map = cpi->common.ref_frame_map;
+  const int gold_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idx];
+  const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idx];
+  const int gold_is_alt = map[cpi->gld_fb_idx] == map[cpi->alt_fb_idx];
+  int flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
+
+  if (gold_is_last) flags &= ~VP9_GOLD_FLAG;
+
+  if (cpi->rc.frames_till_gf_update_due == INT_MAX &&
+      (cpi->svc.number_temporal_layers == 1 &&
+       cpi->svc.number_spatial_layers == 1))
+    flags &= ~VP9_GOLD_FLAG;
+
+  if (alt_is_last) flags &= ~VP9_ALT_FLAG;
+
+  if (gold_is_alt) flags &= ~VP9_ALT_FLAG;
+
+  return flags;
+}
+
 #if !CONFIG_REALTIME_ONLY
 #define MAX_QSTEP_ADJ 4
 static int get_qstep_adj(int rate_excess, int rate_limit) {
@@ -4198,8 +4321,149 @@ static int get_qstep_adj(int rate_excess, int rate_limit) {
   return VPXMIN(qstep, MAX_QSTEP_ADJ);
 }
 
-static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
-                                    uint8_t *dest) {
+#if CONFIG_RATE_CTRL
+static void init_rq_history(RATE_QINDEX_HISTORY *rq_history) {
+  rq_history->recode_count = 0;
+  rq_history->q_index_high = 255;
+  rq_history->q_index_low = 0;
+}
+
+static void update_rq_history(RATE_QINDEX_HISTORY *rq_history, int target_bits,
+                              int actual_bits, int q_index) {
+  rq_history->q_index_history[rq_history->recode_count] = q_index;
+  rq_history->rate_history[rq_history->recode_count] = actual_bits;
+  if (actual_bits <= target_bits) {
+    rq_history->q_index_high = q_index;
+  }
+  if (actual_bits >= target_bits) {
+    rq_history->q_index_low = q_index;
+  }
+  rq_history->recode_count += 1;
+}
+
+static int guess_q_index_from_model(const RATE_QSTEP_MODEL *rq_model,
+                                    int target_bits) {
+  // The model predicts bits as follows.
+  // target_bits = bias - ratio * log2(q_step)
+  // Given the target_bits, we compute the q_step as follows.
+  double q_step;
+  assert(rq_model->ratio > 0);
+  q_step = pow(2.0, (rq_model->bias - target_bits) / rq_model->ratio);
+  // TODO(angiebird): Make this function support highbitdepth.
+  return vp9_convert_q_to_qindex(q_step, VPX_BITS_8);
+}
+
+static int guess_q_index_linear(int prev_q_index, int target_bits,
+                                int actual_bits, int gap) {
+  int q_index = prev_q_index;
+  if (actual_bits < target_bits) {
+    q_index -= gap;
+    q_index = VPXMAX(q_index, 0);
+  } else {
+    q_index += gap;
+    q_index = VPXMIN(q_index, 255);
+  }
+  return q_index;
+}
+
+static double get_bits_percent_diff(int target_bits, int actual_bits) {
+  double diff;
+  target_bits = VPXMAX(target_bits, 1);
+  diff = abs(target_bits - actual_bits) * 1. / target_bits;
+  return diff * 100;
+}
+
+static int rq_model_predict_q_index(const RATE_QSTEP_MODEL *rq_model,
+                                    const RATE_QINDEX_HISTORY *rq_history,
+                                    int target_bits) {
+  int q_index = 128;
+  if (rq_history->recode_count > 0) {
+    const int actual_bits =
+        rq_history->rate_history[rq_history->recode_count - 1];
+    const int prev_q_index =
+        rq_history->q_index_history[rq_history->recode_count - 1];
+    const double percent_diff = get_bits_percent_diff(target_bits, actual_bits);
+    if (percent_diff > 50) {
+      // Binary search.
+      // When the actual_bits and target_bits are far apart, binary search
+      // q_index is faster.
+      q_index = (rq_history->q_index_low + rq_history->q_index_high) / 2;
+    } else {
+      if (rq_model->ready) {
+        q_index = guess_q_index_from_model(rq_model, target_bits);
+      } else {
+        // TODO(angiebird): Find a better way to set the gap.
+        q_index =
+            guess_q_index_linear(prev_q_index, target_bits, actual_bits, 20);
+      }
+    }
+  } else {
+    if (rq_model->ready) {
+      q_index = guess_q_index_from_model(rq_model, target_bits);
+    }
+  }
+
+  assert(rq_history->q_index_low <= rq_history->q_index_high);
+  if (q_index <= rq_history->q_index_low) {
+    q_index = rq_history->q_index_low + 1;
+  }
+  if (q_index >= rq_history->q_index_high) {
+    q_index = rq_history->q_index_high - 1;
+  }
+  return q_index;
+}
+
+static void rq_model_update(const RATE_QINDEX_HISTORY *rq_history,
+                            int target_bits, RATE_QSTEP_MODEL *rq_model) {
+  const int recode_count = rq_history->recode_count;
+  const double delta = 0.00001;
+  if (recode_count >= 2) {
+    const int q_index1 = rq_history->q_index_history[recode_count - 2];
+    const int q_index2 = rq_history->q_index_history[recode_count - 1];
+    const int r1 = rq_history->rate_history[recode_count - 2];
+    const int r2 = rq_history->rate_history[recode_count - 1];
+    int valid = 0;
+    // lower q_index should yield higher bit rate
+    if (q_index1 < q_index2) {
+      valid = r1 > r2;
+    } else if (q_index1 > q_index2) {
+      valid = r1 < r2;
+    }
+    // Only update the model when the q_index and rate behave normally.
+    if (valid) {
+      // Fit the ratio and bias of rq_model based on last two recode histories.
+      const double s1 = vp9_convert_qindex_to_q(q_index1, VPX_BITS_8);
+      const double s2 = vp9_convert_qindex_to_q(q_index2, VPX_BITS_8);
+      if (fabs(log2(s1) - log2(s2)) > delta) {
+        rq_model->ratio = (r2 - r1) / (log2(s1) - log2(s2));
+        rq_model->bias = r1 + (rq_model->ratio) * log2(s1);
+        if (rq_model->ratio > delta && rq_model->bias > delta) {
+          rq_model->ready = 1;
+        }
+      }
+    }
+  } else if (recode_count == 1) {
+    if (rq_model->ready) {
+      // Update the ratio only when the initial model exists and we only have
+      // one recode history.
+      const int prev_q = rq_history->q_index_history[recode_count - 1];
+      const double prev_q_step = vp9_convert_qindex_to_q(prev_q, VPX_BITS_8);
+      if (fabs(log2(prev_q_step)) > delta) {
+        const int actual_bits = rq_history->rate_history[recode_count - 1];
+        rq_model->ratio =
+            rq_model->ratio + (target_bits - actual_bits) / log2(prev_q_step);
+      }
+    }
+  }
+}
+#endif  // CONFIG_RATE_CTRL
+
+static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
+#if CONFIG_RATE_CTRL
+                                    ,
+                                    RATE_QINDEX_HISTORY *rq_history
+#endif  // CONFIG_RATE_CTRL
+) {
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
@@ -4217,6 +4481,39 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
   int qrange_adj = 1;
 #endif
 
+  // A flag which indicates whether we are recoding the current frame
+  // when the current frame size is larger than the max frame size in the
+  // external rate control model.
+  // This flag doesn't have any impact when external rate control is not used.
+  int ext_rc_recode = 0;
+  // Maximal frame size allowed by the external rate control.
+  // case: 0, we ignore the max frame size limit, and encode with the qindex
+  // passed in by the external rate control model.
+  // If the external qindex is VPX_DEFAULT_Q, libvpx will pick a qindex
+  // and may recode if undershoot/overshoot is seen.
+  // If the external qindex is not VPX_DEFAULT_Q, we force no recode.
+  // case: -1, we take libvpx's decision for the max frame size, as well as
+  // the recode decision.
+  // Otherwise: if a specific size is given, libvpx's recode decision
+  // will respect the given size.
+  int ext_rc_max_frame_size = 0;
+  // Use VP9's decision of qindex. This flag is in use only in external rate
+  // control model to help determine whether to recode when
+  // |ext_rc_max_frame_size| is 0.
+  int ext_rc_use_default_q = 1;
+  const int orig_rc_max_frame_bandwidth = rc->max_frame_bandwidth;
+
+#if CONFIG_RATE_CTRL
+  RATE_QSTEP_MODEL *rq_model;
+  {
+    const FRAME_UPDATE_TYPE update_type =
+        cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index];
+    const ENCODE_FRAME_TYPE frame_type = get_encode_frame_type(update_type);
+    rq_model = &cpi->rq_model[frame_type];
+  }
+  init_rq_history(rq_history);
+#endif  // CONFIG_RATE_CTRL
+
   if (cm->show_existing_frame) {
     rc->this_frame_target = 0;
     if (is_psnr_calc_enabled(cpi)) set_raw_source_frame(cpi);
@@ -4229,6 +4526,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
                                        (cpi->twopass.gf_group.index == 1)
                                  : 0;
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  printf("\n Encoding a frame: \n");
+#endif
   do {
     vpx_clear_system_state();
 
@@ -4305,10 +4605,50 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
 #if CONFIG_RATE_CTRL
     // TODO(angiebird): This is a hack for making sure the encoder use the
     // external_quantize_index exactly. Avoid this kind of hack later.
-    if (cpi->encode_command.use_external_quantize_index) {
-      q = cpi->encode_command.external_quantize_index;
+    if (cpi->oxcf.use_simple_encode_api) {
+      if (cpi->encode_command.use_external_target_frame_bits) {
+        q = rq_model_predict_q_index(rq_model, rq_history,
+                                     rc->this_frame_target);
+      }
+      if (cpi->encode_command.use_external_quantize_index) {
+        q = cpi->encode_command.external_quantize_index;
+      }
+    }
+#endif  // CONFIG_RATE_CTRL
+    if (cpi->ext_ratectrl.ready && !ext_rc_recode &&
+        (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0 &&
+        cpi->ext_ratectrl.funcs.get_encodeframe_decision != NULL) {
+      vpx_codec_err_t codec_status;
+      const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+      vpx_rc_encodeframe_decision_t encode_frame_decision;
+      FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index];
+      const int ref_frame_flags = get_ref_frame_flags(cpi);
+      RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES];
+      const RefCntBuffer *curr_frame_buf =
+          get_ref_cnt_buffer(cm, cm->new_fb_idx);
+      // index 0 of a gf group is always KEY/OVERLAY/GOLDEN.
+      // index 1 refers to the first encoding frame in a gf group.
+      // Therefore if it is ARF_UPDATE, it means this gf group uses alt ref.
+      // See function define_gf_group_structure().
+      const int use_alt_ref = gf_group->update_type[1] == ARF_UPDATE;
+      get_ref_frame_bufs(cpi, ref_frame_bufs);
+      codec_status = vp9_extrc_get_encodeframe_decision(
+          &cpi->ext_ratectrl, curr_frame_buf->frame_index,
+          cm->current_frame_coding_index, gf_group->index, update_type,
+          gf_group->gf_group_size, use_alt_ref, ref_frame_bufs, ref_frame_flags,
+          &encode_frame_decision);
+      if (codec_status != VPX_CODEC_OK) {
+        vpx_internal_error(&cm->error, codec_status,
+                           "vp9_extrc_get_encodeframe_decision() failed");
+      }
+      // If the external model recommends a reserved value, we use
+      // libvpx's default q.
+      if (encode_frame_decision.q_index != VPX_DEFAULT_Q) {
+        q = encode_frame_decision.q_index;
+        ext_rc_use_default_q = 0;
+      }
+      ext_rc_max_frame_size = encode_frame_decision.max_frame_size;
     }
-#endif
 
     vp9_set_quantizer(cpi, q);
 
@@ -4348,15 +4688,57 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
       if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1;
     }
 
+    if (cpi->ext_ratectrl.ready &&
+        (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0) {
+      // In general, for the external rate control, we take the qindex provided
+      // as input and encode the frame with this qindex faithfully. However,
+      // in some extreme scenarios, the provided qindex leads to a massive
+      // overshoot of frame size. In this case, we fall back to VP9's decision
+      // to pick a new qindex and recode the frame. We return the new qindex
+      // through the API to the external model.
+      if (ext_rc_max_frame_size == 0) {
+        if (!ext_rc_use_default_q) break;
+      } else if (ext_rc_max_frame_size == -1) {
+        // Do nothing, fall back to libvpx's recode decision.
+      } else {
+        // Change the max frame size, used in libvpx's recode decision.
+        rc->max_frame_bandwidth = ext_rc_max_frame_size;
+      }
+      ext_rc_recode = 1;
+    }
 #if CONFIG_RATE_CTRL
-    // This part needs to be after save_coding_context() because
-    // restore_coding_context will be called in the end of this function.
-    // TODO(angiebird): This is a hack for making sure the encoder use the
-    // external_quantize_index exactly. Avoid this kind of hack later.
-    if (cpi->encode_command.use_external_quantize_index) {
-      break;
+    if (cpi->oxcf.use_simple_encode_api) {
+      // This part needs to be after save_coding_context() because
+      // restore_coding_context will be called in the end of this function.
+      // TODO(angiebird): This is a hack for making sure the encoder use the
+      // external_quantize_index exactly. Avoid this kind of hack later.
+      if (cpi->encode_command.use_external_quantize_index) {
+        break;
+      }
+
+      if (cpi->encode_command.use_external_target_frame_bits) {
+        const double percent_diff = get_bits_percent_diff(
+            rc->this_frame_target, rc->projected_frame_size);
+        update_rq_history(rq_history, rc->this_frame_target,
+                          rc->projected_frame_size, q);
+        loop_count += 1;
+
+        rq_model_update(rq_history, rc->this_frame_target, rq_model);
+
+        // Check if we hit the target bitrate.
+        if (percent_diff <=
+                cpi->encode_command.target_frame_bits_error_percent ||
+            rq_history->recode_count >= RATE_CTRL_MAX_RECODE_NUM ||
+            rq_history->q_index_low >= rq_history->q_index_high) {
+          break;
+        }
+
+        loop = 1;
+        restore_coding_context(cpi);
+        continue;
+      }
     }
-#endif
+#endif  // CONFIG_RATE_CTRL
 
     if (oxcf->rc_mode == VPX_Q) {
       loop = 0;
@@ -4535,8 +4917,13 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
 
     if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF)
       if (loop) restore_coding_context(cpi);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    if (loop) printf("\n Recoding:");
+#endif
   } while (loop);
 
+  rc->max_frame_bandwidth = orig_rc_max_frame_bandwidth;
+
 #ifdef AGGRESSIVE_VBR
   if (two_pass_first_group_inter(cpi)) {
     cpi->twopass.active_worst_quality =
@@ -4571,27 +4958,6 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
 }
 #endif  // !CONFIG_REALTIME_ONLY
 
-static int get_ref_frame_flags(const VP9_COMP *cpi) {
-  const int *const map = cpi->common.ref_frame_map;
-  const int gold_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idx];
-  const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idx];
-  const int gold_is_alt = map[cpi->gld_fb_idx] == map[cpi->alt_fb_idx];
-  int flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
-
-  if (gold_is_last) flags &= ~VP9_GOLD_FLAG;
-
-  if (cpi->rc.frames_till_gf_update_due == INT_MAX &&
-      (cpi->svc.number_temporal_layers == 1 &&
-       cpi->svc.number_spatial_layers == 1))
-    flags &= ~VP9_GOLD_FLAG;
-
-  if (alt_is_last) flags &= ~VP9_ALT_FLAG;
-
-  if (gold_is_alt) flags &= ~VP9_ALT_FLAG;
-
-  return flags;
-}
-
 static void set_ext_overrides(VP9_COMP *cpi) {
   // Overrides the defaults with the externally supplied values with
   // vp9_update_reference() and vp9_update_entropy() calls
@@ -4651,13 +5017,14 @@ YV12_BUFFER_CONFIG *vp9_scale_if_required(
         scale_and_extend_frame(unscaled, scaled, (int)cm->bit_depth,
                                filter_type, phase_scaler);
     else
-      scale_and_extend_frame_nonnormative(unscaled, scaled, (int)cm->bit_depth);
+      vp9_scale_and_extend_frame_nonnormative(unscaled, scaled,
+                                              (int)cm->bit_depth);
 #else
     if (use_normative_scaler && unscaled->y_width <= (scaled->y_width << 1) &&
         unscaled->y_height <= (scaled->y_height << 1))
       vp9_scale_and_extend_frame(unscaled, scaled, filter_type, phase_scaler);
     else
-      scale_and_extend_frame_nonnormative(unscaled, scaled);
+      vp9_scale_and_extend_frame_nonnormative(unscaled, scaled);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
     return scaled;
   } else {
@@ -4709,8 +5076,8 @@ static int setup_interp_filter_search_mask(VP9_COMP *cpi) {
 
 #ifdef ENABLE_KF_DENOISE
 // Baseline kernel weights for denoise
-static uint8_t dn_kernal_3[9] = { 1, 2, 1, 2, 4, 2, 1, 2, 1 };
-static uint8_t dn_kernal_5[25] = { 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 4,
+static uint8_t dn_kernel_3[9] = { 1, 2, 1, 2, 4, 2, 1, 2, 1 };
+static uint8_t dn_kernel_5[25] = { 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 4,
                                    2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1 };
 
 static INLINE void add_denoise_point(int centre_val, int data_val, int thresh,
@@ -4727,17 +5094,17 @@ static void spatial_denoise_point(uint8_t *src_ptr, const int stride,
   int sum_weight = 0;
   int sum_val = 0;
   int thresh = strength;
-  int kernal_size = 5;
+  int kernel_size = 5;
   int half_k_size = 2;
   int i, j;
   int max_diff = 0;
   uint8_t *tmp_ptr;
-  uint8_t *kernal_ptr;
+  uint8_t *kernel_ptr;
 
   // Find the maximum deviation from the source point in the locale.
   tmp_ptr = src_ptr - (stride * (half_k_size + 1)) - (half_k_size + 1);
-  for (i = 0; i < kernal_size + 2; ++i) {
-    for (j = 0; j < kernal_size + 2; ++j) {
+  for (i = 0; i < kernel_size + 2; ++i) {
+    for (j = 0; j < kernel_size + 2; ++j) {
       max_diff = VPXMAX(max_diff, abs((int)*src_ptr - (int)tmp_ptr[j]));
     }
     tmp_ptr += stride;
@@ -4745,19 +5112,19 @@ static void spatial_denoise_point(uint8_t *src_ptr, const int stride,
 
   // Select the kernel size.
   if (max_diff > (strength + (strength >> 1))) {
-    kernal_size = 3;
+    kernel_size = 3;
     half_k_size = 1;
     thresh = thresh >> 1;
   }
-  kernal_ptr = (kernal_size == 3) ? dn_kernal_3 : dn_kernal_5;
+  kernel_ptr = (kernel_size == 3) ? dn_kernel_3 : dn_kernel_5;
 
   // Apply the kernel
   tmp_ptr = src_ptr - (stride * half_k_size) - half_k_size;
-  for (i = 0; i < kernal_size; ++i) {
-    for (j = 0; j < kernal_size; ++j) {
-      add_denoise_point((int)*src_ptr, (int)tmp_ptr[j], thresh, *kernal_ptr,
+  for (i = 0; i < kernel_size; ++i) {
+    for (j = 0; j < kernel_size; ++j) {
+      add_denoise_point((int)*src_ptr, (int)tmp_ptr[j], thresh, *kernel_ptr,
                         &sum_val, &sum_weight);
-      ++kernal_ptr;
+      ++kernel_ptr;
     }
     tmp_ptr += stride;
   }
@@ -4772,17 +5139,17 @@ static void highbd_spatial_denoise_point(uint16_t *src_ptr, const int stride,
   int sum_weight = 0;
   int sum_val = 0;
   int thresh = strength;
-  int kernal_size = 5;
+  int kernel_size = 5;
   int half_k_size = 2;
   int i, j;
   int max_diff = 0;
   uint16_t *tmp_ptr;
-  uint8_t *kernal_ptr;
+  uint8_t *kernel_ptr;
 
   // Find the maximum deviation from the source point in the locale.
   tmp_ptr = src_ptr - (stride * (half_k_size + 1)) - (half_k_size + 1);
-  for (i = 0; i < kernal_size + 2; ++i) {
-    for (j = 0; j < kernal_size + 2; ++j) {
+  for (i = 0; i < kernel_size + 2; ++i) {
+    for (j = 0; j < kernel_size + 2; ++j) {
       max_diff = VPXMAX(max_diff, abs((int)src_ptr - (int)tmp_ptr[j]));
     }
     tmp_ptr += stride;
@@ -4790,19 +5157,19 @@ static void highbd_spatial_denoise_point(uint16_t *src_ptr, const int stride,
 
   // Select the kernel size.
   if (max_diff > (strength + (strength >> 1))) {
-    kernal_size = 3;
+    kernel_size = 3;
     half_k_size = 1;
     thresh = thresh >> 1;
   }
-  kernal_ptr = (kernal_size == 3) ? dn_kernal_3 : dn_kernal_5;
+  kernel_ptr = (kernel_size == 3) ? dn_kernel_3 : dn_kernel_5;
 
   // Apply the kernel
   tmp_ptr = src_ptr - (stride * half_k_size) - half_k_size;
-  for (i = 0; i < kernal_size; ++i) {
-    for (j = 0; j < kernal_size; ++j) {
-      add_denoise_point((int)*src_ptr, (int)tmp_ptr[j], thresh, *kernal_ptr,
+  for (i = 0; i < kernel_size; ++i) {
+    for (j = 0; j < kernel_size; ++j) {
+      add_denoise_point((int)*src_ptr, (int)tmp_ptr[j], thresh, *kernel_ptr,
                         &sum_val, &sum_weight);
-      ++kernal_ptr;
+      ++kernel_ptr;
     }
     tmp_ptr += stride;
   }
@@ -4896,9 +5263,7 @@ static void set_frame_index(VP9_COMP *cpi, VP9_COMMON *cm) {
     const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
     ref_buffer->frame_index =
         cm->current_video_frame + gf_group->arf_src_offset[gf_group->index];
-#if CONFIG_RATE_CTRL
     ref_buffer->frame_coding_index = cm->current_frame_coding_index;
-#endif  // CONFIG_RATE_CTRL
   }
 }
 
@@ -4991,7 +5356,7 @@ static void init_mb_wiener_var_buffer(VP9_COMP *cpi) {
   cpi->mb_wiener_variance = NULL;
 
   CHECK_MEM_ERROR(
-      cm, cpi->mb_wiener_variance,
+      &cm->error, cpi->mb_wiener_variance,
       vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(*cpi->mb_wiener_variance)));
   cpi->mb_wiener_var_rows = cm->mb_rows;
   cpi->mb_wiener_var_cols = cm->mb_cols;
@@ -5050,16 +5415,16 @@ static void set_mb_wiener_variance(VP9_COMP *cpi) {
         vpx_highbd_subtract_block(block_size, block_size, src_diff, block_size,
                                   mb_buffer, buf_stride, zero_pred, block_size,
                                   xd->bd);
-        highbd_wht_fwd_txfm(src_diff, block_size, coeff, tx_size);
+        vp9_highbd_wht_fwd_txfm(src_diff, block_size, coeff, tx_size);
       } else {
         vpx_subtract_block(block_size, block_size, src_diff, block_size,
                            mb_buffer, buf_stride, zero_pred, block_size);
-        wht_fwd_txfm(src_diff, block_size, coeff, tx_size);
+        vp9_wht_fwd_txfm(src_diff, block_size, coeff, tx_size);
       }
 #else
       vpx_subtract_block(block_size, block_size, src_diff, block_size,
                          mb_buffer, buf_stride, zero_pred, block_size);
-      wht_fwd_txfm(src_diff, block_size, coeff, tx_size);
+      vp9_wht_fwd_txfm(src_diff, block_size, coeff, tx_size);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
       coeff[0] = 0;
@@ -5093,16 +5458,81 @@ static void set_mb_wiener_variance(VP9_COMP *cpi) {
 }
 
 #if !CONFIG_REALTIME_ONLY
-static void update_encode_frame_result(
-    int ref_frame_flags, FRAME_UPDATE_TYPE update_type,
-    const YV12_BUFFER_CONFIG *source_frame, const RefCntBuffer *coded_frame_buf,
-    RefCntBuffer *ref_frame_buf[MAX_INTER_REF_FRAMES], int quantize_index,
-    uint32_t bit_depth, uint32_t input_bit_depth, const FRAME_COUNTS *counts,
-#if CONFIG_RATE_CTRL
+static void update_encode_frame_result_basic(
+    FRAME_UPDATE_TYPE update_type, int show_idx, int quantize_index,
+    ENCODE_FRAME_RESULT *encode_frame_result) {
+  encode_frame_result->show_idx = show_idx;
+  encode_frame_result->update_type = update_type;
+  encode_frame_result->quantize_index = quantize_index;
+}
+
+#if CONFIG_RATE_CTRL
+static void yv12_buffer_to_image_buffer(const YV12_BUFFER_CONFIG *yv12_buffer,
+                                        IMAGE_BUFFER *image_buffer) {
+  const uint8_t *src_buf_ls[3] = { yv12_buffer->y_buffer, yv12_buffer->u_buffer,
+                                   yv12_buffer->v_buffer };
+  const int src_stride_ls[3] = { yv12_buffer->y_stride, yv12_buffer->uv_stride,
+                                 yv12_buffer->uv_stride };
+  const int w_ls[3] = { yv12_buffer->y_crop_width, yv12_buffer->uv_crop_width,
+                        yv12_buffer->uv_crop_width };
+  const int h_ls[3] = { yv12_buffer->y_crop_height, yv12_buffer->uv_crop_height,
+                        yv12_buffer->uv_crop_height };
+  int plane;
+  for (plane = 0; plane < 3; ++plane) {
+    const int src_stride = src_stride_ls[plane];
+    const int w = w_ls[plane];
+    const int h = h_ls[plane];
+    const uint8_t *src_buf = src_buf_ls[plane];
+    uint8_t *dst_buf = image_buffer->plane_buffer[plane];
+    int r;
+    assert(image_buffer->plane_width[plane] == w);
+    assert(image_buffer->plane_height[plane] == h);
+    for (r = 0; r < h; ++r) {
+      memcpy(dst_buf, src_buf, sizeof(*src_buf) * w);
+      src_buf += src_stride;
+      dst_buf += w;
+    }
+  }
+}
+// This function will update extra information specific for simple_encode APIs
+static void update_encode_frame_result_simple_encode(
+    int ref_frame_flags, FRAME_UPDATE_TYPE update_type,
+    const YV12_BUFFER_CONFIG *source_frame, const RefCntBuffer *coded_frame_buf,
+    RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int quantize_index,
+    uint32_t bit_depth, uint32_t input_bit_depth, const FRAME_COUNTS *counts,
     const PARTITION_INFO *partition_info,
     const MOTION_VECTOR_INFO *motion_vector_info,
+    const TplDepStats *tpl_stats_info,
+    ENCODE_FRAME_RESULT *encode_frame_result) {
+  PSNR_STATS psnr;
+  update_encode_frame_result_basic(update_type, coded_frame_buf->frame_index,
+                                   quantize_index, encode_frame_result);
+#if CONFIG_VP9_HIGHBITDEPTH
+  vpx_calc_highbd_psnr(source_frame, &coded_frame_buf->buf, &psnr, bit_depth,
+                       input_bit_depth);
+#else   // CONFIG_VP9_HIGHBITDEPTH
+  (void)bit_depth;
+  (void)input_bit_depth;
+  vpx_calc_psnr(source_frame, &coded_frame_buf->buf, &psnr);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  encode_frame_result->frame_coding_index = coded_frame_buf->frame_coding_index;
+
+  vp9_get_ref_frame_info(update_type, ref_frame_flags, ref_frame_bufs,
+                         encode_frame_result->ref_frame_coding_indexes,
+                         encode_frame_result->ref_frame_valid_list);
+
+  encode_frame_result->psnr = psnr.psnr[0];
+  encode_frame_result->sse = psnr.sse[0];
+  encode_frame_result->frame_counts = *counts;
+  encode_frame_result->partition_info = partition_info;
+  encode_frame_result->motion_vector_info = motion_vector_info;
+  encode_frame_result->tpl_stats_info = tpl_stats_info;
+  if (encode_frame_result->coded_frame.allocated) {
+    yv12_buffer_to_image_buffer(&coded_frame_buf->buf,
+                                &encode_frame_result->coded_frame);
+  }
+}
 #endif  // CONFIG_RATE_CTRL
-    ENCODE_FRAME_RESULT *encode_frame_result);
 #endif  // !CONFIG_REALTIME_ONLY
 
 static void encode_frame_to_data_rate(
@@ -5113,26 +5543,7 @@ static void encode_frame_to_data_rate(
   struct segmentation *const seg = &cm->seg;
   TX_SIZE t;
 
-  // SVC: skip encoding of enhancement layer if the layer target bandwidth = 0.
-  // No need to set svc.skip_enhancement_layer if whole superframe will be
-  // dropped.
-  if (cpi->use_svc && cpi->svc.spatial_layer_id > 0 &&
-      cpi->oxcf.target_bandwidth == 0 &&
-      !(cpi->svc.framedrop_mode != LAYER_DROP &&
-        (cpi->svc.framedrop_mode != CONSTRAINED_FROM_ABOVE_DROP ||
-         cpi->svc
-             .force_drop_constrained_from_above[cpi->svc.number_spatial_layers -
-                                                1]) &&
-        cpi->svc.drop_spatial_layer[0])) {
-    cpi->svc.skip_enhancement_layer = 1;
-    vp9_rc_postencode_update_drop_frame(cpi);
-    cpi->ext_refresh_frame_flags_pending = 0;
-    cpi->last_frame_dropped = 1;
-    cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = 1;
-    cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id] = 1;
-    vp9_inc_frame_in_layer(cpi);
-    return;
-  }
+  if (vp9_svc_check_skip_enhancement_layer(cpi)) return;
 
   set_ext_overrides(cpi);
   vpx_clear_system_state();
@@ -5150,6 +5561,11 @@ static void encode_frame_to_data_rate(
     set_ref_sign_bias(cpi);
   }
 
+  // On the very first frame set the deadline_mode_previous_frame to
+  // the current mode.
+  if (cpi->common.current_video_frame == 0)
+    cpi->deadline_mode_previous_frame = cpi->oxcf.mode;
+
   // Set default state for segment based loop filter update flags.
   cm->lf.mode_ref_delta_update = 0;
 
@@ -5197,17 +5613,52 @@ static void encode_frame_to_data_rate(
   memset(cpi->mode_chosen_counts, 0,
          MAX_MODES * sizeof(*cpi->mode_chosen_counts));
 #endif
-#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
   // Backup to ensure consistency between recodes
   save_encode_params(cpi);
-#endif  // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
+  if (cpi->ext_ratectrl.ready &&
+      (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0 &&
+      cpi->ext_ratectrl.funcs.get_frame_rdmult != NULL) {
+    vpx_codec_err_t codec_status;
+    const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+    FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index];
+    const int ref_frame_flags = get_ref_frame_flags(cpi);
+    RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES];
+    const RefCntBuffer *curr_frame_buf = get_ref_cnt_buffer(cm, cm->new_fb_idx);
+    // index 0 of a gf group is always KEY/OVERLAY/GOLDEN.
+    // index 1 refers to the first encoding frame in a gf group.
+    // Therefore if it is ARF_UPDATE, it means this gf group uses alt ref.
+    // See function define_gf_group_structure().
+    const int use_alt_ref = gf_group->update_type[1] == ARF_UPDATE;
+    int ext_rdmult = VPX_DEFAULT_RDMULT;
+    get_ref_frame_bufs(cpi, ref_frame_bufs);
+    codec_status = vp9_extrc_get_frame_rdmult(
+        &cpi->ext_ratectrl, curr_frame_buf->frame_index,
+        cm->current_frame_coding_index, gf_group->index, update_type,
+        gf_group->gf_group_size, use_alt_ref, ref_frame_bufs, ref_frame_flags,
+        &ext_rdmult);
+    if (codec_status != VPX_CODEC_OK) {
+      vpx_internal_error(&cm->error, codec_status,
+                         "vp9_extrc_get_frame_rdmult() failed");
+    }
+    cpi->ext_ratectrl.ext_rdmult = ext_rdmult;
+  }
 
   if (cpi->sf.recode_loop == DISALLOW_RECODE) {
     if (!encode_without_recode_loop(cpi, size, dest)) return;
   } else {
 #if !CONFIG_REALTIME_ONLY
+#if CONFIG_RATE_CTRL
+    encode_with_recode_loop(cpi, size, dest, &encode_frame_result->rq_history);
+#else  // CONFIG_RATE_CTRL
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, encode_with_recode_loop_time);
+#endif
     encode_with_recode_loop(cpi, size, dest);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, encode_with_recode_loop_time);
 #endif
+#endif  // CONFIG_RATE_CTRL
+#endif  // !CONFIG_REALTIME_ONLY
   }
 
   // TODO(jingning): When using show existing frame mode, we assume that the
@@ -5264,23 +5715,49 @@ static void encode_frame_to_data_rate(
   cm->frame_to_show->render_width = cm->render_width;
   cm->frame_to_show->render_height = cm->render_height;
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, loopfilter_frame_time);
+#endif
   // Pick the loop filter level for the frame.
   loopfilter_frame(cpi, cm);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, loopfilter_frame_time);
+#endif
 
   if (cpi->rc.use_post_encode_drop) save_coding_context(cpi);
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, vp9_pack_bitstream_time);
+#endif
   // build the bitstream
   vp9_pack_bitstream(cpi, dest, size);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, vp9_pack_bitstream_time);
+#endif
 
+  if (cpi->ext_ratectrl.ready &&
+      cpi->ext_ratectrl.funcs.update_encodeframe_result != NULL) {
+    const RefCntBuffer *coded_frame_buf =
+        get_ref_cnt_buffer(cm, cm->new_fb_idx);
+    vpx_codec_err_t codec_status = vp9_extrc_update_encodeframe_result(
+        &cpi->ext_ratectrl, (*size) << 3, cpi->Source, &coded_frame_buf->buf,
+        cm->bit_depth, cpi->oxcf.input_bit_depth, cm->base_qindex);
+    if (codec_status != VPX_CODEC_OK) {
+      vpx_internal_error(&cm->error, codec_status,
+                         "vp9_extrc_update_encodeframe_result() failed");
+    }
+  }
 #if CONFIG_REALTIME_ONLY
   (void)encode_frame_result;
   assert(encode_frame_result == NULL);
 #else  // CONFIG_REALTIME_ONLY
   if (encode_frame_result != NULL) {
-    const int ref_frame_flags = get_ref_frame_flags(cpi);
     const RefCntBuffer *coded_frame_buf =
         get_ref_cnt_buffer(cm, cm->new_fb_idx);
     RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES];
+    FRAME_UPDATE_TYPE update_type =
+        cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index];
+    int quantize_index = vp9_get_quantizer(cpi);
     get_ref_frame_bufs(cpi, ref_frame_bufs);
     // update_encode_frame_result() depends on twopass.gf_group.index and
     // cm->new_fb_idx, cpi->Source, cpi->lst_fb_idx, cpi->gld_fb_idx and
@@ -5298,15 +5775,21 @@ static void encode_frame_to_data_rate(
     // This function needs to be called before vp9_update_reference_frames().
     // TODO(angiebird): Improve the codebase to make the update of frame
     // dependent variables more robust.
-    update_encode_frame_result(
-        ref_frame_flags,
-        cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index],
-        cpi->Source, coded_frame_buf, ref_frame_bufs, vp9_get_quantizer(cpi),
-        cpi->oxcf.input_bit_depth, cm->bit_depth, cpi->td.counts,
+
+    update_encode_frame_result_basic(update_type, coded_frame_buf->frame_index,
+                                     quantize_index, encode_frame_result);
 #if CONFIG_RATE_CTRL
-        cpi->partition_info, cpi->motion_vector_info,
+    if (cpi->oxcf.use_simple_encode_api) {
+      const int ref_frame_flags = get_ref_frame_flags(cpi);
+      update_encode_frame_result_simple_encode(
+          ref_frame_flags,
+          cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index],
+          cpi->Source, coded_frame_buf, ref_frame_bufs, quantize_index,
+          cm->bit_depth, cpi->oxcf.input_bit_depth, cpi->td.counts,
+          cpi->partition_info, cpi->motion_vector_info, cpi->tpl_stats_info,
+          encode_frame_result);
+    }
 #endif  // CONFIG_RATE_CTRL
-        encode_frame_result);
   }
 #endif  // CONFIG_REALTIME_ONLY
 
@@ -5372,7 +5855,8 @@ static void encode_frame_to_data_rate(
 
   vp9_rc_postencode_update(cpi, *size);
 
-  if (oxcf->pass == 0 && !frame_is_intra_only(cm) &&
+  if (cpi->compute_frame_low_motion_onepass && oxcf->pass == 0 &&
+      !frame_is_intra_only(cm) &&
       (!cpi->use_svc ||
        (cpi->use_svc &&
         !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame &&
@@ -5840,1706 +6324,194 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) {
   }
 }
 
-typedef struct GF_PICTURE {
-  YV12_BUFFER_CONFIG *frame;
-  int ref_frame[3];
-  FRAME_UPDATE_TYPE update_type;
-} GF_PICTURE;
+void vp9_get_ref_frame_info(FRAME_UPDATE_TYPE update_type, int ref_frame_flags,
+                            RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES],
+                            int *ref_frame_coding_indexes,
+                            int *ref_frame_valid_list) {
+  if (update_type != KF_UPDATE) {
+    const VP9_REFFRAME inter_ref_flags[MAX_INTER_REF_FRAMES] = { VP9_LAST_FLAG,
+                                                                 VP9_GOLD_FLAG,
+                                                                 VP9_ALT_FLAG };
+    int i;
+    for (i = 0; i < MAX_INTER_REF_FRAMES; ++i) {
+      assert(ref_frame_bufs[i] != NULL);
+      ref_frame_coding_indexes[i] = ref_frame_bufs[i]->frame_coding_index;
+      ref_frame_valid_list[i] = (ref_frame_flags & inter_ref_flags[i]) != 0;
+    }
+  } else {
+    // No reference frame is available when this is a key frame.
+    int i;
+    for (i = 0; i < MAX_INTER_REF_FRAMES; ++i) {
+      ref_frame_coding_indexes[i] = -1;
+      ref_frame_valid_list[i] = 0;
+    }
+  }
+}
+
+void vp9_init_encode_frame_result(ENCODE_FRAME_RESULT *encode_frame_result) {
+  encode_frame_result->show_idx = -1;  // Actual encoding doesn't happen.
+#if CONFIG_RATE_CTRL
+  encode_frame_result->frame_coding_index = -1;
+  vp9_zero(encode_frame_result->coded_frame);
+  encode_frame_result->coded_frame.allocated = 0;
+  init_rq_history(&encode_frame_result->rq_history);
+#endif  // CONFIG_RATE_CTRL
+}
 
-static void init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture,
-                            const GF_GROUP *gf_group, int *tpl_group_frames) {
-  VP9_COMMON *cm = &cpi->common;
-  int frame_idx = 0;
+int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
+                            size_t *size, uint8_t *dest, int64_t *time_stamp,
+                            int64_t *time_end, int flush,
+                            ENCODE_FRAME_RESULT *encode_frame_result) {
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  VP9_COMMON *const cm = &cpi->common;
+  BufferPool *const pool = cm->buffer_pool;
+  RATE_CONTROL *const rc = &cpi->rc;
+  struct vpx_usec_timer cmptimer;
+  YV12_BUFFER_CONFIG *force_src_buffer = NULL;
+  struct lookahead_entry *last_source = NULL;
+  struct lookahead_entry *source = NULL;
+  int arf_src_index;
+  const int gf_group_index = cpi->twopass.gf_group.index;
   int i;
-  int gld_index = -1;
-  int alt_index = -1;
-  int lst_index = -1;
-  int arf_index_stack[MAX_ARF_LAYERS];
-  int arf_stack_size = 0;
-  int extend_frame_count = 0;
-  int pframe_qindex = cpi->tpl_stats[2].base_qindex;
-  int frame_gop_offset = 0;
 
-  RefCntBuffer *frame_bufs = cm->buffer_pool->frame_bufs;
-  int8_t recon_frame_index[REFS_PER_FRAME + MAX_ARF_LAYERS];
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  if (oxcf->pass == 2) start_timing(cpi, vp9_get_compressed_data_time);
+#endif
 
-  memset(recon_frame_index, -1, sizeof(recon_frame_index));
-  stack_init(arf_index_stack, MAX_ARF_LAYERS);
+  if (is_one_pass_svc(cpi)) {
+    vp9_one_pass_svc_start_layer(cpi);
+  }
 
-  // TODO(jingning): To be used later for gf frame type parsing.
-  (void)gf_group;
+  vpx_usec_timer_start(&cmptimer);
 
-  for (i = 0; i < FRAME_BUFFERS; ++i) {
-    if (frame_bufs[i].ref_count == 0) {
-      alloc_frame_mvs(cm, i);
-      if (vpx_realloc_frame_buffer(&frame_bufs[i].buf, cm->width, cm->height,
-                                   cm->subsampling_x, cm->subsampling_y,
-#if CONFIG_VP9_HIGHBITDEPTH
-                                   cm->use_highbitdepth,
-#endif
-                                   VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
-                                   NULL, NULL, NULL))
-        vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
-                           "Failed to allocate frame buffer");
+  vp9_set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV);
 
-      recon_frame_index[frame_idx] = i;
-      ++frame_idx;
+  // Is multi-arf enabled.
+  // Note that at the moment multi_arf is only configured for 2 pass VBR and
+  // will not work properly with svc.
+  // Enable the Jingning's new "multi_layer_arf" code if "enable_auto_arf"
+  // is greater than or equal to 2.
+  if ((oxcf->pass == 2) && !cpi->use_svc && (cpi->oxcf.enable_auto_arf >= 2))
+    cpi->multi_layer_arf = 1;
+  else
+    cpi->multi_layer_arf = 0;
 
-      if (frame_idx >= REFS_PER_FRAME + cpi->oxcf.enable_auto_arf) break;
-    }
+  // Normal defaults
+  cm->reset_frame_context = 0;
+  cm->refresh_frame_context = 1;
+  if (!is_one_pass_svc(cpi)) {
+    cpi->refresh_last_frame = 1;
+    cpi->refresh_golden_frame = 0;
+    cpi->refresh_alt_ref_frame = 0;
   }
 
-  for (i = 0; i < REFS_PER_FRAME + 1; ++i) {
-    assert(recon_frame_index[i] >= 0);
-    cpi->tpl_recon_frames[i] = &frame_bufs[recon_frame_index[i]].buf;
-  }
-
-  *tpl_group_frames = 0;
-
-  // Initialize Golden reference frame.
-  gf_picture[0].frame = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
-  for (i = 0; i < 3; ++i) gf_picture[0].ref_frame[i] = -1;
-  gf_picture[0].update_type = gf_group->update_type[0];
-  gld_index = 0;
-  ++*tpl_group_frames;
-
-  // Initialize base layer ARF frame
-  gf_picture[1].frame = cpi->Source;
-  gf_picture[1].ref_frame[0] = gld_index;
-  gf_picture[1].ref_frame[1] = lst_index;
-  gf_picture[1].ref_frame[2] = alt_index;
-  gf_picture[1].update_type = gf_group->update_type[1];
-  alt_index = 1;
-  ++*tpl_group_frames;
-
-  // Initialize P frames
-  for (frame_idx = 2; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) {
-    struct lookahead_entry *buf;
-    frame_gop_offset = gf_group->frame_gop_index[frame_idx];
-    buf = vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1);
-
-    if (buf == NULL) break;
-
-    gf_picture[frame_idx].frame = &buf->img;
-    gf_picture[frame_idx].ref_frame[0] = gld_index;
-    gf_picture[frame_idx].ref_frame[1] = lst_index;
-    gf_picture[frame_idx].ref_frame[2] = alt_index;
-    gf_picture[frame_idx].update_type = gf_group->update_type[frame_idx];
-
-    switch (gf_group->update_type[frame_idx]) {
-      case ARF_UPDATE:
-        stack_push(arf_index_stack, alt_index, arf_stack_size);
-        ++arf_stack_size;
-        alt_index = frame_idx;
-        break;
-      case LF_UPDATE: lst_index = frame_idx; break;
-      case OVERLAY_UPDATE:
-        gld_index = frame_idx;
-        alt_index = stack_pop(arf_index_stack, arf_stack_size);
-        --arf_stack_size;
+  // Should we encode an arf frame.
+  arf_src_index = get_arf_src_index(cpi);
+
+  if (arf_src_index) {
+    for (i = 0; i <= arf_src_index; ++i) {
+      struct lookahead_entry *e = vp9_lookahead_peek(cpi->lookahead, i);
+      // Avoid creating an alt-ref if there's a forced keyframe pending.
+      if (e == NULL) {
         break;
-      case USE_BUF_FRAME:
-        lst_index = alt_index;
-        alt_index = stack_pop(arf_index_stack, arf_stack_size);
-        --arf_stack_size;
+      } else if (e->flags == VPX_EFLAG_FORCE_KF) {
+        arf_src_index = 0;
+        flush = 1;
         break;
-      default: break;
+      }
     }
-
-    ++*tpl_group_frames;
-
-    // The length of group of pictures is baseline_gf_interval, plus the
-    // beginning golden frame from last GOP, plus the last overlay frame in
-    // the same GOP.
-    if (frame_idx == gf_group->gf_group_size) break;
   }
 
-  alt_index = -1;
-  ++frame_idx;
-  ++frame_gop_offset;
-
-  // Extend two frames outside the current gf group.
-  for (; frame_idx < MAX_LAG_BUFFERS && extend_frame_count < 2; ++frame_idx) {
-    struct lookahead_entry *buf =
-        vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1);
-
-    if (buf == NULL) break;
-
-    cpi->tpl_stats[frame_idx].base_qindex = pframe_qindex;
-
-    gf_picture[frame_idx].frame = &buf->img;
-    gf_picture[frame_idx].ref_frame[0] = gld_index;
-    gf_picture[frame_idx].ref_frame[1] = lst_index;
-    gf_picture[frame_idx].ref_frame[2] = alt_index;
-    gf_picture[frame_idx].update_type = LF_UPDATE;
-    lst_index = frame_idx;
-    ++*tpl_group_frames;
-    ++extend_frame_count;
-    ++frame_gop_offset;
+  // Clear arf index stack before group of pictures processing starts.
+  if (gf_group_index == 1) {
+    stack_init(cpi->twopass.gf_group.arf_index_stack, MAX_LAG_BUFFERS * 2);
+    cpi->twopass.gf_group.stack_size = 0;
   }
-}
 
-static void init_tpl_stats(VP9_COMP *cpi) {
-  int frame_idx;
-  for (frame_idx = 0; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) {
-    TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
-    memset(tpl_frame->tpl_stats_ptr, 0,
-           tpl_frame->height * tpl_frame->width *
-               sizeof(*tpl_frame->tpl_stats_ptr));
-    tpl_frame->is_valid = 0;
-  }
-}
+  if (arf_src_index) {
+    assert(arf_src_index <= rc->frames_to_key);
+    if ((source = vp9_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) {
+      cpi->alt_ref_source = source;
 
-#if CONFIG_NON_GREEDY_MV
-static uint32_t full_pixel_motion_search(VP9_COMP *cpi, ThreadData *td,
-                                         MotionField *motion_field,
-                                         int frame_idx, uint8_t *cur_frame_buf,
-                                         uint8_t *ref_frame_buf, int stride,
-                                         BLOCK_SIZE bsize, int mi_row,
-                                         int mi_col, MV *mv) {
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
-  int step_param;
-  uint32_t bestsme = UINT_MAX;
-  const MvLimits tmp_mv_limits = x->mv_limits;
-  // lambda is used to adjust the importance of motion vector consistency.
-  // TODO(angiebird): Figure out lambda's proper value.
-  const int lambda = cpi->tpl_stats[frame_idx].lambda;
-  int_mv nb_full_mvs[NB_MVS_NUM];
-  int nb_full_mv_num;
-
-  MV best_ref_mv1 = { 0, 0 };
-  MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
-
-  best_ref_mv1_full.col = best_ref_mv1.col >> 3;
-  best_ref_mv1_full.row = best_ref_mv1.row >> 3;
-
-  // Setup frame pointers
-  x->plane[0].src.buf = cur_frame_buf;
-  x->plane[0].src.stride = stride;
-  xd->plane[0].pre[0].buf = ref_frame_buf;
-  xd->plane[0].pre[0].stride = stride;
-
-  step_param = mv_sf->reduce_first_step_size;
-  step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2);
-
-  vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1);
-
-  nb_full_mv_num =
-      vp9_prepare_nb_full_mvs(motion_field, mi_row, mi_col, nb_full_mvs);
-  vp9_full_pixel_diamond_new(cpi, x, bsize, &best_ref_mv1_full, step_param,
-                             lambda, 1, nb_full_mvs, nb_full_mv_num, mv);
-
-  /* restore UMV window */
-  x->mv_limits = tmp_mv_limits;
-
-  return bestsme;
-}
-
-static uint32_t sub_pixel_motion_search(VP9_COMP *cpi, ThreadData *td,
-                                        uint8_t *cur_frame_buf,
-                                        uint8_t *ref_frame_buf, int stride,
-                                        BLOCK_SIZE bsize, MV *mv) {
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
-  uint32_t bestsme = UINT_MAX;
-  uint32_t distortion;
-  uint32_t sse;
-  int cost_list[5];
-
-  MV best_ref_mv1 = { 0, 0 };
-
-  // Setup frame pointers
-  x->plane[0].src.buf = cur_frame_buf;
-  x->plane[0].src.stride = stride;
-  xd->plane[0].pre[0].buf = ref_frame_buf;
-  xd->plane[0].pre[0].stride = stride;
-
-  // TODO(yunqing): may use higher tap interp filter than 2 taps.
-  // Ignore mv costing by sending NULL pointer instead of cost array
-  bestsme = cpi->find_fractional_mv_step(
-      x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit,
-      &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level,
-      cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0,
-      USE_2_TAPS);
-
-  return bestsme;
-}
-
-#else  // CONFIG_NON_GREEDY_MV
-static uint32_t motion_compensated_prediction(VP9_COMP *cpi, ThreadData *td,
-                                              uint8_t *cur_frame_buf,
-                                              uint8_t *ref_frame_buf,
-                                              int stride, BLOCK_SIZE bsize,
-                                              MV *mv) {
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
-  const SEARCH_METHODS search_method = NSTEP;
-  int step_param;
-  int sadpb = x->sadperbit16;
-  uint32_t bestsme = UINT_MAX;
-  uint32_t distortion;
-  uint32_t sse;
-  int cost_list[5];
-  const MvLimits tmp_mv_limits = x->mv_limits;
-
-  MV best_ref_mv1 = { 0, 0 };
-  MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
-
-  best_ref_mv1_full.col = best_ref_mv1.col >> 3;
-  best_ref_mv1_full.row = best_ref_mv1.row >> 3;
-
-  // Setup frame pointers
-  x->plane[0].src.buf = cur_frame_buf;
-  x->plane[0].src.stride = stride;
-  xd->plane[0].pre[0].buf = ref_frame_buf;
-  xd->plane[0].pre[0].stride = stride;
-
-  step_param = mv_sf->reduce_first_step_size;
-  step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2);
-
-  vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1);
-
-  vp9_full_pixel_search(cpi, x, bsize, &best_ref_mv1_full, step_param,
-                        search_method, sadpb, cond_cost_list(cpi, cost_list),
-                        &best_ref_mv1, mv, 0, 0);
-
-  /* restore UMV window */
-  x->mv_limits = tmp_mv_limits;
-
-  // TODO(yunqing): may use higher tap interp filter than 2 taps.
-  // Ignore mv costing by sending NULL pointer instead of cost array
-  bestsme = cpi->find_fractional_mv_step(
-      x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit,
-      &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level,
-      cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0,
-      USE_2_TAPS);
-
-  return bestsme;
-}
-#endif
+#if !CONFIG_REALTIME_ONLY
+      if ((oxcf->mode != REALTIME) && (oxcf->arnr_max_frames > 0) &&
+          (oxcf->arnr_strength > 0)) {
+        int bitrate = cpi->rc.avg_frame_bandwidth / 40;
+        int not_low_bitrate = bitrate > ALT_REF_AQ_LOW_BITRATE_BOUNDARY;
 
-static int get_overlap_area(int grid_pos_row, int grid_pos_col, int ref_pos_row,
-                            int ref_pos_col, int block, BLOCK_SIZE bsize) {
-  int width = 0, height = 0;
-  int bw = 4 << b_width_log2_lookup[bsize];
-  int bh = 4 << b_height_log2_lookup[bsize];
+        int not_last_frame = (cpi->lookahead->sz - arf_src_index > 1);
+        not_last_frame |= ALT_REF_AQ_APPLY_TO_LAST_FRAME;
 
-  switch (block) {
-    case 0:
-      width = grid_pos_col + bw - ref_pos_col;
-      height = grid_pos_row + bh - ref_pos_row;
-      break;
-    case 1:
-      width = ref_pos_col + bw - grid_pos_col;
-      height = grid_pos_row + bh - ref_pos_row;
-      break;
-    case 2:
-      width = grid_pos_col + bw - ref_pos_col;
-      height = ref_pos_row + bh - grid_pos_row;
-      break;
-    case 3:
-      width = ref_pos_col + bw - grid_pos_col;
-      height = ref_pos_row + bh - grid_pos_row;
-      break;
-    default: assert(0);
-  }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+        start_timing(cpi, vp9_temporal_filter_time);
+#endif
+        // Produce the filtered ARF frame.
+        vp9_temporal_filter(cpi, arf_src_index);
+        vpx_extend_frame_borders(&cpi->alt_ref_buffer);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+        end_timing(cpi, vp9_temporal_filter_time);
+#endif
 
-  return width * height;
-}
+        // for small bitrates segmentation overhead usually
+        // eats all bitrate gain from enabling delta quantizers
+        if (cpi->oxcf.alt_ref_aq != 0 && not_low_bitrate && not_last_frame)
+          vp9_alt_ref_aq_setup_mode(cpi->alt_ref_aq, cpi);
 
-static int round_floor(int ref_pos, int bsize_pix) {
-  int round;
-  if (ref_pos < 0)
-    round = -(1 + (-ref_pos - 1) / bsize_pix);
-  else
-    round = ref_pos / bsize_pix;
-
-  return round;
-}
-
-static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col,
-                            BLOCK_SIZE bsize, int stride) {
-  const int mi_height = num_8x8_blocks_high_lookup[bsize];
-  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
-  const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col];
-  int idx, idy;
-
-  for (idy = 0; idy < mi_height; ++idy) {
-    for (idx = 0; idx < mi_width; ++idx) {
-      TplDepStats *tpl_ptr = &tpl_stats[(mi_row + idy) * stride + mi_col + idx];
-      const int64_t mc_flow = tpl_ptr->mc_flow;
-      const int64_t mc_ref_cost = tpl_ptr->mc_ref_cost;
-      *tpl_ptr = *src_stats;
-      tpl_ptr->mc_flow = mc_flow;
-      tpl_ptr->mc_ref_cost = mc_ref_cost;
-      tpl_ptr->mc_dep_cost = tpl_ptr->intra_cost + tpl_ptr->mc_flow;
+        force_src_buffer = &cpi->alt_ref_buffer;
+      }
+#endif
+      cm->show_frame = 0;
+      cm->intra_only = 0;
+      cpi->refresh_alt_ref_frame = 1;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_last_frame = 0;
+      rc->is_src_frame_alt_ref = 0;
+      rc->source_alt_ref_pending = 0;
+    } else {
+      rc->source_alt_ref_pending = 0;
     }
   }
-}
 
-static void tpl_model_update_b(TplDepFrame *tpl_frame, TplDepStats *tpl_stats,
-                               int mi_row, int mi_col, const BLOCK_SIZE bsize) {
-  TplDepFrame *ref_tpl_frame = &tpl_frame[tpl_stats->ref_frame_index];
-  TplDepStats *ref_stats = ref_tpl_frame->tpl_stats_ptr;
-  MV mv = tpl_stats->mv.as_mv;
-  int mv_row = mv.row >> 3;
-  int mv_col = mv.col >> 3;
-
-  int ref_pos_row = mi_row * MI_SIZE + mv_row;
-  int ref_pos_col = mi_col * MI_SIZE + mv_col;
-
-  const int bw = 4 << b_width_log2_lookup[bsize];
-  const int bh = 4 << b_height_log2_lookup[bsize];
-  const int mi_height = num_8x8_blocks_high_lookup[bsize];
-  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
-  const int pix_num = bw * bh;
-
-  // top-left on grid block location in pixel
-  int grid_pos_row_base = round_floor(ref_pos_row, bh) * bh;
-  int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw;
-  int block;
-
-  for (block = 0; block < 4; ++block) {
-    int grid_pos_row = grid_pos_row_base + bh * (block >> 1);
-    int grid_pos_col = grid_pos_col_base + bw * (block & 0x01);
-
-    if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE &&
-        grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) {
-      int overlap_area = get_overlap_area(
-          grid_pos_row, grid_pos_col, ref_pos_row, ref_pos_col, block, bsize);
-      int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height;
-      int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width;
-
-      int64_t mc_flow = tpl_stats->mc_dep_cost -
-                        (tpl_stats->mc_dep_cost * tpl_stats->inter_cost) /
-                            tpl_stats->intra_cost;
-
-      int idx, idy;
-
-      for (idy = 0; idy < mi_height; ++idy) {
-        for (idx = 0; idx < mi_width; ++idx) {
-          TplDepStats *des_stats =
-              &ref_stats[(ref_mi_row + idy) * ref_tpl_frame->stride +
-                         (ref_mi_col + idx)];
-
-          des_stats->mc_flow += (mc_flow * overlap_area) / pix_num;
-          des_stats->mc_ref_cost +=
-              ((tpl_stats->intra_cost - tpl_stats->inter_cost) * overlap_area) /
-              pix_num;
-          assert(overlap_area >= 0);
-        }
-      }
+  if (!source) {
+    // Get last frame source.
+    if (cm->current_video_frame > 0) {
+      if ((last_source = vp9_lookahead_peek(cpi->lookahead, -1)) == NULL)
+        return -1;
     }
-  }
-}
 
-static void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats,
-                             int mi_row, int mi_col, const BLOCK_SIZE bsize) {
-  int idx, idy;
-  const int mi_height = num_8x8_blocks_high_lookup[bsize];
-  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+    // Read in the source frame.
+    if (cpi->use_svc || cpi->svc.set_intra_only_frame)
+      source = vp9_svc_lookahead_pop(cpi, cpi->lookahead, flush);
+    else
+      source = vp9_lookahead_pop(cpi->lookahead, flush);
+
+    if (source != NULL) {
+      cm->show_frame = 1;
+      cm->intra_only = 0;
+      // If the flags indicate intra frame, but if the current picture is for
+      // spatial layer above first_spatial_layer_to_encode, it should not be an
+      // intra picture.
+      if ((source->flags & VPX_EFLAG_FORCE_KF) && cpi->use_svc &&
+          cpi->svc.spatial_layer_id > cpi->svc.first_spatial_layer_to_encode) {
+        source->flags &= ~(unsigned int)(VPX_EFLAG_FORCE_KF);
+      }
 
-  for (idy = 0; idy < mi_height; ++idy) {
-    for (idx = 0; idx < mi_width; ++idx) {
-      TplDepStats *tpl_ptr =
-          &tpl_stats[(mi_row + idy) * tpl_frame->stride + (mi_col + idx)];
-      tpl_model_update_b(tpl_frame, tpl_ptr, mi_row + idy, mi_col + idx,
-                         BLOCK_8X8);
+      // Check to see if the frame should be encoded as an arf overlay.
+      check_src_altref(cpi, source);
     }
   }
-}
 
-static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff,
-                               tran_low_t *qcoeff, tran_low_t *dqcoeff,
-                               TX_SIZE tx_size, int64_t *recon_error,
-                               int64_t *sse) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const struct macroblock_plane *const p = &x->plane[plane];
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
-  const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
-  uint16_t eob;
-  int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]];
-  const int shift = tx_size == TX_32X32 ? 0 : 2;
+  if (source) {
+    cpi->un_scaled_source = cpi->Source =
+        force_src_buffer ? force_src_buffer : &source->img;
 
-#if CONFIG_VP9_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    vp9_highbd_quantize_fp_32x32(coeff, pix_num, x->skip_block, p->round_fp,
-                                 p->quant_fp, qcoeff, dqcoeff, pd->dequant,
-                                 &eob, scan_order->scan, scan_order->iscan);
-  } else {
-    vp9_quantize_fp_32x32(coeff, pix_num, x->skip_block, p->round_fp,
-                          p->quant_fp, qcoeff, dqcoeff, pd->dequant, &eob,
-                          scan_order->scan, scan_order->iscan);
-  }
-#else
-  vp9_quantize_fp_32x32(coeff, pix_num, x->skip_block, p->round_fp, p->quant_fp,
-                        qcoeff, dqcoeff, pd->dequant, &eob, scan_order->scan,
-                        scan_order->iscan);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
+#ifdef ENABLE_KF_DENOISE
+    // Copy of raw source for metrics calculation.
+    if (is_psnr_calc_enabled(cpi))
+      vp9_copy_and_extend_frame(cpi->Source, &cpi->raw_unscaled_source);
+#endif
 
-  *recon_error = vp9_block_error(coeff, dqcoeff, pix_num, sse) >> shift;
-  *recon_error = VPXMAX(*recon_error, 1);
-
-  *sse = (*sse) >> shift;
-  *sse = VPXMAX(*sse, 1);
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
-                         TX_SIZE tx_size) {
-  // TODO(sdeng): Implement SIMD based high bit-depth Hadamard transforms.
-  switch (tx_size) {
-    case TX_8X8: vpx_highbd_hadamard_8x8(src_diff, bw, coeff); break;
-    case TX_16X16: vpx_highbd_hadamard_16x16(src_diff, bw, coeff); break;
-    case TX_32X32: vpx_highbd_hadamard_32x32(src_diff, bw, coeff); break;
-    default: assert(0);
-  }
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-void wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
-                  TX_SIZE tx_size) {
-  switch (tx_size) {
-    case TX_8X8: vpx_hadamard_8x8(src_diff, bw, coeff); break;
-    case TX_16X16: vpx_hadamard_16x16(src_diff, bw, coeff); break;
-    case TX_32X32: vpx_hadamard_32x32(src_diff, bw, coeff); break;
-    default: assert(0);
-  }
-}
-
-static void set_mv_limits(const VP9_COMMON *cm, MACROBLOCK *x, int mi_row,
-                          int mi_col) {
-  x->mv_limits.row_min = -((mi_row * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND));
-  x->mv_limits.row_max =
-      (cm->mi_rows - 1 - mi_row) * MI_SIZE + (17 - 2 * VP9_INTERP_EXTEND);
-  x->mv_limits.col_min = -((mi_col * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND));
-  x->mv_limits.col_max =
-      ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND);
-}
-
-static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
-                            struct scale_factors *sf, GF_PICTURE *gf_picture,
-                            int frame_idx, TplDepFrame *tpl_frame,
-                            int16_t *src_diff, tran_low_t *coeff,
-                            tran_low_t *qcoeff, tran_low_t *dqcoeff, int mi_row,
-                            int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size,
-                            YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor,
-                            int64_t *recon_error, int64_t *sse) {
-  VP9_COMMON *cm = &cpi->common;
-  ThreadData *td = &cpi->td;
-
-  const int bw = 4 << b_width_log2_lookup[bsize];
-  const int bh = 4 << b_height_log2_lookup[bsize];
-  const int pix_num = bw * bh;
-  int best_rf_idx = -1;
-  int_mv best_mv;
-  int64_t best_inter_cost = INT64_MAX;
-  int64_t inter_cost;
-  int rf_idx;
-  const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP];
-
-  int64_t best_intra_cost = INT64_MAX;
-  int64_t intra_cost;
-  PREDICTION_MODE mode;
-  int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
-  MODE_INFO mi_above, mi_left;
-  const int mi_height = num_8x8_blocks_high_lookup[bsize];
-  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
-  TplDepStats *tpl_stats =
-      &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col];
-
-  xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
-  xd->mb_to_bottom_edge = ((cm->mi_rows - 1 - mi_row) * MI_SIZE) * 8;
-  xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
-  xd->mb_to_right_edge = ((cm->mi_cols - 1 - mi_col) * MI_SIZE) * 8;
-  xd->above_mi = (mi_row > 0) ? &mi_above : NULL;
-  xd->left_mi = (mi_col > 0) ? &mi_left : NULL;
-
-  // Intra prediction search
-  for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
-    uint8_t *src, *dst;
-    int src_stride, dst_stride;
-
-    src = xd->cur_buf->y_buffer + mb_y_offset;
-    src_stride = xd->cur_buf->y_stride;
-
-    dst = &predictor[0];
-    dst_stride = bw;
-
-    xd->mi[0]->sb_type = bsize;
-    xd->mi[0]->ref_frame[0] = INTRA_FRAME;
-
-    vp9_predict_intra_block(xd, b_width_log2_lookup[bsize], tx_size, mode, src,
-                            src_stride, dst, dst_stride, 0, 0, 0);
-
-#if CONFIG_VP9_HIGHBITDEPTH
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      vpx_highbd_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst,
-                                dst_stride, xd->bd);
-      highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size);
-      intra_cost = vpx_highbd_satd(coeff, pix_num);
-    } else {
-      vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst,
-                         dst_stride);
-      wht_fwd_txfm(src_diff, bw, coeff, tx_size);
-      intra_cost = vpx_satd(coeff, pix_num);
-    }
-#else
-    vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, dst_stride);
-    wht_fwd_txfm(src_diff, bw, coeff, tx_size);
-    intra_cost = vpx_satd(coeff, pix_num);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-    if (intra_cost < best_intra_cost) best_intra_cost = intra_cost;
-  }
-
-  // Motion compensated prediction
-  best_mv.as_int = 0;
-
-  set_mv_limits(cm, x, mi_row, mi_col);
-
-  for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
-    int_mv mv;
-#if CONFIG_NON_GREEDY_MV
-    MotionField *motion_field;
-#endif
-    if (ref_frame[rf_idx] == NULL) continue;
-
-#if CONFIG_NON_GREEDY_MV
-    (void)td;
-    motion_field = vp9_motion_field_info_get_motion_field(
-        &cpi->motion_field_info, frame_idx, rf_idx, bsize);
-    mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col);
-#else
-    motion_compensated_prediction(cpi, td, xd->cur_buf->y_buffer + mb_y_offset,
-                                  ref_frame[rf_idx]->y_buffer + mb_y_offset,
-                                  xd->cur_buf->y_stride, bsize, &mv.as_mv);
-#endif
-
-#if CONFIG_VP9_HIGHBITDEPTH
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      vp9_highbd_build_inter_predictor(
-          CONVERT_TO_SHORTPTR(ref_frame[rf_idx]->y_buffer + mb_y_offset),
-          ref_frame[rf_idx]->y_stride, CONVERT_TO_SHORTPTR(&predictor[0]), bw,
-          &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE,
-          mi_row * MI_SIZE, xd->bd);
-      vpx_highbd_subtract_block(
-          bh, bw, src_diff, bw, xd->cur_buf->y_buffer + mb_y_offset,
-          xd->cur_buf->y_stride, &predictor[0], bw, xd->bd);
-      highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size);
-      inter_cost = vpx_highbd_satd(coeff, pix_num);
-    } else {
-      vp9_build_inter_predictor(
-          ref_frame[rf_idx]->y_buffer + mb_y_offset,
-          ref_frame[rf_idx]->y_stride, &predictor[0], bw, &mv.as_mv, sf, bw, bh,
-          0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE);
-      vpx_subtract_block(bh, bw, src_diff, bw,
-                         xd->cur_buf->y_buffer + mb_y_offset,
-                         xd->cur_buf->y_stride, &predictor[0], bw);
-      wht_fwd_txfm(src_diff, bw, coeff, tx_size);
-      inter_cost = vpx_satd(coeff, pix_num);
-    }
-#else
-    vp9_build_inter_predictor(ref_frame[rf_idx]->y_buffer + mb_y_offset,
-                              ref_frame[rf_idx]->y_stride, &predictor[0], bw,
-                              &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3,
-                              mi_col * MI_SIZE, mi_row * MI_SIZE);
-    vpx_subtract_block(bh, bw, src_diff, bw,
-                       xd->cur_buf->y_buffer + mb_y_offset,
-                       xd->cur_buf->y_stride, &predictor[0], bw);
-    wht_fwd_txfm(src_diff, bw, coeff, tx_size);
-    inter_cost = vpx_satd(coeff, pix_num);
-#endif
-
-    if (inter_cost < best_inter_cost) {
-      best_rf_idx = rf_idx;
-      best_inter_cost = inter_cost;
-      best_mv.as_int = mv.as_int;
-      get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, recon_error,
-                         sse);
-    }
-  }
-  best_intra_cost = VPXMAX(best_intra_cost, 1);
-  best_inter_cost = VPXMIN(best_intra_cost, best_inter_cost);
-  tpl_stats->inter_cost = VPXMAX(
-      1, (best_inter_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width));
-  tpl_stats->intra_cost = VPXMAX(
-      1, (best_intra_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width));
-  tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx];
-  tpl_stats->mv.as_int = best_mv.as_int;
-}
-
-#if CONFIG_NON_GREEDY_MV
-static int get_block_src_pred_buf(MACROBLOCKD *xd, GF_PICTURE *gf_picture,
-                                  int frame_idx, int rf_idx, int mi_row,
-                                  int mi_col, struct buf_2d *src,
-                                  struct buf_2d *pre) {
-  const int mb_y_offset =
-      mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
-  YV12_BUFFER_CONFIG *ref_frame = NULL;
-  int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx];
-  if (ref_frame_idx != -1) {
-    ref_frame = gf_picture[ref_frame_idx].frame;
-    src->buf = xd->cur_buf->y_buffer + mb_y_offset;
-    src->stride = xd->cur_buf->y_stride;
-    pre->buf = ref_frame->y_buffer + mb_y_offset;
-    pre->stride = ref_frame->y_stride;
-    assert(src->stride == pre->stride);
-    return 1;
-  } else {
-    printf("invalid ref_frame_idx");
-    assert(ref_frame_idx != -1);
-    return 0;
-  }
-}
-
-#define kMvPreCheckLines 5
-#define kMvPreCheckSize 15
-
-#define MV_REF_POS_NUM 3
-POSITION mv_ref_pos[MV_REF_POS_NUM] = {
-  { -1, 0 },
-  { 0, -1 },
-  { -1, -1 },
-};
-
-static int_mv *get_select_mv(VP9_COMP *cpi, TplDepFrame *tpl_frame, int mi_row,
-                             int mi_col) {
-  return &cpi->select_mv_arr[mi_row * tpl_frame->stride + mi_col];
-}
-
-static int_mv find_ref_mv(int mv_mode, VP9_COMP *cpi, TplDepFrame *tpl_frame,
-                          BLOCK_SIZE bsize, int mi_row, int mi_col) {
-  int i;
-  const int mi_height = num_8x8_blocks_high_lookup[bsize];
-  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
-  int_mv nearest_mv, near_mv, invalid_mv;
-  nearest_mv.as_int = INVALID_MV;
-  near_mv.as_int = INVALID_MV;
-  invalid_mv.as_int = INVALID_MV;
-  for (i = 0; i < MV_REF_POS_NUM; ++i) {
-    int nb_row = mi_row + mv_ref_pos[i].row * mi_height;
-    int nb_col = mi_col + mv_ref_pos[i].col * mi_width;
-    assert(mv_ref_pos[i].row <= 0);
-    assert(mv_ref_pos[i].col <= 0);
-    if (nb_row >= 0 && nb_col >= 0) {
-      if (nearest_mv.as_int == INVALID_MV) {
-        nearest_mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col);
-      } else {
-        int_mv mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col);
-        if (mv.as_int == nearest_mv.as_int) {
-          continue;
-        } else {
-          near_mv = mv;
-          break;
-        }
-      }
-    }
-  }
-  if (nearest_mv.as_int == INVALID_MV) {
-    nearest_mv.as_mv.row = 0;
-    nearest_mv.as_mv.col = 0;
-  }
-  if (near_mv.as_int == INVALID_MV) {
-    near_mv.as_mv.row = 0;
-    near_mv.as_mv.col = 0;
-  }
-  if (mv_mode == NEAREST_MV_MODE) {
-    return nearest_mv;
-  }
-  if (mv_mode == NEAR_MV_MODE) {
-    return near_mv;
-  }
-  assert(0);
-  return invalid_mv;
-}
-
-static int_mv get_mv_from_mv_mode(int mv_mode, VP9_COMP *cpi,
-                                  MotionField *motion_field,
-                                  TplDepFrame *tpl_frame, BLOCK_SIZE bsize,
-                                  int mi_row, int mi_col) {
-  int_mv mv;
-  switch (mv_mode) {
-    case ZERO_MV_MODE:
-      mv.as_mv.row = 0;
-      mv.as_mv.col = 0;
-      break;
-    case NEW_MV_MODE:
-      mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col);
-      break;
-    case NEAREST_MV_MODE:
-      mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col);
-      break;
-    case NEAR_MV_MODE:
-      mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col);
-      break;
-    default:
-      mv.as_int = INVALID_MV;
-      assert(0);
-      break;
-  }
-  return mv;
-}
-
-static double get_mv_dist(int mv_mode, VP9_COMP *cpi, MACROBLOCKD *xd,
-                          GF_PICTURE *gf_picture, MotionField *motion_field,
-                          int frame_idx, TplDepFrame *tpl_frame, int rf_idx,
-                          BLOCK_SIZE bsize, int mi_row, int mi_col,
-                          int_mv *mv) {
-  uint32_t sse;
-  struct buf_2d src;
-  struct buf_2d pre;
-  MV full_mv;
-  *mv = get_mv_from_mv_mode(mv_mode, cpi, motion_field, tpl_frame, bsize,
-                            mi_row, mi_col);
-  full_mv = get_full_mv(&mv->as_mv);
-  if (get_block_src_pred_buf(xd, gf_picture, frame_idx, rf_idx, mi_row, mi_col,
-                             &src, &pre)) {
-    // TODO(angiebird): Consider subpixel when computing the sse.
-    cpi->fn_ptr[bsize].vf(src.buf, src.stride, get_buf_from_mv(&pre, &full_mv),
-                          pre.stride, &sse);
-    return (double)(sse << VP9_DIST_SCALE_LOG2);
-  } else {
-    assert(0);
-    return 0;
-  }
-}
-
-static int get_mv_mode_cost(int mv_mode) {
-  // TODO(angiebird): The probabilities are roughly inferred from
-  // default_inter_mode_probs. Check if there is a better way to set the
-  // probabilities.
-  const int zero_mv_prob = 16;
-  const int new_mv_prob = 24 * 1;
-  const int ref_mv_prob = 256 - zero_mv_prob - new_mv_prob;
-  assert(zero_mv_prob + new_mv_prob + ref_mv_prob == 256);
-  switch (mv_mode) {
-    case ZERO_MV_MODE: return vp9_prob_cost[zero_mv_prob]; break;
-    case NEW_MV_MODE: return vp9_prob_cost[new_mv_prob]; break;
-    case NEAREST_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break;
-    case NEAR_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break;
-    default: assert(0); return -1;
-  }
-}
-
-static INLINE double get_mv_diff_cost(MV *new_mv, MV *ref_mv) {
-  double mv_diff_cost = log2(1 + abs(new_mv->row - ref_mv->row)) +
-                        log2(1 + abs(new_mv->col - ref_mv->col));
-  mv_diff_cost *= (1 << VP9_PROB_COST_SHIFT);
-  return mv_diff_cost;
-}
-static double get_mv_cost(int mv_mode, VP9_COMP *cpi, MotionField *motion_field,
-                          TplDepFrame *tpl_frame, BLOCK_SIZE bsize, int mi_row,
-                          int mi_col) {
-  double mv_cost = get_mv_mode_cost(mv_mode);
-  if (mv_mode == NEW_MV_MODE) {
-    MV new_mv = get_mv_from_mv_mode(mv_mode, cpi, motion_field, tpl_frame,
-                                    bsize, mi_row, mi_col)
-                    .as_mv;
-    MV nearest_mv = get_mv_from_mv_mode(NEAREST_MV_MODE, cpi, motion_field,
-                                        tpl_frame, bsize, mi_row, mi_col)
-                        .as_mv;
-    MV near_mv = get_mv_from_mv_mode(NEAR_MV_MODE, cpi, motion_field, tpl_frame,
-                                     bsize, mi_row, mi_col)
-                     .as_mv;
-    double nearest_cost = get_mv_diff_cost(&new_mv, &nearest_mv);
-    double near_cost = get_mv_diff_cost(&new_mv, &near_mv);
-    mv_cost += nearest_cost < near_cost ? nearest_cost : near_cost;
-  }
-  return mv_cost;
-}
-
-static double eval_mv_mode(int mv_mode, VP9_COMP *cpi, MACROBLOCK *x,
-                           GF_PICTURE *gf_picture, MotionField *motion_field,
-                           int frame_idx, TplDepFrame *tpl_frame, int rf_idx,
-                           BLOCK_SIZE bsize, int mi_row, int mi_col,
-                           int_mv *mv) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  double mv_dist =
-      get_mv_dist(mv_mode, cpi, xd, gf_picture, motion_field, frame_idx,
-                  tpl_frame, rf_idx, bsize, mi_row, mi_col, mv);
-  double mv_cost =
-      get_mv_cost(mv_mode, cpi, motion_field, tpl_frame, bsize, mi_row, mi_col);
-  double mult = 180;
-
-  return mv_cost + mult * log2f(1 + mv_dist);
-}
-
-static int find_best_ref_mv_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                                 GF_PICTURE *gf_picture,
-                                 MotionField *motion_field, int frame_idx,
-                                 TplDepFrame *tpl_frame, int rf_idx,
-                                 BLOCK_SIZE bsize, int mi_row, int mi_col,
-                                 double *rd, int_mv *mv) {
-  int best_mv_mode = ZERO_MV_MODE;
-  int update = 0;
-  int mv_mode;
-  *rd = 0;
-  for (mv_mode = 0; mv_mode < MAX_MV_MODE; ++mv_mode) {
-    double this_rd;
-    int_mv this_mv;
-    if (mv_mode == NEW_MV_MODE) {
-      continue;
-    }
-    this_rd = eval_mv_mode(mv_mode, cpi, x, gf_picture, motion_field, frame_idx,
-                           tpl_frame, rf_idx, bsize, mi_row, mi_col, &this_mv);
-    if (update == 0) {
-      *rd = this_rd;
-      *mv = this_mv;
-      best_mv_mode = mv_mode;
-      update = 1;
-    } else {
-      if (this_rd < *rd) {
-        *rd = this_rd;
-        *mv = this_mv;
-        best_mv_mode = mv_mode;
-      }
-    }
-  }
-  return best_mv_mode;
-}
-
-static void predict_mv_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                            GF_PICTURE *gf_picture, MotionField *motion_field,
-                            int frame_idx, TplDepFrame *tpl_frame, int rf_idx,
-                            BLOCK_SIZE bsize, int mi_row, int mi_col) {
-  const int mi_height = num_8x8_blocks_high_lookup[bsize];
-  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
-  int tmp_mv_mode_arr[kMvPreCheckSize];
-  int *mv_mode_arr = tpl_frame->mv_mode_arr[rf_idx];
-  double *rd_diff_arr = tpl_frame->rd_diff_arr[rf_idx];
-  int_mv *select_mv_arr = cpi->select_mv_arr;
-  int_mv tmp_select_mv_arr[kMvPreCheckSize];
-  int stride = tpl_frame->stride;
-  double new_mv_rd = 0;
-  double no_new_mv_rd = 0;
-  double this_new_mv_rd = 0;
-  double this_no_new_mv_rd = 0;
-  int idx;
-  int tmp_idx;
-  assert(kMvPreCheckSize == (kMvPreCheckLines * (kMvPreCheckLines + 1)) >> 1);
-
-  // no new mv
-  // diagonal scan order
-  tmp_idx = 0;
-  for (idx = 0; idx < kMvPreCheckLines; ++idx) {
-    int r;
-    for (r = 0; r <= idx; ++r) {
-      int c = idx - r;
-      int nb_row = mi_row + r * mi_height;
-      int nb_col = mi_col + c * mi_width;
-      if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) {
-        double this_rd;
-        int_mv *mv = &select_mv_arr[nb_row * stride + nb_col];
-        mv_mode_arr[nb_row * stride + nb_col] = find_best_ref_mv_mode(
-            cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, rf_idx,
-            bsize, nb_row, nb_col, &this_rd, mv);
-        if (r == 0 && c == 0) {
-          this_no_new_mv_rd = this_rd;
-        }
-        no_new_mv_rd += this_rd;
-        tmp_mv_mode_arr[tmp_idx] = mv_mode_arr[nb_row * stride + nb_col];
-        tmp_select_mv_arr[tmp_idx] = select_mv_arr[nb_row * stride + nb_col];
-        ++tmp_idx;
-      }
-    }
-  }
-
-  // new mv
-  mv_mode_arr[mi_row * stride + mi_col] = NEW_MV_MODE;
-  this_new_mv_rd = eval_mv_mode(
-      NEW_MV_MODE, cpi, x, gf_picture, motion_field, frame_idx, tpl_frame,
-      rf_idx, bsize, mi_row, mi_col, &select_mv_arr[mi_row * stride + mi_col]);
-  new_mv_rd = this_new_mv_rd;
-  // We start from idx = 1 because idx = 0 is evaluated as NEW_MV_MODE
-  // beforehand.
-  for (idx = 1; idx < kMvPreCheckLines; ++idx) {
-    int r;
-    for (r = 0; r <= idx; ++r) {
-      int c = idx - r;
-      int nb_row = mi_row + r * mi_height;
-      int nb_col = mi_col + c * mi_width;
-      if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) {
-        double this_rd;
-        int_mv *mv = &select_mv_arr[nb_row * stride + nb_col];
-        mv_mode_arr[nb_row * stride + nb_col] = find_best_ref_mv_mode(
-            cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, rf_idx,
-            bsize, nb_row, nb_col, &this_rd, mv);
-        new_mv_rd += this_rd;
-      }
-    }
-  }
-
-  // update best_mv_mode
-  tmp_idx = 0;
-  if (no_new_mv_rd < new_mv_rd) {
-    for (idx = 0; idx < kMvPreCheckLines; ++idx) {
-      int r;
-      for (r = 0; r <= idx; ++r) {
-        int c = idx - r;
-        int nb_row = mi_row + r * mi_height;
-        int nb_col = mi_col + c * mi_width;
-        if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) {
-          mv_mode_arr[nb_row * stride + nb_col] = tmp_mv_mode_arr[tmp_idx];
-          select_mv_arr[nb_row * stride + nb_col] = tmp_select_mv_arr[tmp_idx];
-          ++tmp_idx;
-        }
-      }
-    }
-    rd_diff_arr[mi_row * stride + mi_col] = 0;
-  } else {
-    rd_diff_arr[mi_row * stride + mi_col] =
-        (no_new_mv_rd - this_no_new_mv_rd) - (new_mv_rd - this_new_mv_rd);
-  }
-}
-
-static void predict_mv_mode_arr(VP9_COMP *cpi, MACROBLOCK *x,
-                                GF_PICTURE *gf_picture,
-                                MotionField *motion_field, int frame_idx,
-                                TplDepFrame *tpl_frame, int rf_idx,
-                                BLOCK_SIZE bsize) {
-  const int mi_height = num_8x8_blocks_high_lookup[bsize];
-  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
-  const int unit_rows = tpl_frame->mi_rows / mi_height;
-  const int unit_cols = tpl_frame->mi_cols / mi_width;
-  const int max_diagonal_lines = unit_rows + unit_cols - 1;
-  int idx;
-  for (idx = 0; idx < max_diagonal_lines; ++idx) {
-    int r;
-    for (r = VPXMAX(idx - unit_cols + 1, 0); r <= VPXMIN(idx, unit_rows - 1);
-         ++r) {
-      int c = idx - r;
-      int mi_row = r * mi_height;
-      int mi_col = c * mi_width;
-      assert(c >= 0 && c < unit_cols);
-      assert(mi_row >= 0 && mi_row < tpl_frame->mi_rows);
-      assert(mi_col >= 0 && mi_col < tpl_frame->mi_cols);
-      predict_mv_mode(cpi, x, gf_picture, motion_field, frame_idx, tpl_frame,
-                      rf_idx, bsize, mi_row, mi_col);
-    }
-  }
-}
-
-static void do_motion_search(VP9_COMP *cpi, ThreadData *td,
-                             MotionField *motion_field, int frame_idx,
-                             YV12_BUFFER_CONFIG *ref_frame, BLOCK_SIZE bsize,
-                             int mi_row, int mi_col) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCK *x = &td->mb;
-  MACROBLOCKD *xd = &x->e_mbd;
-  const int mb_y_offset =
-      mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
-  assert(ref_frame != NULL);
-  set_mv_limits(cm, x, mi_row, mi_col);
-  {
-    int_mv mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col);
-    uint8_t *cur_frame_buf = xd->cur_buf->y_buffer + mb_y_offset;
-    uint8_t *ref_frame_buf = ref_frame->y_buffer + mb_y_offset;
-    const int stride = xd->cur_buf->y_stride;
-    full_pixel_motion_search(cpi, td, motion_field, frame_idx, cur_frame_buf,
-                             ref_frame_buf, stride, bsize, mi_row, mi_col,
-                             &mv.as_mv);
-    sub_pixel_motion_search(cpi, td, cur_frame_buf, ref_frame_buf, stride,
-                            bsize, &mv.as_mv);
-    vp9_motion_field_mi_set_mv(motion_field, mi_row, mi_col, mv);
-  }
-}
-
-static void build_motion_field(
-    VP9_COMP *cpi, int frame_idx,
-    YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES], BLOCK_SIZE bsize) {
-  VP9_COMMON *cm = &cpi->common;
-  ThreadData *td = &cpi->td;
-  TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
-  const int mi_height = num_8x8_blocks_high_lookup[bsize];
-  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
-  const int pw = num_4x4_blocks_wide_lookup[bsize] << 2;
-  const int ph = num_4x4_blocks_high_lookup[bsize] << 2;
-  int mi_row, mi_col;
-  int rf_idx;
-
-  tpl_frame->lambda = (pw * ph) >> 2;
-  assert(pw * ph == tpl_frame->lambda << 2);
-
-  for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
-    MotionField *motion_field = vp9_motion_field_info_get_motion_field(
-        &cpi->motion_field_info, frame_idx, rf_idx, bsize);
-    if (ref_frame[rf_idx] == NULL) {
-      continue;
-    }
-    vp9_motion_field_reset_mvs(motion_field);
-    for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {
-      for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {
-        do_motion_search(cpi, td, motion_field, frame_idx, ref_frame[rf_idx],
-                         bsize, mi_row, mi_col);
-      }
-    }
-  }
-}
-#endif  // CONFIG_NON_GREEDY_MV
-
-static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture,
-                              int frame_idx, BLOCK_SIZE bsize) {
-  TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
-  YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame;
-  YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES] = { NULL, NULL, NULL };
-
-  VP9_COMMON *cm = &cpi->common;
-  struct scale_factors sf;
-  int rdmult, idx;
-  ThreadData *td = &cpi->td;
-  MACROBLOCK *x = &td->mb;
-  MACROBLOCKD *xd = &x->e_mbd;
-  int mi_row, mi_col;
-
-#if CONFIG_VP9_HIGHBITDEPTH
-  DECLARE_ALIGNED(16, uint16_t, predictor16[32 * 32 * 3]);
-  DECLARE_ALIGNED(16, uint8_t, predictor8[32 * 32 * 3]);
-  uint8_t *predictor;
-#else
-  DECLARE_ALIGNED(16, uint8_t, predictor[32 * 32 * 3]);
-#endif
-  DECLARE_ALIGNED(16, int16_t, src_diff[32 * 32]);
-  DECLARE_ALIGNED(16, tran_low_t, coeff[32 * 32]);
-  DECLARE_ALIGNED(16, tran_low_t, qcoeff[32 * 32]);
-  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
-
-  const TX_SIZE tx_size = max_txsize_lookup[bsize];
-  const int mi_height = num_8x8_blocks_high_lookup[bsize];
-  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
-  int64_t recon_error, sse;
-#if CONFIG_NON_GREEDY_MV
-  int square_block_idx;
-  int rf_idx;
-#endif
-
-  // Setup scaling factor
-#if CONFIG_VP9_HIGHBITDEPTH
-  vp9_setup_scale_factors_for_frame(
-      &sf, this_frame->y_crop_width, this_frame->y_crop_height,
-      this_frame->y_crop_width, this_frame->y_crop_height,
-      cpi->common.use_highbitdepth);
-
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-    predictor = CONVERT_TO_BYTEPTR(predictor16);
-  else
-    predictor = predictor8;
-#else
-  vp9_setup_scale_factors_for_frame(
-      &sf, this_frame->y_crop_width, this_frame->y_crop_height,
-      this_frame->y_crop_width, this_frame->y_crop_height);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-  // Prepare reference frame pointers. If any reference frame slot is
-  // unavailable, the pointer will be set to Null.
-  for (idx = 0; idx < MAX_INTER_REF_FRAMES; ++idx) {
-    int rf_idx = gf_picture[frame_idx].ref_frame[idx];
-    if (rf_idx != -1) ref_frame[idx] = gf_picture[rf_idx].frame;
-  }
-
-  xd->mi = cm->mi_grid_visible;
-  xd->mi[0] = cm->mi;
-  xd->cur_buf = this_frame;
-
-  // Get rd multiplier set up.
-  rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, tpl_frame->base_qindex);
-  set_error_per_bit(&cpi->td.mb, rdmult);
-  vp9_initialize_me_consts(cpi, &cpi->td.mb, tpl_frame->base_qindex);
-
-  tpl_frame->is_valid = 1;
-
-  cm->base_qindex = tpl_frame->base_qindex;
-  vp9_frame_init_quantizer(cpi);
-
-#if CONFIG_NON_GREEDY_MV
-  for (square_block_idx = 0; square_block_idx < SQUARE_BLOCK_SIZES;
-       ++square_block_idx) {
-    BLOCK_SIZE square_bsize = square_block_idx_to_bsize(square_block_idx);
-    build_motion_field(cpi, frame_idx, ref_frame, square_bsize);
-  }
-  for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
-    int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx];
-    if (ref_frame_idx != -1) {
-      MotionField *motion_field = vp9_motion_field_info_get_motion_field(
-          &cpi->motion_field_info, frame_idx, rf_idx, bsize);
-      predict_mv_mode_arr(cpi, x, gf_picture, motion_field, frame_idx,
-                          tpl_frame, rf_idx, bsize);
-    }
-  }
-#endif
-
-  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {
-    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {
-      mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, tpl_frame,
-                      src_diff, coeff, qcoeff, dqcoeff, mi_row, mi_col, bsize,
-                      tx_size, ref_frame, predictor, &recon_error, &sse);
-      // Motion flow dependency dispenser.
-      tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize,
-                      tpl_frame->stride);
-
-      tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col,
-                       bsize);
-    }
-  }
-}
-
-#if CONFIG_NON_GREEDY_MV
-#define DUMP_TPL_STATS 0
-#if DUMP_TPL_STATS
-static void dump_buf(uint8_t *buf, int stride, int row, int col, int h, int w) {
-  int i, j;
-  printf("%d %d\n", h, w);
-  for (i = 0; i < h; ++i) {
-    for (j = 0; j < w; ++j) {
-      printf("%d ", buf[(row + i) * stride + col + j]);
-    }
-  }
-  printf("\n");
-}
-
-static void dump_frame_buf(const YV12_BUFFER_CONFIG *frame_buf) {
-  dump_buf(frame_buf->y_buffer, frame_buf->y_stride, 0, 0, frame_buf->y_height,
-           frame_buf->y_width);
-  dump_buf(frame_buf->u_buffer, frame_buf->uv_stride, 0, 0,
-           frame_buf->uv_height, frame_buf->uv_width);
-  dump_buf(frame_buf->v_buffer, frame_buf->uv_stride, 0, 0,
-           frame_buf->uv_height, frame_buf->uv_width);
-}
-
-static void dump_tpl_stats(const VP9_COMP *cpi, int tpl_group_frames,
-                           const GF_GROUP *gf_group,
-                           const GF_PICTURE *gf_picture, BLOCK_SIZE bsize) {
-  int frame_idx;
-  const VP9_COMMON *cm = &cpi->common;
-  int rf_idx;
-  for (frame_idx = 1; frame_idx < tpl_group_frames; ++frame_idx) {
-    for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
-      const TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
-      int mi_row, mi_col;
-      int ref_frame_idx;
-      const int mi_height = num_8x8_blocks_high_lookup[bsize];
-      const int mi_width = num_8x8_blocks_wide_lookup[bsize];
-      ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx];
-      if (ref_frame_idx != -1) {
-        YV12_BUFFER_CONFIG *ref_frame_buf = gf_picture[ref_frame_idx].frame;
-        const int gf_frame_offset = gf_group->frame_gop_index[frame_idx];
-        const int ref_gf_frame_offset =
-            gf_group->frame_gop_index[ref_frame_idx];
-        printf("=\n");
-        printf(
-            "frame_idx %d mi_rows %d mi_cols %d bsize %d ref_frame_idx %d "
-            "rf_idx %d gf_frame_offset %d ref_gf_frame_offset %d\n",
-            frame_idx, cm->mi_rows, cm->mi_cols, mi_width * MI_SIZE,
-            ref_frame_idx, rf_idx, gf_frame_offset, ref_gf_frame_offset);
-        for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) {
-          for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) {
-            if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) {
-              int_mv mv = vp9_motion_field_info_get_mv(&cpi->motion_field_info,
-                                                       frame_idx, rf_idx, bsize,
-                                                       mi_row, mi_col);
-              printf("%d %d %d %d\n", mi_row, mi_col, mv.as_mv.row,
-                     mv.as_mv.col);
-            }
-          }
-        }
-        for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) {
-          for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) {
-            if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) {
-              const TplDepStats *tpl_ptr =
-                  &tpl_frame
-                       ->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col];
-              printf("%f ", tpl_ptr->feature_score);
-            }
-          }
-        }
-        printf("\n");
-
-        for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {
-          for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {
-            const int mv_mode =
-                tpl_frame
-                    ->mv_mode_arr[rf_idx][mi_row * tpl_frame->stride + mi_col];
-            printf("%d ", mv_mode);
-          }
-        }
-        printf("\n");
-
-        dump_frame_buf(gf_picture[frame_idx].frame);
-        dump_frame_buf(ref_frame_buf);
-      }
-    }
-  }
-}
-#endif  // DUMP_TPL_STATS
-#endif  // CONFIG_NON_GREEDY_MV
-
-static void init_tpl_buffer(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-  int frame;
-
-  const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
-  const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows);
-#if CONFIG_NON_GREEDY_MV
-  int rf_idx;
-
-  vpx_free(cpi->select_mv_arr);
-  CHECK_MEM_ERROR(
-      cm, cpi->select_mv_arr,
-      vpx_calloc(mi_rows * mi_cols * 4, sizeof(*cpi->select_mv_arr)));
-#endif
-
-  // TODO(jingning): Reduce the actual memory use for tpl model build up.
-  for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) {
-    if (cpi->tpl_stats[frame].width >= mi_cols &&
-        cpi->tpl_stats[frame].height >= mi_rows &&
-        cpi->tpl_stats[frame].tpl_stats_ptr)
-      continue;
-
-#if CONFIG_NON_GREEDY_MV
-    for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
-      vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]);
-      CHECK_MEM_ERROR(
-          cm, cpi->tpl_stats[frame].mv_mode_arr[rf_idx],
-          vpx_calloc(mi_rows * mi_cols * 4,
-                     sizeof(*cpi->tpl_stats[frame].mv_mode_arr[rf_idx])));
-      vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]);
-      CHECK_MEM_ERROR(
-          cm, cpi->tpl_stats[frame].rd_diff_arr[rf_idx],
-          vpx_calloc(mi_rows * mi_cols * 4,
-                     sizeof(*cpi->tpl_stats[frame].rd_diff_arr[rf_idx])));
-    }
-#endif
-    vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr);
-    CHECK_MEM_ERROR(cm, cpi->tpl_stats[frame].tpl_stats_ptr,
-                    vpx_calloc(mi_rows * mi_cols,
-                               sizeof(*cpi->tpl_stats[frame].tpl_stats_ptr)));
-    cpi->tpl_stats[frame].is_valid = 0;
-    cpi->tpl_stats[frame].width = mi_cols;
-    cpi->tpl_stats[frame].height = mi_rows;
-    cpi->tpl_stats[frame].stride = mi_cols;
-    cpi->tpl_stats[frame].mi_rows = cm->mi_rows;
-    cpi->tpl_stats[frame].mi_cols = cm->mi_cols;
-  }
-
-  for (frame = 0; frame < REF_FRAMES; ++frame) {
-    cpi->enc_frame_buf[frame].mem_valid = 0;
-    cpi->enc_frame_buf[frame].released = 1;
-  }
-}
-
-static void free_tpl_buffer(VP9_COMP *cpi) {
-  int frame;
-#if CONFIG_NON_GREEDY_MV
-  vp9_free_motion_field_info(&cpi->motion_field_info);
-  vpx_free(cpi->select_mv_arr);
-#endif
-  for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) {
-#if CONFIG_NON_GREEDY_MV
-    int rf_idx;
-    for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
-      vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]);
-      vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]);
-    }
-#endif
-    vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr);
-    cpi->tpl_stats[frame].is_valid = 0;
-  }
-}
-
-static void setup_tpl_stats(VP9_COMP *cpi) {
-  GF_PICTURE gf_picture[MAX_ARF_GOP_SIZE];
-  const GF_GROUP *gf_group = &cpi->twopass.gf_group;
-  int tpl_group_frames = 0;
-  int frame_idx;
-  cpi->tpl_bsize = BLOCK_32X32;
-
-  init_gop_frames(cpi, gf_picture, gf_group, &tpl_group_frames);
-
-  init_tpl_stats(cpi);
-
-  // Backward propagation from tpl_group_frames to 1.
-  for (frame_idx = tpl_group_frames - 1; frame_idx > 0; --frame_idx) {
-    if (gf_picture[frame_idx].update_type == USE_BUF_FRAME) continue;
-    mc_flow_dispenser(cpi, gf_picture, frame_idx, cpi->tpl_bsize);
-  }
-#if CONFIG_NON_GREEDY_MV
-  cpi->tpl_ready = 1;
-#if DUMP_TPL_STATS
-  dump_tpl_stats(cpi, tpl_group_frames, gf_group, gf_picture, cpi->tpl_bsize);
-#endif  // DUMP_TPL_STATS
-#endif  // CONFIG_NON_GREEDY_MV
-}
-
-#if !CONFIG_REALTIME_ONLY
-#if CONFIG_RATE_CTRL
-static void copy_frame_counts(const FRAME_COUNTS *input_counts,
-                              FRAME_COUNTS *output_counts) {
-  int i, j, k, l, m, n;
-  for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) {
-    for (j = 0; j < INTRA_MODES; ++j) {
-      output_counts->y_mode[i][j] = input_counts->y_mode[i][j];
-    }
-  }
-  for (i = 0; i < INTRA_MODES; ++i) {
-    for (j = 0; j < INTRA_MODES; ++j) {
-      output_counts->uv_mode[i][j] = input_counts->uv_mode[i][j];
-    }
-  }
-  for (i = 0; i < PARTITION_CONTEXTS; ++i) {
-    for (j = 0; j < PARTITION_TYPES; ++j) {
-      output_counts->partition[i][j] = input_counts->partition[i][j];
-    }
-  }
-  for (i = 0; i < TX_SIZES; ++i) {
-    for (j = 0; j < PLANE_TYPES; ++j) {
-      for (k = 0; k < REF_TYPES; ++k) {
-        for (l = 0; l < COEF_BANDS; ++l) {
-          for (m = 0; m < COEFF_CONTEXTS; ++m) {
-            output_counts->eob_branch[i][j][k][l][m] =
-                input_counts->eob_branch[i][j][k][l][m];
-            for (n = 0; n < UNCONSTRAINED_NODES + 1; ++n) {
-              output_counts->coef[i][j][k][l][m][n] =
-                  input_counts->coef[i][j][k][l][m][n];
-            }
-          }
-        }
-      }
-    }
-  }
-  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) {
-    for (j = 0; j < SWITCHABLE_FILTERS; ++j) {
-      output_counts->switchable_interp[i][j] =
-          input_counts->switchable_interp[i][j];
-    }
-  }
-  for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
-    for (j = 0; j < INTER_MODES; ++j) {
-      output_counts->inter_mode[i][j] = input_counts->inter_mode[i][j];
-    }
-  }
-  for (i = 0; i < INTRA_INTER_CONTEXTS; ++i) {
-    for (j = 0; j < 2; ++j) {
-      output_counts->intra_inter[i][j] = input_counts->intra_inter[i][j];
-    }
-  }
-  for (i = 0; i < COMP_INTER_CONTEXTS; ++i) {
-    for (j = 0; j < 2; ++j) {
-      output_counts->comp_inter[i][j] = input_counts->comp_inter[i][j];
-    }
-  }
-  for (i = 0; i < REF_CONTEXTS; ++i) {
-    for (j = 0; j < 2; ++j) {
-      for (k = 0; k < 2; ++k) {
-        output_counts->single_ref[i][j][k] = input_counts->single_ref[i][j][k];
-      }
-    }
-  }
-  for (i = 0; i < REF_CONTEXTS; ++i) {
-    for (j = 0; j < 2; ++j) {
-      output_counts->comp_ref[i][j] = input_counts->comp_ref[i][j];
-    }
-  }
-  for (i = 0; i < SKIP_CONTEXTS; ++i) {
-    for (j = 0; j < 2; ++j) {
-      output_counts->skip[i][j] = input_counts->skip[i][j];
-    }
-  }
-  for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
-    for (j = 0; j < TX_SIZES; j++) {
-      output_counts->tx.p32x32[i][j] = input_counts->tx.p32x32[i][j];
-    }
-    for (j = 0; j < TX_SIZES - 1; j++) {
-      output_counts->tx.p16x16[i][j] = input_counts->tx.p16x16[i][j];
-    }
-    for (j = 0; j < TX_SIZES - 2; j++) {
-      output_counts->tx.p8x8[i][j] = input_counts->tx.p8x8[i][j];
-    }
-  }
-  for (i = 0; i < TX_SIZES; i++) {
-    output_counts->tx.tx_totals[i] = input_counts->tx.tx_totals[i];
-  }
-  for (i = 0; i < MV_JOINTS; i++) {
-    output_counts->mv.joints[i] = input_counts->mv.joints[i];
-  }
-  for (k = 0; k < 2; k++) {
-    nmv_component_counts *const comps = &output_counts->mv.comps[k];
-    const nmv_component_counts *const comps_t = &input_counts->mv.comps[k];
-    for (i = 0; i < 2; i++) {
-      comps->sign[i] = comps_t->sign[i];
-      comps->class0_hp[i] = comps_t->class0_hp[i];
-      comps->hp[i] = comps_t->hp[i];
-    }
-    for (i = 0; i < MV_CLASSES; i++) {
-      comps->classes[i] = comps_t->classes[i];
-    }
-    for (i = 0; i < CLASS0_SIZE; i++) {
-      comps->class0[i] = comps_t->class0[i];
-      for (j = 0; j < MV_FP_SIZE; j++) {
-        comps->class0_fp[i][j] = comps_t->class0_fp[i][j];
-      }
-    }
-    for (i = 0; i < MV_OFFSET_BITS; i++) {
-      for (j = 0; j < 2; j++) {
-        comps->bits[i][j] = comps_t->bits[i][j];
-      }
-    }
-    for (i = 0; i < MV_FP_SIZE; i++) {
-      comps->fp[i] = comps_t->fp[i];
-    }
-  }
-}
-
-static void yv12_buffer_to_image_buffer(const YV12_BUFFER_CONFIG *yv12_buffer,
-                                        IMAGE_BUFFER *image_buffer) {
-  const uint8_t *src_buf_ls[3] = { yv12_buffer->y_buffer, yv12_buffer->u_buffer,
-                                   yv12_buffer->v_buffer };
-  const int src_stride_ls[3] = { yv12_buffer->y_stride, yv12_buffer->uv_stride,
-                                 yv12_buffer->uv_stride };
-  const int w_ls[3] = { yv12_buffer->y_crop_width, yv12_buffer->uv_crop_width,
-                        yv12_buffer->uv_crop_width };
-  const int h_ls[3] = { yv12_buffer->y_crop_height, yv12_buffer->uv_crop_height,
-                        yv12_buffer->uv_crop_height };
-  int plane;
-  for (plane = 0; plane < 3; ++plane) {
-    const int src_stride = src_stride_ls[plane];
-    const int w = w_ls[plane];
-    const int h = h_ls[plane];
-    const uint8_t *src_buf = src_buf_ls[plane];
-    uint8_t *dst_buf = image_buffer->plane_buffer[plane];
-    int r;
-    assert(image_buffer->plane_width[plane] == w);
-    assert(image_buffer->plane_height[plane] == h);
-    for (r = 0; r < h; ++r) {
-      memcpy(dst_buf, src_buf, sizeof(*src_buf) * w);
-      src_buf += src_stride;
-      dst_buf += w;
-    }
-  }
-}
-#endif  // CONFIG_RATE_CTRL
-static void update_encode_frame_result(
-    int ref_frame_flags, FRAME_UPDATE_TYPE update_type,
-    const YV12_BUFFER_CONFIG *source_frame, const RefCntBuffer *coded_frame_buf,
-    RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int quantize_index,
-    uint32_t bit_depth, uint32_t input_bit_depth, const FRAME_COUNTS *counts,
-#if CONFIG_RATE_CTRL
-    const PARTITION_INFO *partition_info,
-    const MOTION_VECTOR_INFO *motion_vector_info,
-#endif  // CONFIG_RATE_CTRL
-    ENCODE_FRAME_RESULT *encode_frame_result) {
-#if CONFIG_RATE_CTRL
-  PSNR_STATS psnr;
-#if CONFIG_VP9_HIGHBITDEPTH
-  vpx_calc_highbd_psnr(source_frame, coded_frame_buf->buf, &psnr, bit_depth,
-                       input_bit_depth);
-#else   // CONFIG_VP9_HIGHBITDEPTH
-  (void)bit_depth;
-  (void)input_bit_depth;
-  vpx_calc_psnr(source_frame, &coded_frame_buf->buf, &psnr);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-  encode_frame_result->frame_coding_index = coded_frame_buf->frame_coding_index;
-
-  if (update_type != KF_UPDATE) {
-    const VP9_REFFRAME inter_ref_flags[MAX_INTER_REF_FRAMES] = { VP9_LAST_FLAG,
-                                                                 VP9_GOLD_FLAG,
-                                                                 VP9_ALT_FLAG };
-    int i;
-    for (i = 0; i < MAX_INTER_REF_FRAMES; ++i) {
-      assert(ref_frame_bufs[i] != NULL);
-      encode_frame_result->ref_frame_coding_indexes[i] =
-          ref_frame_bufs[i]->frame_coding_index;
-      encode_frame_result->ref_frame_valid_list[i] =
-          (ref_frame_flags & inter_ref_flags[i]) != 0;
-    }
-  } else {
-    // No reference frame is available when this is a key frame.
-    int i;
-    for (i = 0; i < MAX_INTER_REF_FRAMES; ++i) {
-      encode_frame_result->ref_frame_coding_indexes[i] = -1;
-      encode_frame_result->ref_frame_valid_list[i] = 0;
-    }
-  }
-  encode_frame_result->psnr = psnr.psnr[0];
-  encode_frame_result->sse = psnr.sse[0];
-  copy_frame_counts(counts, &encode_frame_result->frame_counts);
-  encode_frame_result->partition_info = partition_info;
-  encode_frame_result->motion_vector_info = motion_vector_info;
-  if (encode_frame_result->coded_frame.allocated) {
-    yv12_buffer_to_image_buffer(&coded_frame_buf->buf,
-                                &encode_frame_result->coded_frame);
-  }
-#else   // CONFIG_RATE_CTRL
-  (void)ref_frame_flags;
-  (void)bit_depth;
-  (void)input_bit_depth;
-  (void)source_frame;
-  (void)coded_frame_buf;
-  (void)ref_frame_bufs;
-  (void)counts;
-#endif  // CONFIG_RATE_CTRL
-  encode_frame_result->show_idx = coded_frame_buf->frame_index;
-  encode_frame_result->update_type = update_type;
-  encode_frame_result->quantize_index = quantize_index;
-}
-#endif  // !CONFIG_REALTIME_ONLY
-
-void vp9_init_encode_frame_result(ENCODE_FRAME_RESULT *encode_frame_result) {
-  encode_frame_result->show_idx = -1;  // Actual encoding doesn't happen.
-#if CONFIG_RATE_CTRL
-  encode_frame_result->frame_coding_index = -1;
-  vp9_zero(encode_frame_result->coded_frame);
-  encode_frame_result->coded_frame.allocated = 0;
-#endif  // CONFIG_RATE_CTRL
-}
-
-int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
-                            size_t *size, uint8_t *dest, int64_t *time_stamp,
-                            int64_t *time_end, int flush,
-                            ENCODE_FRAME_RESULT *encode_frame_result) {
-  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
-  VP9_COMMON *const cm = &cpi->common;
-  BufferPool *const pool = cm->buffer_pool;
-  RATE_CONTROL *const rc = &cpi->rc;
-  struct vpx_usec_timer cmptimer;
-  YV12_BUFFER_CONFIG *force_src_buffer = NULL;
-  struct lookahead_entry *last_source = NULL;
-  struct lookahead_entry *source = NULL;
-  int arf_src_index;
-  const int gf_group_index = cpi->twopass.gf_group.index;
-  int i;
-
-  if (is_one_pass_cbr_svc(cpi)) {
-    vp9_one_pass_cbr_svc_start_layer(cpi);
-  }
-
-  vpx_usec_timer_start(&cmptimer);
-
-  vp9_set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV);
-
-  // Is multi-arf enabled.
-  // Note that at the moment multi_arf is only configured for 2 pass VBR and
-  // will not work properly with svc.
-  // Enable the Jingning's new "multi_layer_arf" code if "enable_auto_arf"
-  // is greater than or equal to 2.
-  if ((oxcf->pass == 2) && !cpi->use_svc && (cpi->oxcf.enable_auto_arf >= 2))
-    cpi->multi_layer_arf = 1;
-  else
-    cpi->multi_layer_arf = 0;
-
-  // Normal defaults
-  cm->reset_frame_context = 0;
-  cm->refresh_frame_context = 1;
-  if (!is_one_pass_cbr_svc(cpi)) {
-    cpi->refresh_last_frame = 1;
-    cpi->refresh_golden_frame = 0;
-    cpi->refresh_alt_ref_frame = 0;
-  }
-
-  // Should we encode an arf frame.
-  arf_src_index = get_arf_src_index(cpi);
-
-  if (arf_src_index) {
-    for (i = 0; i <= arf_src_index; ++i) {
-      struct lookahead_entry *e = vp9_lookahead_peek(cpi->lookahead, i);
-      // Avoid creating an alt-ref if there's a forced keyframe pending.
-      if (e == NULL) {
-        break;
-      } else if (e->flags == VPX_EFLAG_FORCE_KF) {
-        arf_src_index = 0;
-        flush = 1;
-        break;
-      }
-    }
-  }
-
-  // Clear arf index stack before group of pictures processing starts.
-  if (gf_group_index == 1) {
-    stack_init(cpi->twopass.gf_group.arf_index_stack, MAX_LAG_BUFFERS * 2);
-    cpi->twopass.gf_group.stack_size = 0;
-  }
-
-  if (arf_src_index) {
-    assert(arf_src_index <= rc->frames_to_key);
-    if ((source = vp9_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) {
-      cpi->alt_ref_source = source;
-
-#if !CONFIG_REALTIME_ONLY
-      if ((oxcf->mode != REALTIME) && (oxcf->arnr_max_frames > 0) &&
-          (oxcf->arnr_strength > 0)) {
-        int bitrate = cpi->rc.avg_frame_bandwidth / 40;
-        int not_low_bitrate = bitrate > ALT_REF_AQ_LOW_BITRATE_BOUNDARY;
-
-        int not_last_frame = (cpi->lookahead->sz - arf_src_index > 1);
-        not_last_frame |= ALT_REF_AQ_APPLY_TO_LAST_FRAME;
-
-        // Produce the filtered ARF frame.
-        vp9_temporal_filter(cpi, arf_src_index);
-        vpx_extend_frame_borders(&cpi->alt_ref_buffer);
-
-        // for small bitrates segmentation overhead usually
-        // eats all bitrate gain from enabling delta quantizers
-        if (cpi->oxcf.alt_ref_aq != 0 && not_low_bitrate && not_last_frame)
-          vp9_alt_ref_aq_setup_mode(cpi->alt_ref_aq, cpi);
-
-        force_src_buffer = &cpi->alt_ref_buffer;
-      }
-#endif
-      cm->show_frame = 0;
-      cm->intra_only = 0;
-      cpi->refresh_alt_ref_frame = 1;
-      cpi->refresh_golden_frame = 0;
-      cpi->refresh_last_frame = 0;
-      rc->is_src_frame_alt_ref = 0;
-      rc->source_alt_ref_pending = 0;
-    } else {
-      rc->source_alt_ref_pending = 0;
-    }
-  }
-
-  if (!source) {
-    // Get last frame source.
-    if (cm->current_video_frame > 0) {
-      if ((last_source = vp9_lookahead_peek(cpi->lookahead, -1)) == NULL)
-        return -1;
-    }
-
-    // Read in the source frame.
-    if (cpi->use_svc || cpi->svc.set_intra_only_frame)
-      source = vp9_svc_lookahead_pop(cpi, cpi->lookahead, flush);
-    else
-      source = vp9_lookahead_pop(cpi->lookahead, flush);
-
-    if (source != NULL) {
-      cm->show_frame = 1;
-      cm->intra_only = 0;
-      // If the flags indicate intra frame, but if the current picture is for
-      // spatial layer above first_spatial_layer_to_encode, it should not be an
-      // intra picture.
-      if ((source->flags & VPX_EFLAG_FORCE_KF) && cpi->use_svc &&
-          cpi->svc.spatial_layer_id > cpi->svc.first_spatial_layer_to_encode) {
-        source->flags &= ~(unsigned int)(VPX_EFLAG_FORCE_KF);
-      }
-
-      // Check to see if the frame should be encoded as an arf overlay.
-      check_src_altref(cpi, source);
-    }
-  }
-
-  if (source) {
-    cpi->un_scaled_source = cpi->Source =
-        force_src_buffer ? force_src_buffer : &source->img;
-
-#ifdef ENABLE_KF_DENOISE
-    // Copy of raw source for metrics calculation.
-    if (is_psnr_calc_enabled(cpi))
-      vp9_copy_and_extend_frame(cpi->Source, &cpi->raw_unscaled_source);
-#endif
-
-    cpi->unscaled_last_source = last_source != NULL ? &last_source->img : NULL;
+    cpi->unscaled_last_source = last_source != NULL ? &last_source->img : NULL;
 
     *time_stamp = source->ts_start;
     *time_end = source->ts_end;
@@ -7566,7 +6538,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
       adjust_frame_rate(cpi, source);
   }
 
-  if (is_one_pass_cbr_svc(cpi)) {
+  if (is_one_pass_svc(cpi)) {
     vp9_update_temporal_layer_framerate(cpi);
     vp9_restore_layer_context(cpi);
   }
@@ -7579,9 +6551,12 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
   cm->new_fb_idx = get_free_fb(cm);
 
   if (cm->new_fb_idx == INVALID_IDX) return -1;
-
   cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
-
+  // If the frame buffer for current frame is the same as previous frame, MV in
+  // the base layer shouldn't be used as it'll cause data race.
+  if (cpi->svc.spatial_layer_id > 0 && cm->cur_frame == cm->prev_frame) {
+    cpi->svc.use_base_mv = 0;
+  }
   // Start with a 0 size frame.
   *size = 0;
 
@@ -7589,7 +6564,13 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
 
 #if !CONFIG_REALTIME_ONLY
   if ((oxcf->pass == 2) && !cpi->use_svc) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, vp9_rc_get_second_pass_params_time);
+#endif
     vp9_rc_get_second_pass_params(cpi);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, vp9_rc_get_second_pass_params_time);
+#endif
   } else if (oxcf->pass == 1) {
     set_frame_size(cpi);
   }
@@ -7610,7 +6591,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
     pthread_mutex_init(&cpi->kmeans_mutex, NULL);
 #endif
     CHECK_MEM_ERROR(
-        cm, cpi->kmeans_data_arr,
+        &cm->error, cpi->kmeans_data_arr,
         vpx_calloc(mi_rows * mi_cols, sizeof(*cpi->kmeans_data_arr)));
     cpi->kmeans_data_stride = mi_cols;
     cpi->kmeans_data_arr_alloc = 1;
@@ -7629,13 +6610,19 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
   }
 #endif  // CONFIG_NON_GREEDY_MV
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, setup_tpl_stats_time);
+#endif
   if (gf_group_index == 1 &&
       cpi->twopass.gf_group.update_type[gf_group_index] == ARF_UPDATE &&
       cpi->sf.enable_tpl_model) {
-    init_tpl_buffer(cpi);
+    vp9_init_tpl_buffer(cpi);
     vp9_estimate_qp_gop(cpi);
-    setup_tpl_stats(cpi);
+    vp9_setup_tpl_stats(cpi);
   }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, setup_tpl_stats_time);
+#endif
 
 #if CONFIG_BITSTREAM_DEBUG
   assert(cpi->oxcf.max_threads == 0 &&
@@ -7672,8 +6659,15 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
     cpi->td.mb.inv_txfm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
     vp9_first_pass(cpi, source);
   } else if (oxcf->pass == 2 && !cpi->use_svc) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    // Accumulate 2nd pass time in 2-pass case.
+    start_timing(cpi, Pass2Encode_time);
+#endif
     Pass2Encode(cpi, size, dest, frame_flags, encode_frame_result);
     vp9_twopass_postencode_update(cpi);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, Pass2Encode_time);
+#endif
   } else if (cpi->use_svc) {
     SvcEncode(cpi, size, dest, frame_flags);
   } else {
@@ -7697,12 +6691,15 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
   }
 
   // Save layer specific state.
-  if (is_one_pass_cbr_svc(cpi) || ((cpi->svc.number_temporal_layers > 1 ||
-                                    cpi->svc.number_spatial_layers > 1) &&
-                                   oxcf->pass == 2)) {
+  if (is_one_pass_svc(cpi) || ((cpi->svc.number_temporal_layers > 1 ||
+                                cpi->svc.number_spatial_layers > 1) &&
+                               oxcf->pass == 2)) {
     vp9_save_layer_context(cpi);
   }
 
+  if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)
+    cpi->fixed_qp_onepass = 0;
+
   vpx_usec_timer_mark(&cmptimer);
   cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
 
@@ -7711,7 +6708,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
 
 #if CONFIG_INTERNAL_STATS
 
-  if (oxcf->pass != 1) {
+  if (oxcf->pass != 1 && !cpi->last_frame_dropped) {
     double samples = 0.0;
     cpi->bytes += (int)(*size);
 
@@ -7873,7 +6870,42 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
 
 #endif
 
-  if (is_one_pass_cbr_svc(cpi)) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  if (oxcf->pass == 2) end_timing(cpi, vp9_get_compressed_data_time);
+
+  // Print out timing information.
+  // Note: Use "cpi->frame_component_time[0] > 100 us" to avoid showing of
+  // show_existing_frame and lag-in-frames.
+  //  if (cpi->frame_component_time[0] > 100)
+  if (oxcf->pass == 2) {
+    uint64_t frame_total = 0, total = 0;
+    int i;
+
+    fprintf(stderr,
+            "\n Frame number: %d, Frame type: %s, Show Frame: %d, Q: %d\n",
+            cm->current_video_frame, get_frame_type_enum(cm->frame_type),
+            cm->show_frame, cm->base_qindex);
+    for (i = 0; i < kTimingComponents; i++) {
+      cpi->component_time[i] += cpi->frame_component_time[i];
+      // Use vp9_get_compressed_data_time (i = 0) as the total time.
+      if (i == 0) {
+        frame_total = cpi->frame_component_time[0];
+        total = cpi->component_time[0];
+      }
+      fprintf(stderr,
+              " %50s:  %15" PRId64 " us [%6.2f%%] (total: %15" PRId64
+              " us [%6.2f%%])\n",
+              get_component_name(i), cpi->frame_component_time[i],
+              (float)((float)cpi->frame_component_time[i] * 100.0 /
+                      (float)frame_total),
+              cpi->component_time[i],
+              (float)((float)cpi->component_time[i] * 100.0 / (float)total));
+      cpi->frame_component_time[i] = 0;
+    }
+  }
+#endif
+
+  if (is_one_pass_svc(cpi)) {
     if (cm->show_frame) {
       ++cpi->svc.spatial_layer_to_encode;
       if (cpi->svc.spatial_layer_to_encode >= cpi->svc.number_spatial_layers)
@@ -7915,12 +6947,12 @@ int vp9_get_preview_raw_frame(VP9_COMP *cpi, YV12_BUFFER_CONFIG *dest,
   }
 }
 
-int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING horiz_mode,
-                          VPX_SCALING vert_mode) {
+int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING_MODE horiz_mode,
+                          VPX_SCALING_MODE vert_mode) {
   VP9_COMMON *cm = &cpi->common;
   int hr = 0, hs = 0, vr = 0, vs = 0;
 
-  if (horiz_mode > ONETWO || vert_mode > ONETWO) return -1;
+  if (horiz_mode > VP8E_ONETWO || vert_mode > VP8E_ONETWO) return -1;
 
   Scale2Ratio(horiz_mode, &hr, &hs);
   Scale2Ratio(vert_mode, &vr, &vs);
@@ -7942,9 +6974,11 @@ int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width,
                          unsigned int height) {
   VP9_COMMON *cm = &cpi->common;
 #if CONFIG_VP9_HIGHBITDEPTH
-  update_initial_width(cpi, cm->use_highbitdepth, 1, 1);
+  update_initial_width(cpi, cm->use_highbitdepth, cpi->common.subsampling_x,
+                       cpi->common.subsampling_y);
 #else
-  update_initial_width(cpi, 0, 1, 1);
+  update_initial_width(cpi, 0, cpi->common.subsampling_x,
+                       cpi->common.subsampling_y);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #if CONFIG_VP9_TEMPORAL_DENOISING
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 7aa4bf7dca..7136f7faa3 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -14,8 +14,11 @@
 #include <stdio.h>
 
 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vpx/internal/vpx_codec_internal.h"
+#include "vpx/vpx_ext_ratectrl.h"
 #include "vpx/vp8cx.h"
+#include "vpx/vpx_tpl.h"
 #if CONFIG_INTERNAL_STATS
 #include "vpx_dsp/ssim.h"
 #endif
@@ -38,6 +41,7 @@
 #include "vp9/encoder/vp9_context_tree.h"
 #include "vp9/encoder/vp9_encodemb.h"
 #include "vp9/encoder/vp9_ethread.h"
+#include "vp9/encoder/vp9_ext_ratectrl.h"
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vp9/encoder/vp9_job_queue.h"
 #include "vp9/encoder/vp9_lookahead.h"
@@ -88,13 +92,6 @@ typedef enum {
   ENCODE_BREAKOUT_LIMITED = 2
 } ENCODE_BREAKOUT_TYPE;
 
-typedef enum {
-  NORMAL = 0,
-  FOURFIVE = 1,
-  THREEFIVE = 2,
-  ONETWO = 3
-} VPX_SCALING;
-
 typedef enum {
   // Good Quality Fast Encoding. The encoder balances quality with the amount of
   // time it takes to encode the output. Speed setting controls how fast.
@@ -147,6 +144,12 @@ typedef enum {
   kVeryHighSad = 6,
 } CONTENT_STATE_SB;
 
+typedef enum {
+  LOOPFILTER_ALL = 0,
+  LOOPFILTER_REFERENCE = 1,  // Disable loopfilter on non reference frames.
+  NO_LOOPFILTER = 2,         // Disable loopfilter on all frames.
+} LOOPFILTER_CONTROL;
+
 typedef struct VP9EncoderConfig {
   BITSTREAM_PROFILE profile;
   vpx_bit_depth_t bit_depth;     // Codec bit-depth.
@@ -265,10 +268,6 @@ typedef struct VP9EncoderConfig {
 
   vpx_fixed_buf_t two_pass_stats_in;
 
-#if CONFIG_FP_MB_STATS
-  vpx_fixed_buf_t firstpass_mb_stats_in;
-#endif
-
   vp8e_tuning tuning;
   vp9e_tune_content content;
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -283,6 +282,7 @@ typedef struct VP9EncoderConfig {
   int row_mt;
   unsigned int motion_vector_unit_test;
   int delta_q_uv;
+  int use_simple_encode_api;  // Use SimpleEncode APIs or not
 } VP9EncoderConfig;
 
 static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) {
@@ -331,15 +331,14 @@ typedef struct TplDepFrame {
 typedef struct TileDataEnc {
   TileInfo tile_info;
   int thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
-#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
   int thresh_freq_fact_prev[BLOCK_SIZES][MAX_MODES];
-#endif  // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
   int8_t mode_map[BLOCK_SIZES][MAX_MODES];
   FIRSTPASS_DATA fp_data;
   VP9RowMTSync row_mt_sync;
 
   // Used for adaptive_rd_thresh with row multithreading
   int *row_base_thresh_freq_fact;
+  MV firstpass_top_mv;
 } TileDataEnc;
 
 typedef struct RowMTInfo {
@@ -508,6 +507,7 @@ typedef struct EncFrameBuf {
 } EncFrameBuf;
 
 // Maximum operating frame buffer size needed for a GOP using ARF reference.
+// This is used to allocate the memory for TPL stats for a GOP.
 #define MAX_ARF_GOP_SIZE (2 * MAX_LAG_BUFFERS)
 #define MAX_KMEANS_GROUPS 8
 
@@ -560,19 +560,52 @@ static INLINE int gop_command_coding_frame_count(
   return gop_command->show_frame_count + gop_command->use_alt_ref;
 }
 
+// TODO(angiebird): See if we can merge this one with FrameType in
+// simple_encode.h
+typedef enum ENCODE_FRAME_TYPE {
+  ENCODE_FRAME_TYPE_KEY,
+  ENCODE_FRAME_TYPE_INTER,
+  ENCODE_FRAME_TYPE_ALTREF,
+  ENCODE_FRAME_TYPE_OVERLAY,
+  ENCODE_FRAME_TYPE_GOLDEN,
+  ENCODE_FRAME_TYPES,
+} ENCODE_FRAME_TYPE;
+
+// TODO(angiebird): Merge this function with get_frame_type_from_update_type()
+static INLINE ENCODE_FRAME_TYPE
+get_encode_frame_type(FRAME_UPDATE_TYPE update_type) {
+  switch (update_type) {
+    case KF_UPDATE: return ENCODE_FRAME_TYPE_KEY;
+    case ARF_UPDATE: return ENCODE_FRAME_TYPE_ALTREF;
+    case GF_UPDATE: return ENCODE_FRAME_TYPE_GOLDEN;
+    case OVERLAY_UPDATE: return ENCODE_FRAME_TYPE_OVERLAY;
+    case LF_UPDATE: return ENCODE_FRAME_TYPE_INTER;
+    default:
+      fprintf(stderr, "Unsupported update_type %d\n", update_type);
+      abort();
+      return ENCODE_FRAME_TYPE_INTER;
+  }
+}
+
+typedef struct RATE_QSTEP_MODEL {
+  // The rq model predicts the bit usage as follows.
+  // rate = bias - ratio * log2(q_step)
+  int ready;
+  double bias;
+  double ratio;
+} RATE_QSTEP_MODEL;
+
 typedef struct ENCODE_COMMAND {
   int use_external_quantize_index;
   int external_quantize_index;
+
+  int use_external_target_frame_bits;
+  int target_frame_bits;
+  double target_frame_bits_error_percent;
+
   GOP_COMMAND gop_command;
 } ENCODE_COMMAND;
 
-static INLINE void encode_command_init(ENCODE_COMMAND *encode_command) {
-  vp9_zero(*encode_command);
-  encode_command->use_external_quantize_index = 0;
-  encode_command->external_quantize_index = -1;
-  gop_command_off(&encode_command->gop_command);
-}
-
 static INLINE void encode_command_set_gop_command(
     ENCODE_COMMAND *encode_command, GOP_COMMAND gop_command) {
   encode_command->gop_command = gop_command;
@@ -590,11 +623,103 @@ static INLINE void encode_command_reset_external_quantize_index(
   encode_command->external_quantize_index = -1;
 }
 
+static INLINE void encode_command_set_target_frame_bits(
+    ENCODE_COMMAND *encode_command, int target_frame_bits,
+    double target_frame_bits_error_percent) {
+  encode_command->use_external_target_frame_bits = 1;
+  encode_command->target_frame_bits = target_frame_bits;
+  encode_command->target_frame_bits_error_percent =
+      target_frame_bits_error_percent;
+}
+
+static INLINE void encode_command_reset_target_frame_bits(
+    ENCODE_COMMAND *encode_command) {
+  encode_command->use_external_target_frame_bits = 0;
+  encode_command->target_frame_bits = -1;
+  encode_command->target_frame_bits_error_percent = 0;
+}
+
+static INLINE void encode_command_init(ENCODE_COMMAND *encode_command) {
+  vp9_zero(*encode_command);
+  encode_command_reset_external_quantize_index(encode_command);
+  encode_command_reset_target_frame_bits(encode_command);
+  gop_command_off(&encode_command->gop_command);
+}
+
 // Returns number of units in size of 4, if not multiple not a multiple of 4,
 // round it up. For example, size is 7, return 2.
 static INLINE int get_num_unit_4x4(int size) { return (size + 3) >> 2; }
+// Returns number of units in size of 16, if not multiple not a multiple of 16,
+// round it up. For example, size is 17, return 2.
+static INLINE int get_num_unit_16x16(int size) { return (size + 15) >> 4; }
 #endif  // CONFIG_RATE_CTRL
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+#include "vpx_ports/vpx_timer.h"
+// Adjust the following to add new components.
+typedef enum {
+  vp9_get_compressed_data_time,
+  vp9_temporal_filter_time,
+  vp9_rc_get_second_pass_params_time,
+  setup_tpl_stats_time,
+  Pass2Encode_time,
+
+  encode_with_recode_loop_time,
+  loopfilter_frame_time,
+  vp9_pack_bitstream_time,
+
+  encode_frame_internal_time,
+  rd_pick_partition_time,
+  rd_pick_sb_modes_time,
+  encode_sb_time,
+
+  vp9_rd_pick_inter_mode_sb_time,
+  vp9_rd_pick_inter_mode_sub8x8_time,
+
+  intra_mode_search_time,
+  handle_inter_mode_time,
+  single_motion_search_time,
+  joint_motion_search_time,
+  interp_filter_time,
+
+  kTimingComponents,
+} TIMING_COMPONENT;
+
+static INLINE char const *get_component_name(int index) {
+  switch (index) {
+    case vp9_get_compressed_data_time: return "vp9_get_compressed_data_time";
+    case vp9_temporal_filter_time: return "vp9_temporal_filter_time";
+    case vp9_rc_get_second_pass_params_time:
+      return "vp9_rc_get_second_pass_params_time";
+    case setup_tpl_stats_time: return "setup_tpl_stats_time";
+    case Pass2Encode_time: return "Pass2Encode_time";
+
+    case encode_with_recode_loop_time: return "encode_with_recode_loop_time";
+    case loopfilter_frame_time: return "loopfilter_frame_time";
+    case vp9_pack_bitstream_time: return "vp9_pack_bitstream_time";
+
+    case encode_frame_internal_time: return "encode_frame_internal_time";
+    case rd_pick_partition_time: return "rd_pick_partition_time";
+    case rd_pick_sb_modes_time: return "rd_pick_sb_modes_time";
+    case encode_sb_time: return "encode_sb_time";
+
+    case vp9_rd_pick_inter_mode_sb_time:
+      return "vp9_rd_pick_inter_mode_sb_time";
+    case vp9_rd_pick_inter_mode_sub8x8_time:
+      return "vp9_rd_pick_inter_mode_sub8x8_time";
+
+    case intra_mode_search_time: return "intra_mode_search_time";
+    case handle_inter_mode_time: return "handle_inter_mode_time";
+    case single_motion_search_time: return "single_motion_search_time";
+    case joint_motion_search_time: return "joint_motion_search_time";
+    case interp_filter_time: return "interp_filter_time";
+
+    default: assert(0);
+  }
+  return "error";
+}
+#endif
+
 typedef struct VP9_COMP {
   FRAME_INFO frame_info;
   QUANTS quants;
@@ -621,6 +746,8 @@ typedef struct VP9_COMP {
 
   BLOCK_SIZE tpl_bsize;
   TplDepFrame tpl_stats[MAX_ARF_GOP_SIZE];
+  // Used to store TPL stats before propagation
+  VpxTplGopStats tpl_gop_stats;
   YV12_BUFFER_CONFIG *tpl_recon_frames[REF_FRAMES];
   EncFrameBuf enc_frame_buf[REF_FRAMES];
 #if CONFIG_MULTITHREAD
@@ -643,9 +770,6 @@ typedef struct VP9_COMP {
   TileDataEnc *tile_data;
   int allocated_tiles;  // Keep track of memory allocated for tiles.
 
-  // For a still frame, this flag is set to 1 to skip partition search.
-  int partition_search_skippable_frame;
-
   int scaled_ref_idx[REFS_PER_FRAME];
   int lst_fb_idx;
   int gld_fb_idx;
@@ -679,6 +803,7 @@ typedef struct VP9_COMP {
   // Ambient reconstruction err target for force key frames
   int64_t ambient_err;
 
+  RD_CONTROL rd_ctrl;
   RD_OPT rd;
 
   CODING_CONTEXT coding_context;
@@ -722,7 +847,7 @@ typedef struct VP9_COMP {
 
   uint8_t *skin_map;
 
-  // segment threashold for encode breakout
+  // segment threshold for encode breakout
   int segment_encode_breakout[MAX_SEGMENTS];
 
   CYCLIC_REFRESH *cyclic_refresh;
@@ -737,10 +862,6 @@ typedef struct VP9_COMP {
   uint64_t time_pick_lpf;
   uint64_t time_encode_sb_row;
 
-#if CONFIG_FP_MB_STATS
-  int use_fp_mb_stats;
-#endif
-
   TWO_PASS twopass;
 
   // Force recalculation of segment_ids for each mode info
@@ -800,12 +921,15 @@ typedef struct VP9_COMP {
                     // number of MBs in the current frame when the frame is
                     // scaled.
 
+  int last_coded_width;
+  int last_coded_height;
+
   int use_svc;
 
   SVC svc;
 
   // Store frame variance info in SOURCE_VAR_BASED_PARTITION search type.
-  diff *source_diff_var;
+  Diff *source_diff_var;
   // The threshold used in SOURCE_VAR_BASED_PARTITION search type.
   unsigned int source_var_thresh;
   int frames_till_next_var_check;
@@ -892,6 +1016,8 @@ typedef struct VP9_COMP {
 
   int compute_source_sad_onepass;
 
+  int compute_frame_low_motion_onepass;
+
   LevelConstraint level_constraint;
 
   uint8_t *count_arf_frame_usage;
@@ -899,10 +1025,42 @@ typedef struct VP9_COMP {
 
   int multi_layer_arf;
   vpx_roi_map_t roi;
+
+  LOOPFILTER_CONTROL loopfilter_ctrl;
 #if CONFIG_RATE_CTRL
   ENCODE_COMMAND encode_command;
   PARTITION_INFO *partition_info;
   MOTION_VECTOR_INFO *motion_vector_info;
+  MOTION_VECTOR_INFO *fp_motion_vector_info;
+  TplDepStats *tpl_stats_info;
+
+  RATE_QSTEP_MODEL rq_model[ENCODE_FRAME_TYPES];
+#endif
+  EXT_RATECTRL ext_ratectrl;
+
+  int fixed_qp_onepass;
+
+  // Flag to keep track of dynamic change in deadline mode
+  // (good/best/realtime).
+  MODE deadline_mode_previous_frame;
+
+  // Flag to disable scene detection when rtc rate control library is used.
+  int disable_scene_detection_rtc_ratectrl;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  /*!
+   * component_time[] are initialized to zero while encoder starts.
+   */
+  uint64_t component_time[kTimingComponents];
+  /*!
+   * Stores timing for individual components between calls of start_timing()
+   * and end_timing().
+   */
+  struct vpx_usec_timer component_timer[kTimingComponents];
+  /*!
+   * frame_component_time[] are initialized to zero at beginning of each frame.
+   */
+  uint64_t frame_component_time[kTimingComponents];
 #endif
 } VP9_COMP;
 
@@ -914,7 +1072,7 @@ static INLINE void partition_info_init(struct VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   const int unit_width = get_num_unit_4x4(cpi->frame_info.frame_width);
   const int unit_height = get_num_unit_4x4(cpi->frame_info.frame_height);
-  CHECK_MEM_ERROR(cm, cpi->partition_info,
+  CHECK_MEM_ERROR(&cm->error, cpi->partition_info,
                   (PARTITION_INFO *)vpx_calloc(unit_width * unit_height,
                                                sizeof(PARTITION_INFO)));
   memset(cpi->partition_info, 0,
@@ -928,6 +1086,13 @@ static INLINE void free_partition_info(struct VP9_COMP *cpi) {
   cpi->partition_info = NULL;
 }
 
+static INLINE void reset_mv_info(MOTION_VECTOR_INFO *mv_info) {
+  mv_info->ref_frame[0] = NO_REF_FRAME;
+  mv_info->ref_frame[1] = NO_REF_FRAME;
+  mv_info->mv[0].as_int = INVALID_MV;
+  mv_info->mv[1].as_int = INVALID_MV;
+}
+
 // Allocates memory for the motion vector information.
 // The unit size is each 4x4 block.
 // Only called once in vp9_create_compressor().
@@ -935,7 +1100,7 @@ static INLINE void motion_vector_info_init(struct VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   const int unit_width = get_num_unit_4x4(cpi->frame_info.frame_width);
   const int unit_height = get_num_unit_4x4(cpi->frame_info.frame_height);
-  CHECK_MEM_ERROR(cm, cpi->motion_vector_info,
+  CHECK_MEM_ERROR(&cm->error, cpi->motion_vector_info,
                   (MOTION_VECTOR_INFO *)vpx_calloc(unit_width * unit_height,
                                                    sizeof(MOTION_VECTOR_INFO)));
   memset(cpi->motion_vector_info, 0,
@@ -949,6 +1114,53 @@ static INLINE void free_motion_vector_info(struct VP9_COMP *cpi) {
   cpi->motion_vector_info = NULL;
 }
 
+// Allocates memory for the tpl stats information.
+// Only called once in vp9_create_compressor().
+static INLINE void tpl_stats_info_init(struct VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  CHECK_MEM_ERROR(
+      &cm->error, cpi->tpl_stats_info,
+      (TplDepStats *)vpx_calloc(MAX_LAG_BUFFERS, sizeof(TplDepStats)));
+  memset(cpi->tpl_stats_info, 0, MAX_LAG_BUFFERS * sizeof(TplDepStats));
+}
+
+// Frees memory of the tpl stats information.
+// Only called once in dealloc_compressor_data().
+static INLINE void free_tpl_stats_info(struct VP9_COMP *cpi) {
+  vpx_free(cpi->tpl_stats_info);
+  cpi->tpl_stats_info = NULL;
+}
+
+// Allocates memory for the first pass motion vector information.
+// The unit size is each 16x16 block.
+// Only called once in vp9_create_compressor().
+static INLINE void fp_motion_vector_info_init(struct VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int unit_width = get_num_unit_16x16(cpi->frame_info.frame_width);
+  const int unit_height = get_num_unit_16x16(cpi->frame_info.frame_height);
+  CHECK_MEM_ERROR(&cm->error, cpi->fp_motion_vector_info,
+                  (MOTION_VECTOR_INFO *)vpx_calloc(unit_width * unit_height,
+                                                   sizeof(MOTION_VECTOR_INFO)));
+}
+
+static INLINE void fp_motion_vector_info_reset(
+    int frame_width, int frame_height,
+    MOTION_VECTOR_INFO *fp_motion_vector_info) {
+  const int unit_width = get_num_unit_16x16(frame_width);
+  const int unit_height = get_num_unit_16x16(frame_height);
+  int i;
+  for (i = 0; i < unit_width * unit_height; ++i) {
+    reset_mv_info(fp_motion_vector_info + i);
+  }
+}
+
+// Frees memory of the first pass motion vector information.
+// Only called once in dealloc_compressor_data().
+static INLINE void free_fp_motion_vector_info(struct VP9_COMP *cpi) {
+  vpx_free(cpi->fp_motion_vector_info);
+  cpi->fp_motion_vector_info = NULL;
+}
+
 // This is the c-version counter part of ImageBuffer
 typedef struct IMAGE_BUFFER {
   int allocated;
@@ -956,6 +1168,17 @@ typedef struct IMAGE_BUFFER {
   int plane_height[3];
   uint8_t *plane_buffer[3];
 } IMAGE_BUFFER;
+
+#define RATE_CTRL_MAX_RECODE_NUM 7
+
+typedef struct RATE_QINDEX_HISTORY {
+  int recode_count;
+  int q_index_history[RATE_CTRL_MAX_RECODE_NUM];
+  int rate_history[RATE_CTRL_MAX_RECODE_NUM];
+  int q_index_high;
+  int q_index_low;
+} RATE_QINDEX_HISTORY;
+
 #endif  // CONFIG_RATE_CTRL
 
 typedef struct ENCODE_FRAME_RESULT {
@@ -970,7 +1193,9 @@ typedef struct ENCODE_FRAME_RESULT {
   FRAME_COUNTS frame_counts;
   const PARTITION_INFO *partition_info;
   const MOTION_VECTOR_INFO *motion_vector_info;
+  const TplDepStats *tpl_stats_info;
   IMAGE_BUFFER coded_frame;
+  RATE_QINDEX_HISTORY rq_history;
 #endif  // CONFIG_RATE_CTRL
   int quantize_index;
 } ENCODE_FRAME_RESULT;
@@ -1018,8 +1243,8 @@ int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows,
 int vp9_get_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows,
                        int cols);
 
-int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING horiz_mode,
-                          VPX_SCALING vert_mode);
+int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING_MODE horiz_mode,
+                          VPX_SCALING_MODE vert_mode);
 
 int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width,
                          unsigned int height);
@@ -1062,6 +1287,13 @@ static INLINE int frame_is_kf_gf_arf(const VP9_COMP *cpi) {
          (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref);
 }
 
+static INLINE int ref_frame_to_flag(int8_t ref_frame) {
+  static const int kVp9RefFlagList[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
+                                          VP9_ALT_FLAG };
+  assert(ref_frame >= LAST_FRAME && ref_frame <= ALTREF_FRAME);
+  return kVp9RefFlagList[ref_frame];
+}
+
 static INLINE int get_ref_frame_map_idx(const VP9_COMP *cpi,
                                         MV_REFERENCE_FRAME ref_frame) {
   if (ref_frame == LAST_FRAME) {
@@ -1146,8 +1378,21 @@ void vp9_scale_references(VP9_COMP *cpi);
 
 void vp9_update_reference_frames(VP9_COMP *cpi);
 
+void vp9_get_ref_frame_info(FRAME_UPDATE_TYPE update_type, int ref_frame_flags,
+                            RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES],
+                            int *ref_frame_coding_indexes,
+                            int *ref_frame_valid_list);
+
 void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv);
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+                                             YV12_BUFFER_CONFIG *dst, int bd);
+#else
+void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+                                             YV12_BUFFER_CONFIG *dst);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(
     VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
     YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type,
@@ -1159,7 +1404,7 @@ YV12_BUFFER_CONFIG *vp9_scale_if_required(
 
 void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags);
 
-static INLINE int is_one_pass_cbr_svc(const struct VP9_COMP *const cpi) {
+static INLINE int is_one_pass_svc(const struct VP9_COMP *const cpi) {
   return (cpi->use_svc && cpi->oxcf.pass == 0);
 }
 
@@ -1232,9 +1477,10 @@ static INLINE int log_tile_cols_from_picsize_level(uint32_t width,
 
 VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec);
 
-int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows,
-                    unsigned int cols, int delta_q[8], int delta_lf[8],
-                    int skip[8], int ref_frame[8]);
+vpx_codec_err_t vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map,
+                                unsigned int rows, unsigned int cols,
+                                int delta_q[8], int delta_lf[8], int skip[8],
+                                int ref_frame[8]);
 
 void vp9_new_framerate(VP9_COMP *cpi, double framerate);
 
@@ -1244,6 +1490,171 @@ int vp9_get_psnr(const VP9_COMP *cpi, PSNR_STATS *psnr);
 
 #define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl))
 
+static INLINE void alloc_frame_mvs(VP9_COMMON *const cm, int buffer_idx) {
+  RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx];
+  if (new_fb_ptr->mvs == NULL || new_fb_ptr->mi_rows < cm->mi_rows ||
+      new_fb_ptr->mi_cols < cm->mi_cols) {
+    vpx_free(new_fb_ptr->mvs);
+    CHECK_MEM_ERROR(&cm->error, new_fb_ptr->mvs,
+                    (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
+                                         sizeof(*new_fb_ptr->mvs)));
+    new_fb_ptr->mi_rows = cm->mi_rows;
+    new_fb_ptr->mi_cols = cm->mi_cols;
+  }
+}
+
+static INLINE int mv_cost(const MV *mv, const int *joint_cost,
+                          int *const comp_cost[2]) {
+  assert(mv->row >= -MV_MAX && mv->row < MV_MAX);
+  assert(mv->col >= -MV_MAX && mv->col < MV_MAX);
+  return joint_cost[vp9_get_mv_joint(mv)] + comp_cost[0][mv->row] +
+         comp_cost[1][mv->col];
+}
+
+static INLINE int mvsad_err_cost(const MACROBLOCK *x, const MV *mv,
+                                 const MV *ref, int sad_per_bit) {
+  MV diff;
+  diff.row = mv->row - ref->row;
+  diff.col = mv->col - ref->col;
+  return ROUND_POWER_OF_TWO(
+      (unsigned)mv_cost(&diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit,
+      VP9_PROB_COST_SHIFT);
+}
+
+static INLINE uint32_t get_start_mv_sad(const MACROBLOCK *x, const MV *mvp_full,
+                                        const MV *ref_mv_full,
+                                        vpx_sad_fn_t sad_fn_ptr, int sadpb) {
+  const int src_buf_stride = x->plane[0].src.stride;
+  const uint8_t *const src_buf = x->plane[0].src.buf;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const int pred_buf_stride = xd->plane[0].pre[0].stride;
+  const uint8_t *const pred_buf =
+      xd->plane[0].pre[0].buf + mvp_full->row * pred_buf_stride + mvp_full->col;
+  uint32_t start_mv_sad =
+      sad_fn_ptr(src_buf, src_buf_stride, pred_buf, pred_buf_stride);
+  start_mv_sad += mvsad_err_cost(x, mvp_full, ref_mv_full, sadpb);
+
+  return start_mv_sad;
+}
+
+static INLINE int num_4x4_to_edge(int plane_4x4_dim, int mb_to_edge_dim,
+                                  int subsampling_dim, int blk_dim) {
+  return plane_4x4_dim + (mb_to_edge_dim >> (5 + subsampling_dim)) - blk_dim;
+}
+
+// Compute the sum of squares on all visible 4x4s in the transform block.
+static int64_t sum_squares_visible(const MACROBLOCKD *xd,
+                                   const struct macroblockd_plane *const pd,
+                                   const int16_t *diff, const int diff_stride,
+                                   int blk_row, int blk_col,
+                                   const BLOCK_SIZE plane_bsize,
+                                   const BLOCK_SIZE tx_bsize,
+                                   int *visible_width, int *visible_height) {
+  int64_t sse;
+  const int plane_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+  const int plane_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+  const int tx_4x4_w = num_4x4_blocks_wide_lookup[tx_bsize];
+  const int tx_4x4_h = num_4x4_blocks_high_lookup[tx_bsize];
+  const int b4x4s_to_right_edge = num_4x4_to_edge(
+      plane_4x4_w, xd->mb_to_right_edge, pd->subsampling_x, blk_col);
+  const int b4x4s_to_bottom_edge = num_4x4_to_edge(
+      plane_4x4_h, xd->mb_to_bottom_edge, pd->subsampling_y, blk_row);
+  if (tx_bsize == BLOCK_4X4 ||
+      (b4x4s_to_right_edge >= tx_4x4_w && b4x4s_to_bottom_edge >= tx_4x4_h)) {
+    assert(tx_4x4_w == tx_4x4_h);
+    sse = (int64_t)vpx_sum_squares_2d_i16(diff, diff_stride, tx_4x4_w << 2);
+    *visible_width = tx_4x4_w << 2;
+    *visible_height = tx_4x4_h << 2;
+  } else {
+    int r, c;
+    const int max_r = VPXMIN(b4x4s_to_bottom_edge, tx_4x4_h);
+    const int max_c = VPXMIN(b4x4s_to_right_edge, tx_4x4_w);
+    sse = 0;
+    // if we are in the unrestricted motion border.
+    for (r = 0; r < max_r; ++r) {
+      // Skip visiting the sub blocks that are wholly within the UMV.
+      for (c = 0; c < max_c; ++c) {
+        sse += (int64_t)vpx_sum_squares_2d_i16(
+            diff + r * diff_stride * 4 + c * 4, diff_stride, 4);
+      }
+    }
+    *visible_width = max_c << 2;
+    *visible_height = max_r << 2;
+  }
+  return sse;
+}
+
+// Check if trellis coefficient optimization of the transform block is enabled.
+static INLINE int do_trellis_opt(const struct macroblockd_plane *pd,
+                                 const int16_t *src_diff, int diff_stride,
+                                 int blk_row, int blk_col,
+                                 BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                                 void *arg) {
+  const struct encode_b_args *const args = (struct encode_b_args *)arg;
+  const MACROBLOCK *const x = args->x;
+
+  switch (args->enable_trellis_opt) {
+    case DISABLE_TRELLIS_OPT: return 0;
+    case ENABLE_TRELLIS_OPT: return 1;
+    case ENABLE_TRELLIS_OPT_TX_RD_SRC_VAR: {
+      vpx_clear_system_state();
+
+      return (args->trellis_opt_thresh > 0.0)
+                 ? (x->log_block_src_var <= args->trellis_opt_thresh)
+                 : 1;
+    }
+    case ENABLE_TRELLIS_OPT_TX_RD_RESIDUAL_MSE: {
+      const MACROBLOCKD *const xd = &x->e_mbd;
+      const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+#if CONFIG_VP9_HIGHBITDEPTH
+      const int dequant_shift =
+          (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+#else
+      const int dequant_shift = 3;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      const int qstep = pd->dequant[1] >> dequant_shift;
+      int *sse_calc_done = args->sse_calc_done;
+      int64_t *sse = args->sse;
+      int visible_width = 0, visible_height = 0;
+
+      // TODO: Enable the sf for high bit-depth case
+      if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) || !sse ||
+          !sse_calc_done)
+        return 1;
+
+      *sse = sum_squares_visible(xd, pd, src_diff, diff_stride, blk_row,
+                                 blk_col, plane_bsize, tx_bsize, &visible_width,
+                                 &visible_height);
+      *sse_calc_done = 1;
+
+      vpx_clear_system_state();
+
+      return (*(sse) <= (int64_t)visible_width * visible_height * qstep *
+                            qstep * args->trellis_opt_thresh);
+    }
+    default: assert(0 && "Invalid trellis optimization method."); return 1;
+  }
+}
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+static INLINE void start_timing(VP9_COMP *cpi, int component) {
+  vpx_usec_timer_start(&cpi->component_timer[component]);
+}
+static INLINE void end_timing(VP9_COMP *cpi, int component) {
+  vpx_usec_timer_mark(&cpi->component_timer[component]);
+  cpi->frame_component_time[component] +=
+      vpx_usec_timer_elapsed(&cpi->component_timer[component]);
+}
+static INLINE char const *get_frame_type_enum(int type) {
+  switch (type) {
+    case 0: return "KEY_FRAME";
+    case 1: return "INTER_FRAME";
+    default: assert(0);
+  }
+  return "error";
+}
+#endif
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/encoder/vp9_ethread.c b/vp9/encoder/vp9_ethread.c
index e7f8a537d4..a8d1cb7a7a 100644
--- a/vp9/encoder/vp9_ethread.c
+++ b/vp9/encoder/vp9_ethread.c
@@ -8,6 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "vp9/common/vp9_thread_common.h"
+#include "vp9/encoder/vp9_bitstream.h"
 #include "vp9/encoder/vp9_encodeframe.h"
 #include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_ethread.h"
@@ -79,60 +81,59 @@ static void create_enc_workers(VP9_COMP *cpi, int num_workers) {
   VP9_COMMON *const cm = &cpi->common;
   const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
   int i;
+  // While using SVC, we need to allocate threads according to the highest
+  // resolution. When row based multithreading is enabled, it is OK to
+  // allocate more threads than the number of max tile columns.
+  if (cpi->use_svc && !cpi->row_mt) {
+    int max_tile_cols = get_max_tile_cols(cpi);
+    num_workers = VPXMIN(cpi->oxcf.max_threads, max_tile_cols);
+  }
+  assert(num_workers > 0);
+  if (num_workers == cpi->num_workers) return;
+  vp9_loop_filter_dealloc(&cpi->lf_row_sync);
+  vp9_bitstream_encode_tiles_buffer_dealloc(cpi);
+  vp9_encode_free_mt_data(cpi);
 
-  // Only run once to create threads and allocate thread data.
-  if (cpi->num_workers == 0) {
-    int allocated_workers = num_workers;
-
-    // While using SVC, we need to allocate threads according to the highest
-    // resolution. When row based multithreading is enabled, it is OK to
-    // allocate more threads than the number of max tile columns.
-    if (cpi->use_svc && !cpi->row_mt) {
-      int max_tile_cols = get_max_tile_cols(cpi);
-      allocated_workers = VPXMIN(cpi->oxcf.max_threads, max_tile_cols);
-    }
-
-    CHECK_MEM_ERROR(cm, cpi->workers,
-                    vpx_malloc(allocated_workers * sizeof(*cpi->workers)));
+  CHECK_MEM_ERROR(&cm->error, cpi->workers,
+                  vpx_malloc(num_workers * sizeof(*cpi->workers)));
 
-    CHECK_MEM_ERROR(cm, cpi->tile_thr_data,
-                    vpx_calloc(allocated_workers, sizeof(*cpi->tile_thr_data)));
+  CHECK_MEM_ERROR(&cm->error, cpi->tile_thr_data,
+                  vpx_calloc(num_workers, sizeof(*cpi->tile_thr_data)));
 
-    for (i = 0; i < allocated_workers; i++) {
-      VPxWorker *const worker = &cpi->workers[i];
-      EncWorkerData *thread_data = &cpi->tile_thr_data[i];
+  for (i = 0; i < num_workers; i++) {
+    VPxWorker *const worker = &cpi->workers[i];
+    EncWorkerData *thread_data = &cpi->tile_thr_data[i];
 
-      ++cpi->num_workers;
-      winterface->init(worker);
+    ++cpi->num_workers;
+    winterface->init(worker);
 
-      if (i < allocated_workers - 1) {
-        thread_data->cpi = cpi;
+    if (i < num_workers - 1) {
+      thread_data->cpi = cpi;
 
-        // Allocate thread data.
-        CHECK_MEM_ERROR(cm, thread_data->td,
-                        vpx_memalign(32, sizeof(*thread_data->td)));
-        vp9_zero(*thread_data->td);
+      // Allocate thread data.
+      CHECK_MEM_ERROR(&cm->error, thread_data->td,
+                      vpx_memalign(32, sizeof(*thread_data->td)));
+      vp9_zero(*thread_data->td);
 
-        // Set up pc_tree.
-        thread_data->td->leaf_tree = NULL;
-        thread_data->td->pc_tree = NULL;
-        vp9_setup_pc_tree(cm, thread_data->td);
+      // Set up pc_tree.
+      thread_data->td->leaf_tree = NULL;
+      thread_data->td->pc_tree = NULL;
+      vp9_setup_pc_tree(cm, thread_data->td);
 
-        // Allocate frame counters in thread data.
-        CHECK_MEM_ERROR(cm, thread_data->td->counts,
-                        vpx_calloc(1, sizeof(*thread_data->td->counts)));
+      // Allocate frame counters in thread data.
+      CHECK_MEM_ERROR(&cm->error, thread_data->td->counts,
+                      vpx_calloc(1, sizeof(*thread_data->td->counts)));
 
-        // Create threads
-        if (!winterface->reset(worker))
-          vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
-                             "Tile encoder thread creation failed");
-      } else {
-        // Main thread acts as a worker and uses the thread data in cpi.
-        thread_data->cpi = cpi;
-        thread_data->td = &cpi->td;
-      }
-      winterface->sync(worker);
+      // Create threads
+      if (!winterface->reset(worker))
+        vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                           "Tile encoder thread creation failed");
+    } else {
+      // Main thread acts as a worker and uses the thread data in cpi.
+      thread_data->cpi = cpi;
+      thread_data->td = &cpi->td;
     }
+    winterface->sync(worker);
   }
 }
 
@@ -169,6 +170,29 @@ static void launch_enc_workers(VP9_COMP *cpi, VPxWorkerHook hook, void *data2,
   }
 }
 
+void vp9_encode_free_mt_data(struct VP9_COMP *cpi) {
+  int t;
+  for (t = 0; t < cpi->num_workers; ++t) {
+    VPxWorker *const worker = &cpi->workers[t];
+    EncWorkerData *const thread_data = &cpi->tile_thr_data[t];
+
+    // Deallocate allocated threads.
+    vpx_get_worker_interface()->end(worker);
+
+    // Deallocate allocated thread data.
+    if (t < cpi->num_workers - 1) {
+      vpx_free(thread_data->td->counts);
+      vp9_free_pc_tree(thread_data->td);
+      vpx_free(thread_data->td);
+    }
+  }
+  vpx_free(cpi->tile_thr_data);
+  cpi->tile_thr_data = NULL;
+  vpx_free(cpi->workers);
+  cpi->workers = NULL;
+  cpi->num_workers = 0;
+}
+
 void vp9_encode_tiles_mt(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   const int tile_cols = 1 << cm->log2_tile_cols;
@@ -243,6 +267,7 @@ static void accumulate_fp_tile_stat(TileDataEnc *tile_data,
   tile_data->fp_data.intra_count_high += tile_data_t->fp_data.intra_count_high;
   tile_data->fp_data.intra_skip_count += tile_data_t->fp_data.intra_skip_count;
   tile_data->fp_data.mvcount += tile_data_t->fp_data.mvcount;
+  tile_data->fp_data.new_mv_count += tile_data_t->fp_data.new_mv_count;
   tile_data->fp_data.sum_mvr += tile_data_t->fp_data.sum_mvr;
   tile_data->fp_data.sum_mvr_abs += tile_data_t->fp_data.sum_mvr_abs;
   tile_data->fp_data.sum_mvc += tile_data_t->fp_data.sum_mvc;
@@ -270,7 +295,7 @@ void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, VP9_COMMON *cm,
   {
     int i;
 
-    CHECK_MEM_ERROR(cm, row_mt_sync->mutex,
+    CHECK_MEM_ERROR(&cm->error, row_mt_sync->mutex,
                     vpx_malloc(sizeof(*row_mt_sync->mutex) * rows));
     if (row_mt_sync->mutex) {
       for (i = 0; i < rows; ++i) {
@@ -278,7 +303,7 @@ void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, VP9_COMMON *cm,
       }
     }
 
-    CHECK_MEM_ERROR(cm, row_mt_sync->cond,
+    CHECK_MEM_ERROR(&cm->error, row_mt_sync->cond,
                     vpx_malloc(sizeof(*row_mt_sync->cond) * rows));
     if (row_mt_sync->cond) {
       for (i = 0; i < rows; ++i) {
@@ -288,7 +313,7 @@ void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, VP9_COMMON *cm,
   }
 #endif  // CONFIG_MULTITHREAD
 
-  CHECK_MEM_ERROR(cm, row_mt_sync->cur_col,
+  CHECK_MEM_ERROR(&cm->error, row_mt_sync->cur_col,
                   vpx_malloc(sizeof(*row_mt_sync->cur_col) * rows));
 
   // Set up nsync.
diff --git a/vp9/encoder/vp9_ethread.h b/vp9/encoder/vp9_ethread.h
index cda0293bcf..4c192da515 100644
--- a/vp9/encoder/vp9_ethread.h
+++ b/vp9/encoder/vp9_ethread.h
@@ -42,6 +42,11 @@ typedef struct VP9RowMTSyncData {
   int rows;
 } VP9RowMTSync;
 
+// Frees EncWorkerData related allocations made by vp9_encode_*_mt().
+// row_mt specific data is freed with vp9_row_mt_mem_dealloc() and is not
+// called by this function.
+void vp9_encode_free_mt_data(struct VP9_COMP *cpi);
+
 void vp9_encode_tiles_mt(struct VP9_COMP *cpi);
 
 void vp9_encode_tiles_row_mt(struct VP9_COMP *cpi);
diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c
new file mode 100644
index 0000000000..4664e8c5e2
--- /dev/null
+++ b/vp9/encoder/vp9_ext_ratectrl.c
@@ -0,0 +1,281 @@
+/*
+ *  Copyright (c) 2020 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stddef.h>
+
+#include "vp9/encoder/vp9_ext_ratectrl.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx_dsp/psnr.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_ext_ratectrl.h"
+#include "vpx/vpx_tpl.h"
+
+vpx_codec_err_t vp9_extrc_init(EXT_RATECTRL *ext_ratectrl) {
+  if (ext_ratectrl == NULL) {
+    return VPX_CODEC_INVALID_PARAM;
+  }
+  vp9_zero(*ext_ratectrl);
+  return VPX_CODEC_OK;
+}
+
+vpx_codec_err_t vp9_extrc_create(vpx_rc_funcs_t funcs,
+                                 vpx_rc_config_t ratectrl_config,
+                                 EXT_RATECTRL *ext_ratectrl) {
+  vpx_rc_status_t rc_status;
+  vpx_rc_firstpass_stats_t *rc_firstpass_stats;
+  if (ext_ratectrl == NULL) {
+    return VPX_CODEC_INVALID_PARAM;
+  }
+  vp9_extrc_delete(ext_ratectrl);
+  ext_ratectrl->funcs = funcs;
+  ext_ratectrl->ratectrl_config = ratectrl_config;
+  rc_status = ext_ratectrl->funcs.create_model(ext_ratectrl->funcs.priv,
+                                               &ext_ratectrl->ratectrl_config,
+                                               &ext_ratectrl->model);
+  if (rc_status == VPX_RC_ERROR) {
+    return VPX_CODEC_ERROR;
+  }
+  rc_firstpass_stats = &ext_ratectrl->rc_firstpass_stats;
+  rc_firstpass_stats->num_frames = ratectrl_config.show_frame_count;
+  rc_firstpass_stats->frame_stats =
+      vpx_malloc(sizeof(*rc_firstpass_stats->frame_stats) *
+                 rc_firstpass_stats->num_frames);
+  if (rc_firstpass_stats->frame_stats == NULL) {
+    return VPX_CODEC_MEM_ERROR;
+  }
+  ext_ratectrl->ready = 1;
+  return VPX_CODEC_OK;
+}
+
+vpx_codec_err_t vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl) {
+  if (ext_ratectrl == NULL) {
+    return VPX_CODEC_INVALID_PARAM;
+  }
+  if (ext_ratectrl->ready) {
+    vpx_rc_status_t rc_status =
+        ext_ratectrl->funcs.delete_model(ext_ratectrl->model);
+    if (rc_status == VPX_RC_ERROR) {
+      return VPX_CODEC_ERROR;
+    }
+    vpx_free(ext_ratectrl->rc_firstpass_stats.frame_stats);
+  }
+  return vp9_extrc_init(ext_ratectrl);
+}
+
+static void gen_rc_firstpass_stats(const FIRSTPASS_STATS *stats,
+                                   vpx_rc_frame_stats_t *rc_frame_stats) {
+  rc_frame_stats->frame = stats->frame;
+  rc_frame_stats->weight = stats->weight;
+  rc_frame_stats->intra_error = stats->intra_error;
+  rc_frame_stats->coded_error = stats->coded_error;
+  rc_frame_stats->sr_coded_error = stats->sr_coded_error;
+  rc_frame_stats->frame_noise_energy = stats->frame_noise_energy;
+  rc_frame_stats->pcnt_inter = stats->pcnt_inter;
+  rc_frame_stats->pcnt_motion = stats->pcnt_motion;
+  rc_frame_stats->pcnt_second_ref = stats->pcnt_second_ref;
+  rc_frame_stats->pcnt_neutral = stats->pcnt_neutral;
+  rc_frame_stats->pcnt_intra_low = stats->pcnt_intra_low;
+  rc_frame_stats->pcnt_intra_high = stats->pcnt_intra_high;
+  rc_frame_stats->intra_skip_pct = stats->intra_skip_pct;
+  rc_frame_stats->intra_smooth_pct = stats->intra_smooth_pct;
+  rc_frame_stats->inactive_zone_rows = stats->inactive_zone_rows;
+  rc_frame_stats->inactive_zone_cols = stats->inactive_zone_cols;
+  rc_frame_stats->MVr = stats->MVr;
+  rc_frame_stats->mvr_abs = stats->mvr_abs;
+  rc_frame_stats->MVc = stats->MVc;
+  rc_frame_stats->mvc_abs = stats->mvc_abs;
+  rc_frame_stats->MVrv = stats->MVrv;
+  rc_frame_stats->MVcv = stats->MVcv;
+  rc_frame_stats->mv_in_out_count = stats->mv_in_out_count;
+  rc_frame_stats->duration = stats->duration;
+  rc_frame_stats->count = stats->count;
+  rc_frame_stats->new_mv_count = stats->new_mv_count;
+}
+
+vpx_codec_err_t vp9_extrc_send_firstpass_stats(
+    EXT_RATECTRL *ext_ratectrl, const FIRST_PASS_INFO *first_pass_info) {
+  if (ext_ratectrl == NULL) {
+    return VPX_CODEC_INVALID_PARAM;
+  }
+  if (ext_ratectrl->ready) {
+    vpx_rc_status_t rc_status;
+    vpx_rc_firstpass_stats_t *rc_firstpass_stats =
+        &ext_ratectrl->rc_firstpass_stats;
+    int i;
+    assert(rc_firstpass_stats->num_frames == first_pass_info->num_frames);
+    for (i = 0; i < rc_firstpass_stats->num_frames; ++i) {
+      gen_rc_firstpass_stats(&first_pass_info->stats[i],
+                             &rc_firstpass_stats->frame_stats[i]);
+    }
+    rc_status = ext_ratectrl->funcs.send_firstpass_stats(ext_ratectrl->model,
+                                                         rc_firstpass_stats);
+    if (rc_status == VPX_RC_ERROR) {
+      return VPX_CODEC_ERROR;
+    }
+  }
+  return VPX_CODEC_OK;
+}
+
+vpx_codec_err_t vp9_extrc_send_tpl_stats(EXT_RATECTRL *ext_ratectrl,
+                                         const VpxTplGopStats *tpl_gop_stats) {
+  if (ext_ratectrl == NULL) {
+    return VPX_CODEC_INVALID_PARAM;
+  }
+  if (ext_ratectrl->ready && ext_ratectrl->funcs.send_tpl_gop_stats != NULL) {
+    vpx_rc_status_t rc_status = ext_ratectrl->funcs.send_tpl_gop_stats(
+        ext_ratectrl->model, tpl_gop_stats);
+    if (rc_status == VPX_RC_ERROR) {
+      return VPX_CODEC_ERROR;
+    }
+  }
+  return VPX_CODEC_OK;
+}
+
+static int extrc_get_frame_type(FRAME_UPDATE_TYPE update_type) {
+  // TODO(angiebird): Add unit test to make sure this function behaves like
+  // get_frame_type_from_update_type()
+  // TODO(angiebird): Merge this function with get_frame_type_from_update_type()
+  switch (update_type) {
+    case KF_UPDATE: return 0;       // kFrameTypeKey;
+    case ARF_UPDATE: return 2;      // kFrameTypeAltRef;
+    case GF_UPDATE: return 4;       // kFrameTypeGolden;
+    case OVERLAY_UPDATE: return 3;  // kFrameTypeOverlay;
+    case LF_UPDATE: return 1;       // kFrameTypeInter;
+    default:
+      fprintf(stderr, "Unsupported update_type %d\n", update_type);
+      abort();
+  }
+}
+
+vpx_codec_err_t vp9_extrc_get_encodeframe_decision(
+    EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index,
+    FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref,
+    RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags,
+    vpx_rc_encodeframe_decision_t *encode_frame_decision) {
+  if (ext_ratectrl == NULL) {
+    return VPX_CODEC_INVALID_PARAM;
+  }
+  if (ext_ratectrl->ready && (ext_ratectrl->funcs.rc_type & VPX_RC_QP) != 0) {
+    vpx_rc_status_t rc_status;
+    vpx_rc_encodeframe_info_t encode_frame_info;
+    encode_frame_info.show_index = show_index;
+    encode_frame_info.coding_index = coding_index;
+    encode_frame_info.gop_index = gop_index;
+    encode_frame_info.frame_type = extrc_get_frame_type(update_type);
+    encode_frame_info.gop_size = gop_size;
+    encode_frame_info.use_alt_ref = use_alt_ref;
+
+    vp9_get_ref_frame_info(update_type, ref_frame_flags, ref_frame_bufs,
+                           encode_frame_info.ref_frame_coding_indexes,
+                           encode_frame_info.ref_frame_valid_list);
+
+    rc_status = ext_ratectrl->funcs.get_encodeframe_decision(
+        ext_ratectrl->model, &encode_frame_info, encode_frame_decision);
+    if (rc_status == VPX_RC_ERROR) {
+      return VPX_CODEC_ERROR;
+    }
+  }
+  return VPX_CODEC_OK;
+}
+
+vpx_codec_err_t vp9_extrc_update_encodeframe_result(
+    EXT_RATECTRL *ext_ratectrl, int64_t bit_count,
+    const YV12_BUFFER_CONFIG *source_frame,
+    const YV12_BUFFER_CONFIG *coded_frame, uint32_t bit_depth,
+    uint32_t input_bit_depth, const int actual_encoding_qindex) {
+  if (ext_ratectrl == NULL) {
+    return VPX_CODEC_INVALID_PARAM;
+  }
+  if (ext_ratectrl->ready) {
+    PSNR_STATS psnr;
+    vpx_rc_status_t rc_status;
+    vpx_rc_encodeframe_result_t encode_frame_result;
+    encode_frame_result.bit_count = bit_count;
+    encode_frame_result.pixel_count =
+        source_frame->y_crop_width * source_frame->y_crop_height +
+        2 * source_frame->uv_crop_width * source_frame->uv_crop_height;
+    encode_frame_result.actual_encoding_qindex = actual_encoding_qindex;
+#if CONFIG_VP9_HIGHBITDEPTH
+    vpx_calc_highbd_psnr(source_frame, coded_frame, &psnr, bit_depth,
+                         input_bit_depth);
+#else
+    (void)bit_depth;
+    (void)input_bit_depth;
+    vpx_calc_psnr(source_frame, coded_frame, &psnr);
+#endif
+    encode_frame_result.sse = psnr.sse[0];
+    rc_status = ext_ratectrl->funcs.update_encodeframe_result(
+        ext_ratectrl->model, &encode_frame_result);
+    if (rc_status == VPX_RC_ERROR) {
+      return VPX_CODEC_ERROR;
+    }
+  }
+  return VPX_CODEC_OK;
+}
+
+vpx_codec_err_t vp9_extrc_get_gop_decision(
+    EXT_RATECTRL *ext_ratectrl, const vpx_rc_gop_info_t *const gop_info,
+    vpx_rc_gop_decision_t *gop_decision) {
+  vpx_rc_status_t rc_status;
+  if (ext_ratectrl == NULL || !ext_ratectrl->ready ||
+      (ext_ratectrl->funcs.rc_type & VPX_RC_GOP) == 0) {
+    return VPX_CODEC_INVALID_PARAM;
+  }
+  rc_status = ext_ratectrl->funcs.get_gop_decision(ext_ratectrl->model,
+                                                   gop_info, gop_decision);
+  if (gop_decision->use_alt_ref) {
+    const int arf_constraint =
+        gop_decision->gop_coding_frames >= gop_info->min_gf_interval &&
+        gop_decision->gop_coding_frames < gop_info->lag_in_frames;
+    if (!arf_constraint || !gop_info->allow_alt_ref) return VPX_CODEC_ERROR;
+  }
+  // TODO(chengchen): Take min and max gf interval from the model
+  // and overwrite libvpx's decision so that we can get rid
+  // of one of the checks here.
+  if (gop_decision->gop_coding_frames > gop_info->frames_to_key ||
+      gop_decision->gop_coding_frames - gop_decision->use_alt_ref >
+          gop_info->max_gf_interval) {
+    return VPX_CODEC_ERROR;
+  }
+  if (rc_status == VPX_RC_ERROR) {
+    return VPX_CODEC_ERROR;
+  }
+  return VPX_CODEC_OK;
+}
+
+vpx_codec_err_t vp9_extrc_get_frame_rdmult(
+    EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index,
+    FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref,
+    RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags,
+    int *rdmult) {
+  vpx_rc_status_t rc_status;
+  vpx_rc_encodeframe_info_t encode_frame_info;
+  if (ext_ratectrl == NULL || !ext_ratectrl->ready ||
+      (ext_ratectrl->funcs.rc_type & VPX_RC_RDMULT) == 0) {
+    return VPX_CODEC_INVALID_PARAM;
+  }
+  encode_frame_info.show_index = show_index;
+  encode_frame_info.coding_index = coding_index;
+  encode_frame_info.gop_index = gop_index;
+  encode_frame_info.frame_type = extrc_get_frame_type(update_type);
+  encode_frame_info.gop_size = gop_size;
+  encode_frame_info.use_alt_ref = use_alt_ref;
+
+  vp9_get_ref_frame_info(update_type, ref_frame_flags, ref_frame_bufs,
+                         encode_frame_info.ref_frame_coding_indexes,
+                         encode_frame_info.ref_frame_valid_list);
+  rc_status = ext_ratectrl->funcs.get_frame_rdmult(ext_ratectrl->model,
+                                                   &encode_frame_info, rdmult);
+  if (rc_status == VPX_RC_ERROR) {
+    return VPX_CODEC_ERROR;
+  }
+  return VPX_CODEC_OK;
+}
diff --git a/vp9/encoder/vp9_ext_ratectrl.h b/vp9/encoder/vp9_ext_ratectrl.h
new file mode 100644
index 0000000000..b04580c1d4
--- /dev/null
+++ b/vp9/encoder/vp9_ext_ratectrl.h
@@ -0,0 +1,63 @@
+/*
+ *  Copyright (c) 2020 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_
+#define VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_
+
+#include "vpx/vpx_ext_ratectrl.h"
+#include "vpx/vpx_tpl.h"
+#include "vp9/encoder/vp9_firstpass.h"
+
+typedef struct EXT_RATECTRL {
+  int ready;
+  int ext_rdmult;
+  vpx_rc_model_t model;
+  vpx_rc_funcs_t funcs;
+  vpx_rc_config_t ratectrl_config;
+  vpx_rc_firstpass_stats_t rc_firstpass_stats;
+} EXT_RATECTRL;
+
+vpx_codec_err_t vp9_extrc_init(EXT_RATECTRL *ext_ratectrl);
+
+vpx_codec_err_t vp9_extrc_create(vpx_rc_funcs_t funcs,
+                                 vpx_rc_config_t ratectrl_config,
+                                 EXT_RATECTRL *ext_ratectrl);
+
+vpx_codec_err_t vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl);
+
+vpx_codec_err_t vp9_extrc_send_firstpass_stats(
+    EXT_RATECTRL *ext_ratectrl, const FIRST_PASS_INFO *first_pass_info);
+
+vpx_codec_err_t vp9_extrc_send_tpl_stats(EXT_RATECTRL *ext_ratectrl,
+                                         const VpxTplGopStats *tpl_gop_stats);
+
+vpx_codec_err_t vp9_extrc_get_encodeframe_decision(
+    EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index,
+    FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref,
+    RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags,
+    vpx_rc_encodeframe_decision_t *encode_frame_decision);
+
+vpx_codec_err_t vp9_extrc_update_encodeframe_result(
+    EXT_RATECTRL *ext_ratectrl, int64_t bit_count,
+    const YV12_BUFFER_CONFIG *source_frame,
+    const YV12_BUFFER_CONFIG *coded_frame, uint32_t bit_depth,
+    uint32_t input_bit_depth, const int actual_encoding_qindex);
+
+vpx_codec_err_t vp9_extrc_get_gop_decision(
+    EXT_RATECTRL *ext_ratectrl, const vpx_rc_gop_info_t *const gop_info,
+    vpx_rc_gop_decision_t *gop_decision);
+
+vpx_codec_err_t vp9_extrc_get_frame_rdmult(
+    EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index,
+    FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref,
+    RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags,
+    int *rdmult);
+
+#endif  // VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 45b003e1e8..a9cdf5353f 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -54,6 +54,30 @@
 #define NCOUNT_INTRA_THRESH 8192
 #define NCOUNT_INTRA_FACTOR 3
 
+#define INTRA_PART 0.005
+#define DEFAULT_DECAY_LIMIT 0.75
+#define LOW_SR_DIFF_TRHESH 0.1
+#define LOW_CODED_ERR_PER_MB 10.0
+#define NCOUNT_FRAME_II_THRESH 6.0
+#define BASELINE_ERR_PER_MB 12500.0
+#define GF_MAX_FRAME_BOOST 96.0
+
+#ifdef AGGRESSIVE_VBR
+#define KF_MIN_FRAME_BOOST 40.0
+#define KF_MAX_FRAME_BOOST 80.0
+#define MAX_KF_TOT_BOOST 4800
+#else
+#define KF_MIN_FRAME_BOOST 40.0
+#define KF_MAX_FRAME_BOOST 96.0
+#define MAX_KF_TOT_BOOST 5400
+#endif
+
+#define DEFAULT_ZM_FACTOR 0.5
+#define MINQ_ADJ_LIMIT 48
+#define MINQ_ADJ_LIMIT_CQ 20
+#define HIGH_UNDERSHOOT_RATIO 2
+#define AV_WQ_FACTOR 4.0
+
 #define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x)-0.000001 : (x) + 0.000001)
 
 #if ARF_STATS_OUTPUT
@@ -111,17 +135,6 @@ static void output_stats(FIRSTPASS_STATS *stats) {
 #endif
 }
 
-#if CONFIG_FP_MB_STATS
-static void output_fpmb_stats(uint8_t *this_frame_mb_stats, VP9_COMMON *cm,
-                              struct vpx_codec_pkt_list *pktlist) {
-  struct vpx_codec_cx_pkt pkt;
-  pkt.kind = VPX_CODEC_FPMB_STATS_PKT;
-  pkt.data.firstpass_mb_stats.buf = this_frame_mb_stats;
-  pkt.data.firstpass_mb_stats.sz = cm->initial_mbs * sizeof(uint8_t);
-  vpx_codec_pkt_list_add(pktlist, &pkt);
-}
-#endif
-
 static void zero_stats(FIRSTPASS_STATS *section) {
   section->frame = 0.0;
   section->weight = 0.0;
@@ -139,6 +152,7 @@ static void zero_stats(FIRSTPASS_STATS *section) {
   section->pcnt_intra_high = 0.0;
   section->inactive_zone_rows = 0.0;
   section->inactive_zone_cols = 0.0;
+  section->new_mv_count = 0.0;
   section->MVr = 0.0;
   section->mvr_abs = 0.0;
   section->MVc = 0.0;
@@ -170,6 +184,7 @@ static void accumulate_stats(FIRSTPASS_STATS *section,
   section->pcnt_intra_high += frame->pcnt_intra_high;
   section->inactive_zone_rows += frame->inactive_zone_rows;
   section->inactive_zone_cols += frame->inactive_zone_cols;
+  section->new_mv_count += frame->new_mv_count;
   section->MVr += frame->MVr;
   section->mvr_abs += frame->mvr_abs;
   section->MVc += frame->MVc;
@@ -199,6 +214,7 @@ static void subtract_stats(FIRSTPASS_STATS *section,
   section->pcnt_intra_high -= frame->pcnt_intra_high;
   section->inactive_zone_rows -= frame->inactive_zone_rows;
   section->inactive_zone_cols -= frame->inactive_zone_cols;
+  section->new_mv_count -= frame->new_mv_count;
   section->MVr -= frame->MVr;
   section->mvr_abs -= frame->mvr_abs;
   section->MVc -= frame->MVc;
@@ -348,7 +364,6 @@ static vpx_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
         case BLOCK_8X16: return vpx_highbd_8_mse8x16;
         default: return vpx_highbd_8_mse16x16;
       }
-      break;
     case 10:
       switch (bsize) {
         case BLOCK_8X8: return vpx_highbd_10_mse8x8;
@@ -356,7 +371,6 @@ static vpx_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
         case BLOCK_8X16: return vpx_highbd_10_mse8x16;
         default: return vpx_highbd_10_mse16x16;
       }
-      break;
     case 12:
       switch (bsize) {
         case BLOCK_8X8: return vpx_highbd_12_mse8x8;
@@ -364,7 +378,6 @@ static vpx_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
         case BLOCK_8X16: return vpx_highbd_12_mse8x16;
         default: return vpx_highbd_12_mse16x16;
       }
-      break;
   }
 }
 
@@ -422,6 +435,9 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
   vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize];
   const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY;
+  MV center_mv_full = ref_mv_full;
+  unsigned int start_mv_sad;
+  vp9_sad_fn_ptr_t sad_fn_ptr;
 
   int step_param = 3;
   int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
@@ -442,10 +458,18 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
+  // Calculate SAD of the start mv
+  clamp_mv(&ref_mv_full, x->mv_limits.col_min, x->mv_limits.col_max,
+           x->mv_limits.row_min, x->mv_limits.row_max);
+  start_mv_sad = get_start_mv_sad(x, &ref_mv_full, &center_mv_full,
+                                  cpi->fn_ptr[bsize].sdf, x->sadperbit16);
+  sad_fn_ptr.sdf = cpi->fn_ptr[bsize].sdf;
+  sad_fn_ptr.sdx4df = cpi->fn_ptr[bsize].sdx4df;
+
   // Center the initial step/diamond search on best mv.
-  tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv,
-                                    step_param, x->sadperbit16, &num00,
-                                    &v_fn_ptr, ref_mv);
+  tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, start_mv_sad,
+                                    &tmp_mv, step_param, x->sadperbit16, &num00,
+                                    &sad_fn_ptr, ref_mv);
   if (tmp_err < INT_MAX)
     tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1);
   if (tmp_err < INT_MAX - new_mv_mode_penalty) tmp_err += new_mv_mode_penalty;
@@ -465,9 +489,9 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
     if (num00) {
       --num00;
     } else {
-      tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv,
-                                        step_param + n, x->sadperbit16, &num00,
-                                        &v_fn_ptr, ref_mv);
+      tmp_err = cpi->diamond_search_sad(
+          x, &cpi->ss_cfg, &ref_mv_full, start_mv_sad, &tmp_mv, step_param + n,
+          x->sadperbit16, &num00, &sad_fn_ptr, ref_mv);
       if (tmp_err < INT_MAX)
         tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1);
       if (tmp_err < INT_MAX - new_mv_mode_penalty)
@@ -582,11 +606,11 @@ static int get_smooth_intra_threshold(VP9_COMMON *cm) {
 #define FP_MAX_DN_THRESH 24
 #define KERNEL_SIZE 3
 
-// Baseline Kernal weights for first pass noise metric
-static uint8_t fp_dn_kernal_3[KERNEL_SIZE * KERNEL_SIZE] = { 1, 2, 1, 2, 4,
+// Baseline Kernel weights for first pass noise metric
+static uint8_t fp_dn_kernel_3[KERNEL_SIZE * KERNEL_SIZE] = { 1, 2, 1, 2, 4,
                                                              2, 1, 2, 1 };
 
-// Estimate noise at a single point based on the impace of a spatial kernal
+// Estimate noise at a single point based on the impact of a spatial kernel
 // on the point value
 static int fp_estimate_point_noise(uint8_t *src_ptr, const int stride) {
   int sum_weight = 0;
@@ -596,23 +620,23 @@ static int fp_estimate_point_noise(uint8_t *src_ptr, const int stride) {
   int diff;
   int dn_diff;
   uint8_t *tmp_ptr;
-  uint8_t *kernal_ptr;
+  uint8_t *kernel_ptr;
   uint8_t dn_val;
   uint8_t centre_val = *src_ptr;
 
-  kernal_ptr = fp_dn_kernal_3;
+  kernel_ptr = fp_dn_kernel_3;
 
-  // Apply the kernal
+  // Apply the kernel
   tmp_ptr = src_ptr - stride - 1;
   for (i = 0; i < KERNEL_SIZE; ++i) {
     for (j = 0; j < KERNEL_SIZE; ++j) {
       diff = abs((int)centre_val - (int)tmp_ptr[j]);
       max_diff = VPXMAX(max_diff, diff);
       if (diff <= FP_DN_THRESH) {
-        sum_weight += *kernal_ptr;
-        sum_val += (int)tmp_ptr[j] * (int)*kernal_ptr;
+        sum_weight += *kernel_ptr;
+        sum_val += (int)tmp_ptr[j] * (int)*kernel_ptr;
       }
-      ++kernal_ptr;
+      ++kernel_ptr;
     }
     tmp_ptr += stride;
   }
@@ -638,13 +662,13 @@ static int fp_highbd_estimate_point_noise(uint8_t *src_ptr, const int stride) {
   int dn_diff;
   uint8_t *tmp_ptr;
   uint16_t *tmp_ptr16;
-  uint8_t *kernal_ptr;
+  uint8_t *kernel_ptr;
   uint16_t dn_val;
   uint16_t centre_val = *CONVERT_TO_SHORTPTR(src_ptr);
 
-  kernal_ptr = fp_dn_kernal_3;
+  kernel_ptr = fp_dn_kernel_3;
 
-  // Apply the kernal
+  // Apply the kernel
   tmp_ptr = src_ptr - stride - 1;
   for (i = 0; i < KERNEL_SIZE; ++i) {
     tmp_ptr16 = CONVERT_TO_SHORTPTR(tmp_ptr);
@@ -652,10 +676,10 @@ static int fp_highbd_estimate_point_noise(uint8_t *src_ptr, const int stride) {
       diff = abs((int)centre_val - (int)tmp_ptr16[j]);
       max_diff = VPXMAX(max_diff, diff);
       if (diff <= FP_DN_THRESH) {
-        sum_weight += *kernal_ptr;
-        sum_val += (int)tmp_ptr16[j] * (int)*kernal_ptr;
+        sum_weight += *kernel_ptr;
+        sum_val += (int)tmp_ptr16[j] * (int)*kernel_ptr;
       }
-      ++kernal_ptr;
+      ++kernel_ptr;
     }
     tmp_ptr += stride;
   }
@@ -780,6 +804,7 @@ static void first_pass_stat_calc(VP9_COMP *cpi, FIRSTPASS_STATS *fps,
   fps->inactive_zone_cols = (double)0;
 
   if (fp_acc_data->mvcount > 0) {
+    fps->new_mv_count = (double)(fp_acc_data->new_mv_count) / num_mbs;
     fps->MVr = (double)(fp_acc_data->sum_mvr) / fp_acc_data->mvcount;
     fps->mvr_abs = (double)(fp_acc_data->sum_mvr_abs) / fp_acc_data->mvcount;
     fps->MVc = (double)(fp_acc_data->sum_mvc) / fp_acc_data->mvcount;
@@ -796,6 +821,7 @@ static void first_pass_stat_calc(VP9_COMP *cpi, FIRSTPASS_STATS *fps,
         (double)(fp_acc_data->sum_in_vectors) / (fp_acc_data->mvcount * 2);
     fps->pcnt_motion = (double)(fp_acc_data->mvcount) / num_mbs;
   } else {
+    fps->new_mv_count = 0.0;
     fps->MVr = 0.0;
     fps->mvr_abs = 0.0;
     fps->MVc = 0.0;
@@ -821,6 +847,7 @@ static void accumulate_fp_mb_row_stat(TileDataEnc *this_tile,
   this_tile->fp_data.intra_count_low += fp_acc_data->intra_count_low;
   this_tile->fp_data.intra_count_high += fp_acc_data->intra_count_high;
   this_tile->fp_data.intra_skip_count += fp_acc_data->intra_skip_count;
+  this_tile->fp_data.new_mv_count += fp_acc_data->new_mv_count;
   this_tile->fp_data.mvcount += fp_acc_data->mvcount;
   this_tile->fp_data.sum_mvr += fp_acc_data->sum_mvr;
   this_tile->fp_data.sum_mvr_abs += fp_acc_data->sum_mvr_abs;
@@ -839,6 +866,22 @@ static void accumulate_fp_mb_row_stat(TileDataEnc *this_tile,
                    fp_acc_data->image_data_start_row);
 }
 
+#if CONFIG_RATE_CTRL
+static void store_fp_motion_vector(VP9_COMP *cpi, const MV *mv,
+                                   const int mb_row, const int mb_col,
+                                   MV_REFERENCE_FRAME frame_type,
+                                   const int mv_idx) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int mb_index = mb_row * cm->mb_cols + mb_col;
+  MOTION_VECTOR_INFO *this_motion_vector_info =
+      &cpi->fp_motion_vector_info[mb_index];
+  this_motion_vector_info->ref_frame[mv_idx] = frame_type;
+  if (frame_type != INTRA_FRAME) {
+    this_motion_vector_info->mv[mv_idx].as_mv = *mv;
+  }
+}
+#endif  // CONFIG_RATE_CTRL
+
 #define NZ_MOTION_PENALTY 128
 #define INTRA_MODE_PENALTY 1024
 void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td,
@@ -875,6 +918,9 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td,
   double mb_neutral_count;
   int scaled_low_intra_thresh = scale_sse_threshold(cm, LOW_I_THRESH);
 
+  MV *first_top_mv = &tile_data->firstpass_top_mv;
+  MV last_nonzero_mv = { 0, 0 };
+
   // First pass code requires valid last and new frame buffers.
   assert(new_yv12 != NULL);
   assert(frame_is_intra_only(cm) || (lst_yv12 != NULL));
@@ -913,12 +959,12 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td,
     int level_sample;
     const int mb_index = mb_row * cm->mb_cols + mb_col;
 
-#if CONFIG_FP_MB_STATS
-    const int mb_index = mb_row * cm->mb_cols + mb_col;
-#endif
-
     (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, mb_row, c);
 
+    if (mb_col == mb_col_start) {
+      last_nonzero_mv = *first_top_mv;
+    }
+
     // Adjust to the next column of MBs.
     x->plane[0].src.buf = cpi->Source->y_buffer +
                           mb_row * 16 * x->plane[0].src.stride + mb_col * 16;
@@ -1052,27 +1098,27 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td,
     // Accumulate the intra error.
     fp_acc_data->intra_error += (int64_t)this_error;
 
-#if CONFIG_FP_MB_STATS
-    if (cpi->use_fp_mb_stats) {
-      // initialization
-      cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
-    }
-#endif
-
     // Set up limit values for motion vectors to prevent them extending
     // outside the UMV borders.
     x->mv_limits.col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16);
     x->mv_limits.col_max =
         ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16;
 
-    // Other than for the first frame do a motion search.
-    if (cm->current_video_frame > 0) {
+    // Other than for intra-only frame do a motion search.
+    if (!frame_is_intra_only(cm)) {
       int tmp_err, motion_error, this_motion_error, raw_motion_error;
       // Assume 0,0 motion with no mv overhead.
       MV mv = { 0, 0 }, tmp_mv = { 0, 0 };
       struct buf_2d unscaled_last_source_buf_2d;
       vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize];
 
+#if CONFIG_RATE_CTRL
+      if (cpi->oxcf.use_simple_encode_api) {
+        // Store zero mv as default
+        store_fp_motion_vector(cpi, &mv, mb_row, mb_col, LAST_FRAME, 0);
+      }
+#endif  // CONFIG_RAGE_CTRL
+
       xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
 #if CONFIG_VP9_HIGHBITDEPTH
       if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -1137,6 +1183,11 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td,
                 vp9_get_mvpred_var(x, &tmp_mv, &zero_mv, &v_fn_ptr, 0);
           }
         }
+#if CONFIG_RATE_CTRL
+        if (cpi->oxcf.use_simple_encode_api) {
+          store_fp_motion_vector(cpi, &mv, mb_row, mb_col, LAST_FRAME, 0);
+        }
+#endif  // CONFIG_RAGE_CTRL
 
         // Search in an older reference frame.
         if ((cm->current_video_frame > 1) && gld_yv12 != NULL) {
@@ -1158,6 +1209,12 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td,
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
           first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &gf_motion_error);
+#if CONFIG_RATE_CTRL
+          if (cpi->oxcf.use_simple_encode_api) {
+            store_fp_motion_vector(cpi, &tmp_mv, mb_row, mb_col, GOLDEN_FRAME,
+                                   1);
+          }
+#endif  // CONFIG_RAGE_CTRL
 
           if (gf_motion_error < motion_error && gf_motion_error < this_error)
             ++(fp_acc_data->second_ref_count);
@@ -1186,20 +1243,6 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td,
       best_ref_mv->row = 0;
       best_ref_mv->col = 0;
 
-#if CONFIG_FP_MB_STATS
-      if (cpi->use_fp_mb_stats) {
-        // intra prediction statistics
-        cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
-        cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK;
-        cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK;
-        if (this_error > FPMB_ERROR_LARGE_TH) {
-          cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK;
-        } else if (this_error < FPMB_ERROR_SMALL_TH) {
-          cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_SMALL_MASK;
-        }
-      }
-#endif
-
       if (motion_error <= this_error) {
         vpx_clear_system_state();
 
@@ -1231,7 +1274,7 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td,
         xd->mi[0]->mv[0].as_mv = mv;
         xd->mi[0]->tx_size = TX_4X4;
         xd->mi[0]->ref_frame[0] = LAST_FRAME;
-        xd->mi[0]->ref_frame[1] = NONE;
+        xd->mi[0]->ref_frame[1] = NO_REF_FRAME;
         vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize);
         vp9_encode_sby_pass1(x, bsize);
         fp_acc_data->sum_mvr += mv.row;
@@ -1244,47 +1287,13 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td,
 
         *best_ref_mv = mv;
 
-#if CONFIG_FP_MB_STATS
-        if (cpi->use_fp_mb_stats) {
-          // inter prediction statistics
-          cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
-          cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK;
-          cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK;
-          if (this_error > FPMB_ERROR_LARGE_TH) {
-            cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK;
-          } else if (this_error < FPMB_ERROR_SMALL_TH) {
-            cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_SMALL_MASK;
-          }
-        }
-#endif
-
         if (!is_zero_mv(&mv)) {
           ++(fp_acc_data->mvcount);
-
-#if CONFIG_FP_MB_STATS
-          if (cpi->use_fp_mb_stats) {
-            cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_MOTION_ZERO_MASK;
-            // check estimated motion direction
-            if (mv.as_mv.col > 0 && mv.as_mv.col >= abs(mv.as_mv.row)) {
-              // right direction
-              cpi->twopass.frame_mb_stats_buf[mb_index] |=
-                  FPMB_MOTION_RIGHT_MASK;
-            } else if (mv.as_mv.row < 0 &&
-                       abs(mv.as_mv.row) >= abs(mv.as_mv.col)) {
-              // up direction
-              cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_UP_MASK;
-            } else if (mv.as_mv.col < 0 &&
-                       abs(mv.as_mv.col) >= abs(mv.as_mv.row)) {
-              // left direction
-              cpi->twopass.frame_mb_stats_buf[mb_index] |=
-                  FPMB_MOTION_LEFT_MASK;
-            } else {
-              // down direction
-              cpi->twopass.frame_mb_stats_buf[mb_index] |=
-                  FPMB_MOTION_DOWN_MASK;
-            }
+          if (!is_equal_mv(&mv, &last_nonzero_mv)) {
+            ++(fp_acc_data->new_mv_count);
           }
-#endif
+          last_nonzero_mv = mv;
+
           // Does the row vector point inwards or outwards?
           if (mb_row < cm->mb_rows / 2) {
             if (mv.row > 0)
@@ -1331,9 +1340,17 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td,
       }
     } else {
       fp_acc_data->sr_coded_error += (int64_t)this_error;
+#if CONFIG_RATE_CTRL
+      if (cpi->oxcf.use_simple_encode_api) {
+        store_fp_motion_vector(cpi, NULL, mb_row, mb_col, INTRA_FRAME, 0);
+      }
+#endif  // CONFIG_RAGE_CTRL
     }
     fp_acc_data->coded_error += (int64_t)this_error;
 
+    if (mb_col == mb_col_start) {
+      *first_top_mv = last_nonzero_mv;
+    }
     recon_yoffset += 16;
     recon_uvoffset += uv_mb_height;
 
@@ -1356,6 +1373,14 @@ static void first_pass_encode(VP9_COMP *cpi, FIRSTPASS_DATA *fp_acc_data) {
   MV best_ref_mv;
   // Tiling is ignored in the first pass.
   vp9_tile_init(tile, cm, 0, 0);
+  tile_data.firstpass_top_mv = zero_mv;
+#if CONFIG_RATE_CTRL
+  if (cpi->oxcf.use_simple_encode_api) {
+    fp_motion_vector_info_reset(cpi->frame_info.frame_width,
+                                cpi->frame_info.frame_height,
+                                cpi->fp_motion_vector_info);
+  }
+#endif
 
   for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
     best_ref_mv = zero_mv;
@@ -1388,12 +1413,6 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
   assert(new_yv12 != NULL);
   assert(frame_is_intra_only(cm) || (lst_yv12 != NULL));
 
-#if CONFIG_FP_MB_STATS
-  if (cpi->use_fp_mb_stats) {
-    vp9_zero_array(cpi->twopass.frame_mb_stats_buf, cm->initial_mbs);
-  }
-#endif
-
   set_first_pass_params(cpi);
   vp9_set_quantizer(cpi, find_fp_qindex(cm->bit_depth));
 
@@ -1420,7 +1439,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
 
   if (cpi->row_mt_bit_exact && cpi->twopass.fp_mb_float_stats == NULL)
     CHECK_MEM_ERROR(
-        cm, cpi->twopass.fp_mb_float_stats,
+        &cm->error, cpi->twopass.fp_mb_float_stats,
         vpx_calloc(cm->MBs * sizeof(*cpi->twopass.fp_mb_float_stats), 1));
 
   {
@@ -1446,7 +1465,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
       first_pass_stat_calc(cpi, &fps, &(first_tile_col->fp_data));
     }
 
-    // Dont allow a value of 0 for duration.
+    // Don't allow a value of 0 for duration.
     // (Section duration is also defaulted to minimum of 1.0).
     fps.duration = VPXMAX(1.0, (double)(source->ts_end - source->ts_start));
 
@@ -1454,15 +1473,9 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
     twopass->this_frame_stats = fps;
     output_stats(&twopass->this_frame_stats);
     accumulate_stats(&twopass->total_stats, &fps);
-
-#if CONFIG_FP_MB_STATS
-    if (cpi->use_fp_mb_stats) {
-      output_fpmb_stats(twopass->frame_mb_stats_buf, cm, cpi->output_pkt_list);
-    }
-#endif
   }
 
-  // Copy the previous Last Frame back into gf and and arf buffers if
+  // Copy the previous Last Frame back into gf and arf buffers if
   // the prediction is good enough... but also don't allow it to lag too far.
   if ((twopass->sr_update_lag > 3) ||
       ((cm->current_video_frame > 0) &&
@@ -1491,22 +1504,6 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
                cm->ref_frame_map[cpi->lst_fb_idx]);
   }
 
-  // Use this to see what the first pass reconstruction looks like.
-  if (0) {
-    char filename[512];
-    FILE *recon_file;
-    snprintf(filename, sizeof(filename), "enc%04d.yuv",
-             (int)cm->current_video_frame);
-
-    if (cm->current_video_frame == 0)
-      recon_file = fopen(filename, "wb");
-    else
-      recon_file = fopen(filename, "ab");
-
-    (void)fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1, recon_file);
-    fclose(recon_file);
-  }
-
   // In the first pass, every frame is considered as a show frame.
   update_frame_indexes(cm, /*show_frame=*/1);
   if (cpi->use_svc) vp9_inc_frame_in_layer(cpi);
@@ -1679,7 +1676,7 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
 
   // Scan the first pass file and calculate a modified score for each
   // frame that is used to distribute bits. The modified score is assumed
-  // to provide a linear basis for bit allocation. I.e a frame A with a score
+  // to provide a linear basis for bit allocation. I.e., a frame A with a score
   // that is double that of frame B will be allocated 2x as many bits.
   {
     double modified_score_total = 0.0;
@@ -1704,8 +1701,8 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
     }
 
     // Second scan using clamps based on the previous cycle average.
-    // This may modify the total and average somewhat but we dont bother with
-    // further itterations.
+    // This may modify the total and average somewhat but we don't bother with
+    // further iterations.
     modified_score_total = 0.0;
     s = twopass->stats_in;
     while (s < twopass->stats_in_end) {
@@ -1771,60 +1768,59 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
   twopass->arnr_strength_adjustment = 0;
 }
 
-#define SR_DIFF_PART 0.0015
-#define INTRA_PART 0.005
-#define DEFAULT_DECAY_LIMIT 0.75
-#define LOW_SR_DIFF_TRHESH 0.1
-#define SR_DIFF_MAX 128.0
-#define LOW_CODED_ERR_PER_MB 10.0
-#define NCOUNT_FRAME_II_THRESH 6.0
-
-static double get_sr_decay_rate(const FRAME_INFO *frame_info,
+/* This function considers how the quality of prediction may be deteriorating
+ * with distance. It compares the coded error for the last frame and the
+ * second reference frame (usually two frames old) and also applies a factor
+ * based on the extent of INTRA coding.
+ *
+ * The decay factor is then used to reduce the contribution of frames further
+ * from the alt-ref or golden frame, to the bitrate boost calculation for that
+ * alt-ref or golden frame.
+ */
+static double get_sr_decay_rate(const TWO_PASS *const twopass,
                                 const FIRSTPASS_STATS *frame) {
   double sr_diff = (frame->sr_coded_error - frame->coded_error);
   double sr_decay = 1.0;
-  double modified_pct_inter;
-  double modified_pcnt_intra;
-  const double motion_amplitude_part =
-      frame->pcnt_motion *
-      ((frame->mvc_abs + frame->mvr_abs) /
-       (frame_info->frame_height + frame_info->frame_width));
-
-  modified_pct_inter = frame->pcnt_inter;
-  if ((frame->coded_error > LOW_CODED_ERR_PER_MB) &&
-      ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) <
-       (double)NCOUNT_FRAME_II_THRESH)) {
-    modified_pct_inter =
-        frame->pcnt_inter + frame->pcnt_intra_low - frame->pcnt_neutral;
-  }
-  modified_pcnt_intra = 100 * (1.0 - modified_pct_inter);
 
+  // Do nothing if the second ref to last frame error difference is
+  // very small or even negative.
   if ((sr_diff > LOW_SR_DIFF_TRHESH)) {
-    sr_diff = VPXMIN(sr_diff, SR_DIFF_MAX);
-    sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) - motion_amplitude_part -
-               (INTRA_PART * modified_pcnt_intra);
+    const double sr_diff_part =
+        twopass->sr_diff_factor * ((sr_diff * 0.25) / frame->intra_error);
+    double modified_pct_inter = frame->pcnt_inter;
+    double modified_pcnt_intra;
+
+    if ((frame->coded_error > LOW_CODED_ERR_PER_MB) &&
+        ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) <
+         (double)NCOUNT_FRAME_II_THRESH)) {
+      modified_pct_inter =
+          frame->pcnt_inter + frame->pcnt_intra_low - frame->pcnt_neutral;
+    }
+    modified_pcnt_intra = 100 * (1.0 - modified_pct_inter);
+
+    sr_decay = 1.0 - sr_diff_part - (INTRA_PART * modified_pcnt_intra);
   }
-  return VPXMAX(sr_decay, DEFAULT_DECAY_LIMIT);
+  return VPXMAX(sr_decay, twopass->sr_default_decay_limit);
 }
 
 // This function gives an estimate of how badly we believe the prediction
 // quality is decaying from frame to frame.
-static double get_zero_motion_factor(const FRAME_INFO *frame_info,
+static double get_zero_motion_factor(const TWO_PASS *const twopass,
                                      const FIRSTPASS_STATS *frame_stats) {
   const double zero_motion_pct =
       frame_stats->pcnt_inter - frame_stats->pcnt_motion;
-  double sr_decay = get_sr_decay_rate(frame_info, frame_stats);
+  double sr_decay = get_sr_decay_rate(twopass, frame_stats);
   return VPXMIN(sr_decay, zero_motion_pct);
 }
 
-#define ZM_POWER_FACTOR 0.75
-
-static double get_prediction_decay_rate(const FRAME_INFO *frame_info,
+static double get_prediction_decay_rate(const TWO_PASS *const twopass,
                                         const FIRSTPASS_STATS *frame_stats) {
-  const double sr_decay_rate = get_sr_decay_rate(frame_info, frame_stats);
-  const double zero_motion_factor =
-      (0.95 * pow((frame_stats->pcnt_inter - frame_stats->pcnt_motion),
-                  ZM_POWER_FACTOR));
+  const double sr_decay_rate = get_sr_decay_rate(twopass, frame_stats);
+  double zero_motion_factor =
+      twopass->zm_factor * (frame_stats->pcnt_inter - frame_stats->pcnt_motion);
+
+  // Check that the zero motion factor is valid
+  assert(zero_motion_factor >= 0.0 && zero_motion_factor <= 1.0);
 
   return VPXMAX(zero_motion_factor,
                 (sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor)));
@@ -1863,7 +1859,7 @@ static int detect_flash_from_frame_stats(const FIRSTPASS_STATS *frame_stats) {
   // brief break in prediction (such as a flash) but subsequent frames
   // are reasonably well predicted by an earlier (pre flash) frame.
   // The recovery after a flash is indicated by a high pcnt_second_ref
-  // useage or a second ref coded error notabley lower than the last
+  // usage or a second ref coded error notabley lower than the last
   // frame coded error.
   if (frame_stats == NULL) {
     return 0;
@@ -1906,10 +1902,9 @@ static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats,
   }
 }
 
-#define BASELINE_ERR_PER_MB 12500.0
-#define GF_MAX_BOOST 96.0
 static double calc_frame_boost(const FRAME_INFO *frame_info,
                                const FIRSTPASS_STATS *this_frame,
+                               const TWO_PASS *const twopass,
                                int avg_frame_qindex,
                                double this_frame_mv_in_out) {
   double frame_boost;
@@ -1918,8 +1913,8 @@ static double calc_frame_boost(const FRAME_INFO *frame_info,
   const double boost_q_correction = VPXMIN((0.5 + (lq * 0.015)), 1.5);
   const double active_area = calculate_active_area(frame_info, this_frame);
 
-  // Underlying boost factor is based on inter error ratio.
-  frame_boost = (BASELINE_ERR_PER_MB * active_area) /
+  // Frame booost is based on inter error.
+  frame_boost = (twopass->err_per_mb * active_area) /
                 DOUBLE_DIVIDE_CHECK(this_frame->coded_error);
 
   // Small adjustment for cases where there is a zoom out
@@ -1929,37 +1924,25 @@ static double calc_frame_boost(const FRAME_INFO *frame_info,
   // Q correction and scalling
   frame_boost = frame_boost * boost_q_correction;
 
-  return VPXMIN(frame_boost, GF_MAX_BOOST * boost_q_correction);
-}
-
-static double kf_err_per_mb(VP9_COMP *cpi) {
-  const VP9_COMMON *const cm = &cpi->common;
-  unsigned int screen_area = (cm->width * cm->height);
-
-  // Use a different error per mb factor for calculating boost for
-  //  different formats.
-  if (screen_area < 1280 * 720) {
-    return 2000.0;
-  } else if (screen_area < 1920 * 1080) {
-    return 500.0;
-  }
-  return 250.0;
+  return VPXMIN(frame_boost, twopass->gf_frame_max_boost * boost_q_correction);
 }
 
 static double calc_kf_frame_boost(VP9_COMP *cpi,
                                   const FIRSTPASS_STATS *this_frame,
                                   double *sr_accumulator,
                                   double this_frame_mv_in_out,
-                                  double max_boost) {
+                                  double zm_factor) {
+  TWO_PASS *const twopass = &cpi->twopass;
   double frame_boost;
   const double lq = vp9_convert_qindex_to_q(
       cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth);
   const double boost_q_correction = VPXMIN((0.50 + (lq * 0.015)), 2.00);
   const double active_area =
       calculate_active_area(&cpi->frame_info, this_frame);
+  double max_boost;
 
-  // Underlying boost factor is based on inter error ratio.
-  frame_boost = (kf_err_per_mb(cpi) * active_area) /
+  // Frame booost is based on inter error.
+  frame_boost = (twopass->kf_err_per_mb * active_area) /
                 DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator);
 
   // Update the accumulator for second ref error difference.
@@ -1976,15 +1959,23 @@ static double calc_kf_frame_boost(VP9_COMP *cpi,
   // The 40.0 value here is an experimentally derived baseline minimum.
   // This value is in line with the minimum per frame boost in the alt_ref
   // boost calculation.
-  frame_boost = ((frame_boost + 40.0) * boost_q_correction);
+  frame_boost =
+      (frame_boost + twopass->kf_frame_min_boost) * boost_q_correction;
+
+  // Maximum allowed boost this frame. May be different for first vs subsequent
+  // key frames.
+  max_boost = (cpi->common.current_video_frame == 0)
+                  ? twopass->kf_frame_max_boost_first
+                  : twopass->kf_frame_max_boost_subs;
+  max_boost *= zm_factor * boost_q_correction;
 
-  return VPXMIN(frame_boost, max_boost * boost_q_correction);
+  return VPXMIN(frame_boost, max_boost);
 }
 
 static int compute_arf_boost(const FRAME_INFO *frame_info,
-                             const FIRST_PASS_INFO *first_pass_info,
-                             int arf_show_idx, int f_frames, int b_frames,
-                             int avg_frame_qindex) {
+                             TWO_PASS *const twopass, int arf_show_idx,
+                             int f_frames, int b_frames, int avg_frame_qindex) {
+  const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info;
   int i;
   double boost_score = 0.0;
   double mv_ratio_accumulator = 0.0;
@@ -2015,14 +2006,14 @@ static int compute_arf_boost(const FRAME_INFO *frame_info,
 
     // Accumulate the effect of prediction quality decay.
     if (!flash_detected) {
-      decay_accumulator *= get_prediction_decay_rate(frame_info, this_frame);
+      decay_accumulator *= get_prediction_decay_rate(twopass, this_frame);
       decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
                               ? MIN_DECAY_FACTOR
                               : decay_accumulator;
     }
-    boost_score += decay_accumulator * calc_frame_boost(frame_info, this_frame,
-                                                        avg_frame_qindex,
-                                                        this_frame_mv_in_out);
+    boost_score += decay_accumulator *
+                   calc_frame_boost(frame_info, this_frame, twopass,
+                                    avg_frame_qindex, this_frame_mv_in_out);
   }
 
   arf_boost = (int)boost_score;
@@ -2048,21 +2039,21 @@ static int compute_arf_boost(const FRAME_INFO *frame_info,
         this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
         &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
 
-    // We want to discount the the flash frame itself and the recovery
+    // We want to discount the flash frame itself and the recovery
     // frame that follows as both will have poor scores.
     flash_detected = detect_flash_from_frame_stats(this_frame) ||
                      detect_flash_from_frame_stats(next_frame);
 
     // Cumulative effect of prediction quality decay.
     if (!flash_detected) {
-      decay_accumulator *= get_prediction_decay_rate(frame_info, this_frame);
+      decay_accumulator *= get_prediction_decay_rate(twopass, this_frame);
       decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
                               ? MIN_DECAY_FACTOR
                               : decay_accumulator;
     }
-    boost_score += decay_accumulator * calc_frame_boost(frame_info, this_frame,
-                                                        avg_frame_qindex,
-                                                        this_frame_mv_in_out);
+    boost_score += decay_accumulator *
+                   calc_frame_boost(frame_info, this_frame, twopass,
+                                    avg_frame_qindex, this_frame_mv_in_out);
   }
   arf_boost += (int)boost_score;
 
@@ -2078,8 +2069,8 @@ static int calc_arf_boost(VP9_COMP *cpi, int f_frames, int b_frames) {
   TWO_PASS *const twopass = &cpi->twopass;
   const int avg_inter_frame_qindex = cpi->rc.avg_frame_qindex[INTER_FRAME];
   int arf_show_idx = get_show_idx(twopass);
-  return compute_arf_boost(frame_info, &twopass->first_pass_info, arf_show_idx,
-                           f_frames, b_frames, avg_inter_frame_qindex);
+  return compute_arf_boost(frame_info, twopass, arf_show_idx, f_frames,
+                           b_frames, avg_inter_frame_qindex);
 }
 
 // Calculate a section intra ratio used in setting max loop filter.
@@ -2134,11 +2125,10 @@ static int64_t calculate_total_gf_group_bits(VP9_COMP *cpi,
   }
 
   // Clamp odd edge cases.
-  total_group_bits = (total_group_bits < 0)
-                         ? 0
-                         : (total_group_bits > twopass->kf_group_bits)
-                               ? twopass->kf_group_bits
-                               : total_group_bits;
+  total_group_bits = (total_group_bits < 0) ? 0
+                     : (total_group_bits > twopass->kf_group_bits)
+                         ? twopass->kf_group_bits
+                         : total_group_bits;
 
   // Clip based on user supplied data rate variability limit.
   if (total_group_bits > (int64_t)max_bits * gop_frames)
@@ -2180,7 +2170,7 @@ static double calculate_group_score(VP9_COMP *cpi, double av_score,
   double score_total = 0.0;
   int i = 0;
 
-  // We dont ever want to return a 0 score here.
+  // We don't ever want to return a 0 score here.
   if (frame_count == 0) return 1.0;
 
   while ((i < frame_count) && (s < twopass->stats_in_end)) {
@@ -2494,6 +2484,9 @@ typedef struct RANGE {
  * (The following fields will remain unchanged after initialization of encoder.)
  *   rc->static_scene_max_gf_interval
  *   rc->min_gf_interval
+ *   twopass->sr_diff_factor
+ *   twopass->sr_default_decay_limit
+ *   twopass->zm_factor
  *
  * Dynamic fields:
  * (The following fields will be updated before or after coding each frame.)
@@ -2509,9 +2502,10 @@ typedef struct RANGE {
  */
 static int get_gop_coding_frame_num(
     int *use_alt_ref, const FRAME_INFO *frame_info,
-    const FIRST_PASS_INFO *first_pass_info, const RATE_CONTROL *rc,
+    const TWO_PASS *const twopass, const RATE_CONTROL *rc,
     int gf_start_show_idx, const RANGE *active_gf_interval,
-    double gop_intra_factor, int lag_in_frames) {
+    double gop_intra_factor, int lag_in_frames, int *end_of_sequence) {
+  const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info;
   double loop_decay_rate = 1.00;
   double mv_ratio_accumulator = 0.0;
   double this_frame_mv_in_out = 0.0;
@@ -2536,6 +2530,7 @@ static int get_gop_coding_frame_num(
     next_frame = fps_get_frame_stats(first_pass_info,
                                      gf_start_show_idx + gop_coding_frames);
     if (next_frame == NULL) {
+      *end_of_sequence = gop_coding_frames == 1 && rc->source_alt_ref_active;
       break;
     }
 
@@ -2552,15 +2547,14 @@ static int get_gop_coding_frame_num(
 
     // Monitor for static sections.
     if ((rc->frames_since_key + gop_coding_frames - 1) > 1) {
-      zero_motion_accumulator =
-          VPXMIN(zero_motion_accumulator,
-                 get_zero_motion_factor(frame_info, next_frame));
+      zero_motion_accumulator = VPXMIN(
+          zero_motion_accumulator, get_zero_motion_factor(twopass, next_frame));
     }
 
     // Accumulate the effect of prediction quality decay.
     if (!flash_detected) {
       double last_loop_decay_rate = loop_decay_rate;
-      loop_decay_rate = get_prediction_decay_rate(frame_info, next_frame);
+      loop_decay_rate = get_prediction_decay_rate(twopass, next_frame);
 
       // Break clause to detect very still sections after motion. For example,
       // a static image after a fade or other transition.
@@ -2605,7 +2599,7 @@ static int get_gop_coding_frame_num(
     if (
         // Don't break out with a very short interval.
         (gop_coding_frames >= active_gf_interval->min) &&
-        // If possible dont break very close to a kf
+        // If possible don't break very close to a kf
         ((rc->frames_to_key - gop_coding_frames) >= rc->min_gf_interval) &&
         (gop_coding_frames & 0x01) && (!flash_detected) &&
         ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) ||
@@ -2620,25 +2614,25 @@ static int get_gop_coding_frame_num(
   return gop_coding_frames;
 }
 
-static RANGE get_active_gf_inverval_range(
-    const FRAME_INFO *frame_info, const RATE_CONTROL *rc, int arf_active_or_kf,
-    int gf_start_show_idx, int active_worst_quality, int last_boosted_qindex) {
+static RANGE get_active_gf_inverval_range_simple(int min_gf_interval,
+                                                 int arf_active_or_kf,
+                                                 int frames_to_key) {
   RANGE active_gf_interval;
-#if CONFIG_RATE_CTRL
-  (void)frame_info;
-  (void)gf_start_show_idx;
-  (void)active_worst_quality;
-  (void)last_boosted_qindex;
-  active_gf_interval.min = rc->min_gf_interval + arf_active_or_kf + 2;
-
+  active_gf_interval.min = min_gf_interval + arf_active_or_kf + 2;
   active_gf_interval.max = 16 + arf_active_or_kf;
 
-  if ((active_gf_interval.max <= rc->frames_to_key) &&
-      (active_gf_interval.max >= (rc->frames_to_key - rc->min_gf_interval))) {
-    active_gf_interval.min = rc->frames_to_key / 2;
-    active_gf_interval.max = rc->frames_to_key / 2;
+  if ((active_gf_interval.max <= frames_to_key) &&
+      (active_gf_interval.max >= (frames_to_key - min_gf_interval))) {
+    active_gf_interval.min = frames_to_key / 2;
+    active_gf_interval.max = frames_to_key / 2;
   }
-#else
+  return active_gf_interval;
+}
+
+static RANGE get_active_gf_inverval_range(
+    const FRAME_INFO *frame_info, const RATE_CONTROL *rc, int arf_active_or_kf,
+    int gf_start_show_idx, int active_worst_quality, int last_boosted_qindex) {
+  RANGE active_gf_interval;
   int int_max_q = (int)(vp9_convert_qindex_to_q(active_worst_quality,
                                                 frame_info->bit_depth));
   int q_term = (gf_start_show_idx == 0)
@@ -2676,7 +2670,6 @@ static RANGE get_active_gf_inverval_range(
   }
   active_gf_interval.max =
       VPXMAX(active_gf_interval.max, active_gf_interval.min);
-#endif
   return active_gf_interval;
 }
 
@@ -2728,18 +2721,28 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) {
   double gop_intra_factor;
   int gop_frames;
   RANGE active_gf_interval;
+  // Whether this is at the end of last GOP of this sequence.
+  int end_of_sequence = 0;
 
   // Reset the GF group data structures unless this is a key
   // frame in which case it will already have been done.
   if (is_key_frame == 0) {
     vp9_zero(twopass->gf_group);
+    ++rc->gop_global_index;
+  } else {
+    rc->gop_global_index = 0;
   }
 
   vpx_clear_system_state();
 
-  active_gf_interval = get_active_gf_inverval_range(
-      frame_info, rc, arf_active_or_kf, gf_start_show_idx,
-      twopass->active_worst_quality, rc->last_boosted_qindex);
+  if (oxcf->use_simple_encode_api) {
+    active_gf_interval = get_active_gf_inverval_range_simple(
+        rc->min_gf_interval, arf_active_or_kf, rc->frames_to_key);
+  } else {
+    active_gf_interval = get_active_gf_inverval_range(
+        frame_info, rc, arf_active_or_kf, gf_start_show_idx,
+        twopass->active_worst_quality, rc->last_boosted_qindex);
+  }
 
   if (cpi->multi_layer_arf) {
     int arf_layers = get_arf_layers(cpi->multi_layer_arf, oxcf->enable_auto_arf,
@@ -2749,26 +2752,55 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) {
     gop_intra_factor = 1.0;
   }
 
+  gop_coding_frames = get_gop_coding_frame_num(
+      &use_alt_ref, frame_info, twopass, rc, gf_start_show_idx,
+      &active_gf_interval, gop_intra_factor, cpi->oxcf.lag_in_frames,
+      &end_of_sequence);
+  use_alt_ref &= allow_alt_ref;
 #if CONFIG_RATE_CTRL
-  {
+  // If the external gop_command is on, we will override the decisions
+  // of gop_coding_frames and use_alt_ref.
+  if (cpi->oxcf.use_simple_encode_api) {
     const GOP_COMMAND *gop_command = &cpi->encode_command.gop_command;
     assert(allow_alt_ref == 1);
     if (gop_command->use) {
       gop_coding_frames = gop_command_coding_frame_count(gop_command);
       use_alt_ref = gop_command->use_alt_ref;
-    } else {
-      gop_coding_frames = get_gop_coding_frame_num(
-          &use_alt_ref, frame_info, first_pass_info, rc, gf_start_show_idx,
-          &active_gf_interval, gop_intra_factor, cpi->oxcf.lag_in_frames);
-      use_alt_ref &= allow_alt_ref;
     }
   }
-#else
-  gop_coding_frames = get_gop_coding_frame_num(
-      &use_alt_ref, frame_info, first_pass_info, rc, gf_start_show_idx,
-      &active_gf_interval, gop_intra_factor, cpi->oxcf.lag_in_frames);
-  use_alt_ref &= allow_alt_ref;
 #endif
+  // If the external rate control model for GOP is used, the gop decisions
+  // are overwritten. Specifically, |gop_coding_frames| and |use_alt_ref|
+  // will be overwritten.
+  if (cpi->ext_ratectrl.ready &&
+      (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 &&
+      cpi->ext_ratectrl.funcs.get_gop_decision != NULL && !end_of_sequence) {
+    vpx_codec_err_t codec_status;
+    vpx_rc_gop_decision_t gop_decision;
+    vpx_rc_gop_info_t gop_info;
+    gop_info.min_gf_interval = rc->min_gf_interval;
+    gop_info.max_gf_interval = rc->max_gf_interval;
+    gop_info.active_min_gf_interval = active_gf_interval.min;
+    gop_info.active_max_gf_interval = active_gf_interval.max;
+    gop_info.allow_alt_ref = allow_alt_ref;
+    gop_info.is_key_frame = is_key_frame;
+    gop_info.last_gop_use_alt_ref = rc->source_alt_ref_active;
+    gop_info.frames_since_key = rc->frames_since_key;
+    gop_info.frames_to_key = rc->frames_to_key;
+    gop_info.lag_in_frames = cpi->oxcf.lag_in_frames;
+    gop_info.show_index = cm->current_video_frame;
+    gop_info.coding_index = cm->current_frame_coding_index;
+    gop_info.gop_global_index = rc->gop_global_index;
+
+    codec_status = vp9_extrc_get_gop_decision(&cpi->ext_ratectrl, &gop_info,
+                                              &gop_decision);
+    if (codec_status != VPX_CODEC_OK) {
+      vpx_internal_error(&cm->error, codec_status,
+                         "vp9_extrc_get_gop_decision() failed");
+    }
+    gop_coding_frames = gop_decision.gop_coding_frames;
+    use_alt_ref = gop_decision.use_alt_ref;
+  }
 
   // Was the group length constrained by the requirement for a new KF?
   rc->constrained_gf_group = (gop_coding_frames >= rc->frames_to_key) ? 1 : 0;
@@ -2787,8 +2819,8 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) {
 
     // Calculate the boost for alt ref.
     rc->gfu_boost =
-        compute_arf_boost(frame_info, first_pass_info, arf_show_idx, f_frames,
-                          b_frames, avg_inter_frame_qindex);
+        compute_arf_boost(frame_info, twopass, arf_show_idx, f_frames, b_frames,
+                          avg_inter_frame_qindex);
     rc->source_alt_ref_pending = 1;
   } else {
     const int f_frames = gop_coding_frames - 1;
@@ -2798,9 +2830,9 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) {
     const int gld_show_idx =
         VPXMIN(gf_start_show_idx + 1, fps_get_num_frames(first_pass_info));
     const int arf_boost =
-        compute_arf_boost(frame_info, first_pass_info, gld_show_idx, f_frames,
-                          b_frames, avg_inter_frame_qindex);
-    rc->gfu_boost = VPXMIN(MAX_GF_BOOST, arf_boost);
+        compute_arf_boost(frame_info, twopass, gld_show_idx, f_frames, b_frames,
+                          avg_inter_frame_qindex);
+    rc->gfu_boost = VPXMIN((int)twopass->gf_max_total_boost, arf_boost);
     rc->source_alt_ref_pending = 0;
   }
 
@@ -2903,7 +2935,9 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) {
         cpi, group_av_err, (group_av_skip_pct + group_av_inactive_zone),
         group_av_noise, vbr_group_bits_per_frame);
     twopass->active_worst_quality =
-        (tmp_q + (twopass->active_worst_quality * 3)) >> 2;
+        (int)((tmp_q + (twopass->active_worst_quality *
+                        (twopass->active_wq_factor - 1))) /
+              twopass->active_wq_factor);
 
 #if CONFIG_ALWAYS_ADJUST_BPM
     // Reset rolling actual and target bits counters for ARF groups.
@@ -3003,7 +3037,7 @@ static int intra_step_transition(const FIRSTPASS_STATS *this_frame,
       next_frame->intra_error / DOUBLE_DIVIDE_CHECK(next_frame->coded_error);
 
   // Return true the intra/inter ratio for the current frame is
-  // low but better in the next and previous frame and the relative useage of
+  // low but better in the next and previous frame and the relative usage of
   // intra in the current frame is markedly higher than the last and next frame.
   if ((this_ii_ratio < 2.0) && (last_ii_ratio > 2.25) &&
       (next_ii_ratio > 2.25) && (this_pcnt_intra > (3 * last_pcnt_intra)) &&
@@ -3024,8 +3058,8 @@ static int intra_step_transition(const FIRSTPASS_STATS *this_frame,
 // Minimum % intra coding observed in first pass (1.0 = 100%)
 #define MIN_INTRA_LEVEL 0.25
 // Threshold for use of the lagging second reference frame. Scene cuts do not
-// usually have a high second ref useage.
-#define SECOND_REF_USEAGE_THRESH 0.2
+// usually have a high second ref usage.
+#define SECOND_REF_USAGE_THRESH 0.2
 // Hard threshold where the first pass chooses intra for almost all blocks.
 // In such a case even if the frame is not a scene cut coding a key frame
 // may be a good option.
@@ -3055,7 +3089,7 @@ static int test_candidate_kf(const FIRST_PASS_INFO *first_pass_info,
   detect_flash_from_frame_stats(next_frame);
   if (!detect_flash_from_frame_stats(this_frame) &&
       !detect_flash_from_frame_stats(next_frame) &&
-      (this_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) &&
+      (this_frame->pcnt_second_ref < SECOND_REF_USAGE_THRESH) &&
       ((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) ||
        (slide_transition(this_frame, last_frame, next_frame)) ||
        (intra_step_transition(this_frame, last_frame, next_frame)) ||
@@ -3123,18 +3157,10 @@ static int test_candidate_kf(const FIRST_PASS_INFO *first_pass_info,
 #define MIN_SCAN_FRAMES_FOR_KF_BOOST 32
 #define KF_ABS_ZOOM_THRESH 6.0
 
-#ifdef AGGRESSIVE_VBR
-#define KF_MAX_FRAME_BOOST 80.0
-#define MAX_KF_TOT_BOOST 4800
-#else
-#define KF_MAX_FRAME_BOOST 96.0
-#define MAX_KF_TOT_BOOST 5400
-#endif
-
 int vp9_get_frames_to_next_key(const VP9EncoderConfig *oxcf,
-                               const FRAME_INFO *frame_info,
-                               const FIRST_PASS_INFO *first_pass_info,
-                               int kf_show_idx, int min_gf_interval) {
+                               const TWO_PASS *const twopass, int kf_show_idx,
+                               int min_gf_interval) {
+  const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info;
   double recent_loop_decay[FRAMES_TO_CHECK_DECAY];
   int j;
   int frames_to_key;
@@ -3161,7 +3187,7 @@ int vp9_get_frames_to_next_key(const VP9EncoderConfig *oxcf,
           break;
 
         // How fast is the prediction quality decaying?
-        loop_decay_rate = get_prediction_decay_rate(frame_info, next_frame);
+        loop_decay_rate = get_prediction_decay_rate(twopass, next_frame);
 
         // We want to know something about the recent past... rather than
         // as used elsewhere where we are concerned with decay in prediction
@@ -3247,8 +3273,8 @@ static void find_next_key_frame(VP9_COMP *cpi, int kf_show_idx) {
   kf_mod_err = calc_norm_frame_score(oxcf, frame_info, keyframe_stats,
                                      mean_mod_score, av_err);
 
-  rc->frames_to_key = vp9_get_frames_to_next_key(
-      oxcf, frame_info, first_pass_info, kf_show_idx, rc->min_gf_interval);
+  rc->frames_to_key = vp9_get_frames_to_next_key(oxcf, twopass, kf_show_idx,
+                                                 rc->min_gf_interval);
 
   // If there is a max kf interval set by the user we must obey it.
   // We already breakout of the loop above at 2x max.
@@ -3330,7 +3356,7 @@ static void find_next_key_frame(VP9_COMP *cpi, int kf_show_idx) {
       if (i > 0) {
         zero_motion_accumulator =
             VPXMIN(zero_motion_accumulator,
-                   get_zero_motion_factor(&cpi->frame_info, &next_frame));
+                   get_zero_motion_factor(twopass, &next_frame));
       } else {
         zero_motion_accumulator =
             next_frame.pcnt_inter - next_frame.pcnt_motion;
@@ -3341,11 +3367,11 @@ static void find_next_key_frame(VP9_COMP *cpi, int kf_show_idx) {
 
       // The second (lagging) ref error is not valid immediately after
       // a key frame because either the lag has not built up (in the case of
-      // the first key frame or it points to a refernce before the new key
+      // the first key frame or it points to a reference before the new key
       // frame.
       if (i < 2) sr_accumulator = 0.0;
-      frame_boost = calc_kf_frame_boost(cpi, &next_frame, &sr_accumulator, 0,
-                                        KF_MAX_FRAME_BOOST * zm_factor);
+      frame_boost =
+          calc_kf_frame_boost(cpi, &next_frame, &sr_accumulator, 0, zm_factor);
 
       boost_score += frame_boost;
 
@@ -3371,15 +3397,15 @@ static void find_next_key_frame(VP9_COMP *cpi, int kf_show_idx) {
   twopass->key_frame_section_intra_rating = calculate_section_intra_ratio(
       start_position, twopass->stats_in_end, rc->frames_to_key);
 
-  // Special case for static / slide show content but dont apply
+  // Special case for static / slide show content but don't apply
   // if the kf group is very short.
   if ((zero_motion_accumulator > 0.99) && (rc->frames_to_key > 8)) {
-    rc->kf_boost = MAX_KF_TOT_BOOST;
+    rc->kf_boost = (int)(twopass->kf_max_total_boost);
   } else {
-    // Apply various clamps for min and max boost
+    // Apply various clamps for min and max oost
     rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3));
     rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST);
-    rc->kf_boost = VPXMIN(rc->kf_boost, MAX_KF_TOT_BOOST);
+    rc->kf_boost = VPXMIN(rc->kf_boost, (int)(twopass->kf_max_total_boost));
   }
 
   // Work out how many bits to allocate for the key frame itself.
@@ -3415,23 +3441,66 @@ static void find_next_key_frame(VP9_COMP *cpi, int kf_show_idx) {
   }
 }
 
-static int is_skippable_frame(const VP9_COMP *cpi) {
-  // If the current frame does not have non-zero motion vector detected in the
-  // first  pass, and so do its previous and forward frames, then this frame
-  // can be skipped for partition check, and the partition size is assigned
-  // according to the variance
-  const TWO_PASS *const twopass = &cpi->twopass;
-
-  return (!frame_is_intra_only(&cpi->common) &&
-          twopass->stats_in - 2 > twopass->stats_in_start &&
-          twopass->stats_in < twopass->stats_in_end &&
-          (twopass->stats_in - 1)->pcnt_inter -
-                  (twopass->stats_in - 1)->pcnt_motion ==
-              1 &&
-          (twopass->stats_in - 2)->pcnt_inter -
-                  (twopass->stats_in - 2)->pcnt_motion ==
-              1 &&
-          twopass->stats_in->pcnt_inter - twopass->stats_in->pcnt_motion == 1);
+// Configure image size specific vizier parameters.
+// Later these will be set via additional command line options
+void vp9_init_vizier_params(TWO_PASS *const twopass, int screen_area) {
+  // When |use_vizier_rc_params| is 1, we expect the rc parameters below to
+  // have been initialised on the command line as adjustment factors such
+  // that a factor of 1.0 will match the default behavior when
+  // |use_vizier_rc_params| is 0
+  if (twopass->use_vizier_rc_params) {
+    twopass->active_wq_factor *= AV_WQ_FACTOR;
+    twopass->err_per_mb *= BASELINE_ERR_PER_MB;
+    twopass->sr_default_decay_limit *= DEFAULT_DECAY_LIMIT;
+    if (twopass->sr_default_decay_limit > 1.0)  // > 1.0 here makes no sense
+      twopass->sr_default_decay_limit = 1.0;
+    twopass->sr_diff_factor *= 1.0;
+    twopass->gf_frame_max_boost *= GF_MAX_FRAME_BOOST;
+    twopass->gf_max_total_boost *= MAX_GF_BOOST;
+    // NOTE: In use max boost has precedence over min boost. So even if min is
+    // somehow set higher than max the final boost value will be clamped to the
+    // appropriate maximum.
+    twopass->kf_frame_min_boost *= KF_MIN_FRAME_BOOST;
+    twopass->kf_frame_max_boost_first *= KF_MAX_FRAME_BOOST;
+    twopass->kf_frame_max_boost_subs *= KF_MAX_FRAME_BOOST;
+    twopass->kf_max_total_boost *= MAX_KF_TOT_BOOST;
+    twopass->zm_factor *= DEFAULT_ZM_FACTOR;
+    if (twopass->zm_factor > 1.0)  // > 1.0 here makes no sense
+      twopass->zm_factor = 1.0;
+
+    // Correction for the fact that the kf_err_per_mb_factor default is
+    // already different for different video formats and ensures that a passed
+    // in value of 1.0 on the vizier command line will still match the current
+    // default.
+    if (screen_area < 1280 * 720) {
+      twopass->kf_err_per_mb *= 2000.0;
+    } else if (screen_area < 1920 * 1080) {
+      twopass->kf_err_per_mb *= 500.0;
+    } else {
+      twopass->kf_err_per_mb *= 250.0;
+    }
+  } else {
+    // When |use_vizier_rc_params| is 0, use defaults.
+    twopass->active_wq_factor = AV_WQ_FACTOR;
+    twopass->err_per_mb = BASELINE_ERR_PER_MB;
+    twopass->sr_default_decay_limit = DEFAULT_DECAY_LIMIT;
+    twopass->sr_diff_factor = 1.0;
+    twopass->gf_frame_max_boost = GF_MAX_FRAME_BOOST;
+    twopass->gf_max_total_boost = MAX_GF_BOOST;
+    twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST;
+    twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST;
+    twopass->kf_frame_max_boost_subs = KF_MAX_FRAME_BOOST;
+    twopass->kf_max_total_boost = MAX_KF_TOT_BOOST;
+    twopass->zm_factor = DEFAULT_ZM_FACTOR;
+
+    if (screen_area < 1280 * 720) {
+      twopass->kf_err_per_mb = 2000.0;
+    } else if (screen_area < 1920 * 1080) {
+      twopass->kf_err_per_mb = 500.0;
+    } else {
+      twopass->kf_err_per_mb = 250.0;
+    }
+  }
 }
 
 void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
@@ -3442,9 +3511,26 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
   FIRSTPASS_STATS this_frame;
   const int show_idx = cm->current_video_frame;
 
+  if (cpi->common.current_frame_coding_index == 0 &&
+      cpi->ext_ratectrl.funcs.send_firstpass_stats != NULL) {
+    const vpx_codec_err_t codec_status = vp9_extrc_send_firstpass_stats(
+        &cpi->ext_ratectrl, &cpi->twopass.first_pass_info);
+    if (codec_status != VPX_CODEC_OK) {
+      vpx_internal_error(&cm->error, codec_status,
+                         "vp9_extrc_send_firstpass_stats() failed");
+    }
+  }
+
   if (!twopass->stats_in) return;
 
-  // If this is an arf frame then we dont want to read the stats file or
+  // Configure image size specific vizier parameters
+  if (cm->current_video_frame == 0) {
+    unsigned int screen_area = (cm->width * cm->height);
+
+    vp9_init_vizier_params(twopass, screen_area);
+  }
+
+  // If this is an arf frame then we don't want to read the stats file or
   // advance the input pointer as we already have what we need.
   if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
     int target_rate;
@@ -3462,13 +3548,6 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
 
     cm->frame_type = INTER_FRAME;
 
-    // Do the firstpass stats indicate that this frame is skippable for the
-    // partition search?
-    if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2 &&
-        !cpi->use_svc) {
-      cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
-    }
-
     // The multiplication by 256 reverses a scaling factor of (>> 8)
     // applied when combining MB error values for the frame.
     twopass->mb_av_energy = log((this_frame.intra_error * 256.0) + 1.0);
@@ -3551,13 +3630,6 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
 
   vp9_configure_buffer_updates(cpi, gf_group->index);
 
-  // Do the firstpass stats indicate that this frame is skippable for the
-  // partition search?
-  if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2 &&
-      !cpi->use_svc) {
-    cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
-  }
-
   rc->base_frame_target = gf_group->bit_allocation[gf_group->index];
 
   // The multiplication by 256 reverses a scaling factor of (>> 8)
@@ -3569,9 +3641,6 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
   subtract_stats(&twopass->total_left_stats, &this_frame);
 }
 
-#define MINQ_ADJ_LIMIT 48
-#define MINQ_ADJ_LIMIT_CQ 20
-#define HIGH_UNDERSHOOT_RATIO 2
 void vp9_twopass_postencode_update(VP9_COMP *cpi) {
   TWO_PASS *const twopass = &cpi->twopass;
   RATE_CONTROL *const rc = &cpi->rc;
@@ -3711,8 +3780,7 @@ void vp9_get_next_group_of_picture(const VP9_COMP *cpi, int *first_is_key_frame,
   *first_is_key_frame = 0;
   if (rc.frames_to_key == 0) {
     rc.frames_to_key = vp9_get_frames_to_next_key(
-        &cpi->oxcf, &cpi->frame_info, &cpi->twopass.first_pass_info,
-        *first_show_idx, rc.min_gf_interval);
+        &cpi->oxcf, &cpi->twopass, *first_show_idx, rc.min_gf_interval);
     rc.frames_since_key = 0;
     *first_is_key_frame = 1;
   }
@@ -3720,18 +3788,18 @@ void vp9_get_next_group_of_picture(const VP9_COMP *cpi, int *first_is_key_frame,
   if (gop_command->use) {
     *coding_frame_count = gop_command_coding_frame_count(gop_command);
     *use_alt_ref = gop_command->use_alt_ref;
-    assert(*coding_frame_count < rc.frames_to_key);
+    assert(gop_command->show_frame_count <= rc.frames_to_key);
   } else {
     *coding_frame_count = vp9_get_gop_coding_frame_count(
-        &cpi->oxcf, &cpi->frame_info, &cpi->twopass.first_pass_info, &rc,
-        *first_show_idx, multi_layer_arf, allow_alt_ref, *first_is_key_frame,
+        &cpi->oxcf, &cpi->twopass, &cpi->frame_info, &rc, *first_show_idx,
+        multi_layer_arf, allow_alt_ref, *first_is_key_frame,
         *last_gop_use_alt_ref, use_alt_ref);
   }
 }
 
 int vp9_get_gop_coding_frame_count(const VP9EncoderConfig *oxcf,
+                                   const TWO_PASS *const twopass,
                                    const FRAME_INFO *frame_info,
-                                   const FIRST_PASS_INFO *first_pass_info,
                                    const RATE_CONTROL *rc, int show_idx,
                                    int multi_layer_arf, int allow_alt_ref,
                                    int first_is_key_frame,
@@ -3739,12 +3807,20 @@ int vp9_get_gop_coding_frame_count(const VP9EncoderConfig *oxcf,
   int frame_count;
   double gop_intra_factor;
   const int arf_active_or_kf = last_gop_use_alt_ref || first_is_key_frame;
-  RANGE active_gf_interval = get_active_gf_inverval_range(
-      frame_info, rc, arf_active_or_kf, show_idx, /*active_worst_quality=*/0,
-      /*last_boosted_qindex=*/0);
+  RANGE active_gf_interval;
+  int arf_layers;
+  int end_of_sequence = 0;
+  if (oxcf->use_simple_encode_api) {
+    active_gf_interval = get_active_gf_inverval_range_simple(
+        rc->min_gf_interval, arf_active_or_kf, rc->frames_to_key);
+  } else {
+    active_gf_interval = get_active_gf_inverval_range(
+        frame_info, rc, arf_active_or_kf, show_idx, /*active_worst_quality=*/0,
+        /*last_boosted_qindex=*/0);
+  }
 
-  const int arf_layers = get_arf_layers(multi_layer_arf, oxcf->enable_auto_arf,
-                                        active_gf_interval.max);
+  arf_layers = get_arf_layers(multi_layer_arf, oxcf->enable_auto_arf,
+                              active_gf_interval.max);
   if (multi_layer_arf) {
     gop_intra_factor = 1.0 + 0.25 * arf_layers;
   } else {
@@ -3752,8 +3828,8 @@ int vp9_get_gop_coding_frame_count(const VP9EncoderConfig *oxcf,
   }
 
   frame_count = get_gop_coding_frame_num(
-      use_alt_ref, frame_info, first_pass_info, rc, show_idx,
-      &active_gf_interval, gop_intra_factor, oxcf->lag_in_frames);
+      use_alt_ref, frame_info, twopass, rc, show_idx, &active_gf_interval,
+      gop_intra_factor, oxcf->lag_in_frames, &end_of_sequence);
   *use_alt_ref &= allow_alt_ref;
   return frame_count;
 }
@@ -3761,9 +3837,10 @@ int vp9_get_gop_coding_frame_count(const VP9EncoderConfig *oxcf,
 // Under CONFIG_RATE_CTRL, once the first_pass_info is ready, the number of
 // coding frames (including show frame and alt ref) can be determined.
 int vp9_get_coding_frame_num(const VP9EncoderConfig *oxcf,
-                             const FRAME_INFO *frame_info,
-                             const FIRST_PASS_INFO *first_pass_info,
-                             int multi_layer_arf, int allow_alt_ref) {
+                             const TWO_PASS *const twopass,
+                             const FRAME_INFO *frame_info, int multi_layer_arf,
+                             int allow_alt_ref) {
+  const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info;
   int coding_frame_num = 0;
   RATE_CONTROL rc;
   int gop_coding_frame_count;
@@ -3776,14 +3853,14 @@ int vp9_get_coding_frame_num(const VP9EncoderConfig *oxcf,
     int use_alt_ref;
     int first_is_key_frame = 0;
     if (rc.frames_to_key == 0) {
-      rc.frames_to_key = vp9_get_frames_to_next_key(
-          oxcf, frame_info, first_pass_info, show_idx, rc.min_gf_interval);
+      rc.frames_to_key = vp9_get_frames_to_next_key(oxcf, twopass, show_idx,
+                                                    rc.min_gf_interval);
       rc.frames_since_key = 0;
       first_is_key_frame = 1;
     }
 
     gop_coding_frame_count = vp9_get_gop_coding_frame_count(
-        oxcf, frame_info, first_pass_info, &rc, show_idx, multi_layer_arf,
+        oxcf, twopass, frame_info, &rc, show_idx, multi_layer_arf,
         allow_alt_ref, first_is_key_frame, last_gop_use_alt_ref, &use_alt_ref);
 
     rc.source_alt_ref_active = use_alt_ref;
@@ -3798,9 +3875,8 @@ int vp9_get_coding_frame_num(const VP9EncoderConfig *oxcf,
 }
 
 void vp9_get_key_frame_map(const VP9EncoderConfig *oxcf,
-                           const FRAME_INFO *frame_info,
-                           const FIRST_PASS_INFO *first_pass_info,
-                           int *key_frame_map) {
+                           const TWO_PASS *const twopass, int *key_frame_map) {
+  const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info;
   int show_idx = 0;
   RATE_CONTROL rc;
   vp9_rc_init(oxcf, 1, &rc);
@@ -3813,8 +3889,8 @@ void vp9_get_key_frame_map(const VP9EncoderConfig *oxcf,
   while (show_idx < first_pass_info->num_frames) {
     int key_frame_group_size;
     key_frame_map[show_idx] = 1;
-    key_frame_group_size = vp9_get_frames_to_next_key(
-        oxcf, frame_info, first_pass_info, show_idx, rc.min_gf_interval);
+    key_frame_group_size =
+        vp9_get_frames_to_next_key(oxcf, twopass, show_idx, rc.min_gf_interval);
     assert(key_frame_group_size > 0);
     show_idx += key_frame_group_size;
   }
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index b1047eab22..a19b04db74 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -14,6 +14,7 @@
 #include <assert.h>
 
 #include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/encoder/vp9_firstpass_stats.h"
 #include "vp9/encoder/vp9_lookahead.h"
 #include "vp9/encoder/vp9_ratectrl.h"
 
@@ -21,27 +22,6 @@
 extern "C" {
 #endif
 
-#if CONFIG_FP_MB_STATS
-
-#define FPMB_DCINTRA_MASK 0x01
-
-#define FPMB_MOTION_ZERO_MASK 0x02
-#define FPMB_MOTION_LEFT_MASK 0x04
-#define FPMB_MOTION_RIGHT_MASK 0x08
-#define FPMB_MOTION_UP_MASK 0x10
-#define FPMB_MOTION_DOWN_MASK 0x20
-
-#define FPMB_ERROR_SMALL_MASK 0x40
-#define FPMB_ERROR_LARGE_MASK 0x80
-#define FPMB_ERROR_SMALL_TH 2000
-#define FPMB_ERROR_LARGE_TH 48000
-
-typedef struct {
-  uint8_t *mb_stats_start;
-  uint8_t *mb_stats_end;
-} FIRSTPASS_MB_STATS;
-#endif
-
 #define INVALID_ROW (-1)
 
 #define MAX_ARF_LAYERS 6
@@ -76,37 +56,9 @@ typedef struct {
   int64_t sum_mvcs;
   int sum_in_vectors;
   int intra_smooth_count;
+  int new_mv_count;
 } FIRSTPASS_DATA;
 
-typedef struct {
-  double frame;
-  double weight;
-  double intra_error;
-  double coded_error;
-  double sr_coded_error;
-  double frame_noise_energy;
-  double pcnt_inter;
-  double pcnt_motion;
-  double pcnt_second_ref;
-  double pcnt_neutral;
-  double pcnt_intra_low;   // Coded intra but low variance
-  double pcnt_intra_high;  // Coded intra high variance
-  double intra_skip_pct;
-  double intra_smooth_pct;    // % of blocks that are smooth
-  double inactive_zone_rows;  // Image mask rows top and bottom.
-  double inactive_zone_cols;  // Image mask columns at left and right edges.
-  double MVr;
-  double mvr_abs;
-  double MVc;
-  double mvc_abs;
-  double MVrv;
-  double MVcv;
-  double mv_in_out_count;
-  double duration;
-  double count;
-  int64_t spatial_layer_id;
-} FIRSTPASS_STATS;
-
 typedef enum {
   KF_UPDATE = 0,
   LF_UPDATE = 1,
@@ -188,12 +140,6 @@ typedef struct {
   double mb_av_energy;
   double mb_smooth_pct;
 
-#if CONFIG_FP_MB_STATS
-  uint8_t *frame_mb_stats_buf;
-  uint8_t *this_frame_mb_stats;
-  FIRSTPASS_MB_STATS firstpass_mb_stats;
-#endif
-
   FP_MB_FLOAT_STATS *fp_mb_float_stats;
 
   // An indication of the content type of the current frame
@@ -221,6 +167,24 @@ typedef struct {
   int last_qindex_of_arf_layer[MAX_ARF_LAYERS];
 
   GF_GROUP gf_group;
+
+  // Vizeir project experimental two pass rate control parameters.
+  // When |use_vizier_rc_params| is 1, the following parameters will
+  // be overwritten by pass in values. Otherwise, they are initialized
+  // by default values.
+  int use_vizier_rc_params;
+  double active_wq_factor;
+  double err_per_mb;
+  double sr_default_decay_limit;
+  double sr_diff_factor;
+  double kf_err_per_mb;
+  double kf_frame_min_boost;
+  double kf_frame_max_boost_first;  // Max for first kf in a chunk.
+  double kf_frame_max_boost_subs;   // Max for subsequent mid chunk kfs.
+  double kf_max_total_boost;
+  double gf_max_total_boost;
+  double gf_frame_max_boost;
+  double zm_factor;
 } TWO_PASS;
 
 struct VP9_COMP;
@@ -239,6 +203,7 @@ void vp9_first_pass_encode_tile_mb_row(struct VP9_COMP *cpi,
 
 void vp9_init_second_pass(struct VP9_COMP *cpi);
 void vp9_rc_get_second_pass_params(struct VP9_COMP *cpi);
+void vp9_init_vizier_params(TWO_PASS *const twopass, int screen_area);
 
 // Post encode update of the rate control parameters for 2-pass
 void vp9_twopass_postencode_update(struct VP9_COMP *cpi);
@@ -248,9 +213,8 @@ void calculate_coded_size(struct VP9_COMP *cpi, int *scaled_frame_width,
 
 struct VP9EncoderConfig;
 int vp9_get_frames_to_next_key(const struct VP9EncoderConfig *oxcf,
-                               const FRAME_INFO *frame_info,
-                               const FIRST_PASS_INFO *first_pass_info,
-                               int kf_show_idx, int min_gf_interval);
+                               const TWO_PASS *const twopass, int kf_show_idx,
+                               int min_gf_interval);
 #if CONFIG_RATE_CTRL
 /* Call this function to get info about the next group of pictures.
  * This function should be called after vp9_create_compressor() when encoding
@@ -265,8 +229,8 @@ void vp9_get_next_group_of_picture(const struct VP9_COMP *cpi,
 /*!\brief Call this function before coding a new group of pictures to get
  * information about it.
  * \param[in] oxcf                 Encoder config
+ * \param[in] twopass              Twopass info
  * \param[in] frame_info           Frame info
- * \param[in] first_pass_info      First pass stats
  * \param[in] rc                   Rate control state
  * \param[in] show_idx             Show index of the first frame in the group
  * \param[in] multi_layer_arf      Is multi-layer alternate reference used
@@ -279,27 +243,25 @@ void vp9_get_next_group_of_picture(const struct VP9_COMP *cpi,
  * \return Returns coding frame count
  */
 int vp9_get_gop_coding_frame_count(const struct VP9EncoderConfig *oxcf,
+                                   const TWO_PASS *const twopass,
                                    const FRAME_INFO *frame_info,
-                                   const FIRST_PASS_INFO *first_pass_info,
                                    const RATE_CONTROL *rc, int show_idx,
                                    int multi_layer_arf, int allow_alt_ref,
                                    int first_is_key_frame,
                                    int last_gop_use_alt_ref, int *use_alt_ref);
 
 int vp9_get_coding_frame_num(const struct VP9EncoderConfig *oxcf,
-                             const FRAME_INFO *frame_info,
-                             const FIRST_PASS_INFO *first_pass_info,
-                             int multi_layer_arf, int allow_alt_ref);
+                             const TWO_PASS *const twopass,
+                             const FRAME_INFO *frame_info, int multi_layer_arf,
+                             int allow_alt_ref);
 
 /*!\brief Compute a key frame binary map indicates whether key frames appear
  * in the corresponding positions. The passed in key_frame_map must point to an
- * integer array with length equal to first_pass_info->num_frames, which is the
- * number of show frames in the video.
+ * integer array with length equal to twopass->first_pass_info.num_frames,
+ * which is the number of show frames in the video.
  */
 void vp9_get_key_frame_map(const struct VP9EncoderConfig *oxcf,
-                           const FRAME_INFO *frame_info,
-                           const FIRST_PASS_INFO *first_pass_info,
-                           int *key_frame_map);
+                           const TWO_PASS *const twopass, int *key_frame_map);
 #endif  // CONFIG_RATE_CTRL
 
 FIRSTPASS_STATS vp9_get_frame_stats(const TWO_PASS *twopass);
diff --git a/vp9/encoder/vp9_firstpass_stats.h b/vp9/encoder/vp9_firstpass_stats.h
new file mode 100644
index 0000000000..01928e7816
--- /dev/null
+++ b/vp9/encoder/vp9_firstpass_stats.h
@@ -0,0 +1,54 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_FIRSTPASS_STATS_H_
+#define VPX_VP9_ENCODER_VP9_FIRSTPASS_STATS_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+  double frame;
+  double weight;
+  double intra_error;
+  double coded_error;
+  double sr_coded_error;
+  double frame_noise_energy;
+  double pcnt_inter;
+  double pcnt_motion;
+  double pcnt_second_ref;
+  double pcnt_neutral;
+  double pcnt_intra_low;   // Coded intra but low variance
+  double pcnt_intra_high;  // Coded intra high variance
+  double intra_skip_pct;
+  double intra_smooth_pct;    // % of blocks that are smooth
+  double inactive_zone_rows;  // Image mask rows top and bottom.
+  double inactive_zone_cols;  // Image mask columns at left and right edges.
+  double MVr;
+  double mvr_abs;
+  double MVc;
+  double mvc_abs;
+  double MVrv;
+  double MVcv;
+  double mv_in_out_count;
+  double duration;
+  double count;
+  double new_mv_count;
+  int64_t spatial_layer_id;
+} FIRSTPASS_STATS;
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_VP9_ENCODER_VP9_FIRSTPASS_STATS_H_
diff --git a/vp9/encoder/vp9_frame_scale.c b/vp9/encoder/vp9_frame_scale.c
index a410d0407f..c74d523246 100644
--- a/vp9/encoder/vp9_frame_scale.c
+++ b/vp9/encoder/vp9_frame_scale.c
@@ -9,9 +9,12 @@
  */
 
 #include "./vp9_rtcd.h"
+#include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "./vpx_scale_rtcd.h"
 #include "vp9/common/vp9_blockd.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vpx/vpx_codec.h"
 #include "vpx_dsp/vpx_filter.h"
 #include "vpx_scale/yv12config.h"
 
@@ -91,6 +94,23 @@ void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src,
   {
     const int dst_w = dst->y_crop_width;
     const int dst_h = dst->y_crop_height;
+
+    // The issue b/311394513 reveals a corner case bug. vpx_scaled_2d() requires
+    // both x_step_q4 and y_step_q4 are less than or equal to 64. Otherwise, it
+    // needs to call vp9_scale_and_extend_frame_nonnormative() that supports
+    // arbitrary scaling.
+    const int x_step_q4 = 16 * src_w / dst_w;
+    const int y_step_q4 = 16 * src_h / dst_h;
+    if (x_step_q4 > 64 || y_step_q4 > 64) {
+      // This function is only called while cm->bit_depth is VPX_BITS_8.
+#if CONFIG_VP9_HIGHBITDEPTH
+      vp9_scale_and_extend_frame_nonnormative(src, dst, (int)VPX_BITS_8);
+#else
+      vp9_scale_and_extend_frame_nonnormative(src, dst);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      return;
+    }
+
     for (i = 0; i < MAX_MB_PLANE; ++i) {
       const int factor = (i == 0 || i == 3 ? 1 : 2);
       const int src_stride = src_strides[i];
diff --git a/vp9/encoder/vp9_lookahead.h b/vp9/encoder/vp9_lookahead.h
index dbbe3af584..6ac6736673 100644
--- a/vp9/encoder/vp9_lookahead.h
+++ b/vp9/encoder/vp9_lookahead.h
@@ -82,15 +82,11 @@ int vp9_lookahead_next_show_idx(const struct lookahead_ctx *ctx);
  * This function will copy the source image into a new framebuffer with
  * the expected stride/border.
  *
- * If active_map is non-NULL and there is only one frame in the queue, then copy
- * only active macroblocks.
- *
  * \param[in] ctx         Pointer to the lookahead context
  * \param[in] src         Pointer to the image to enqueue
  * \param[in] ts_start    Timestamp for the start of this frame
  * \param[in] ts_end      Timestamp for the end of this frame
  * \param[in] flags       Flags set on this frame
- * \param[in] active_map  Map that specifies which macroblock is active
  */
 int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
                        int64_t ts_start, int64_t ts_end, int use_highbitdepth,
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index 831c79c175..2f20a8fe6d 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -98,8 +98,7 @@ static int do_16x16_motion_search(VP9_COMP *cpi, const MV *ref_mv,
   // If the current best reference mv is not centered on 0,0 then do a 0,0
   // based search as well.
   if (ref_mv->row != 0 || ref_mv->col != 0) {
-    unsigned int tmp_err;
-    MV zero_ref_mv = { 0, 0 }, tmp_mv;
+    MV zero_ref_mv = { 0, 0 };
 
     tmp_err =
         do_16x16_motion_iteration(cpi, &zero_ref_mv, &tmp_mv, mb_row, mb_col);
@@ -219,7 +218,7 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi,
   VP9_COMMON *const cm = &cpi->common;
 
   int mb_col, mb_row, offset = 0;
-  int mb_y_offset = 0, arf_y_offset = 0, gld_y_offset = 0;
+  int mb_y_offset = 0;
   MV gld_top_mv = { 0, 0 };
   MODE_INFO mi_local;
   MODE_INFO mi_above, mi_left;
@@ -238,13 +237,11 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi,
   xd->mi[0] = &mi_local;
   mi_local.sb_type = BLOCK_16X16;
   mi_local.ref_frame[0] = LAST_FRAME;
-  mi_local.ref_frame[1] = NONE;
+  mi_local.ref_frame[1] = NO_REF_FRAME;
 
   for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
     MV gld_left_mv = gld_top_mv;
     int mb_y_in_offset = mb_y_offset;
-    int arf_y_in_offset = arf_y_offset;
-    int gld_y_in_offset = gld_y_offset;
 
     // Set up limit values for motion vectors to prevent them extending outside
     // the UMV borders.
@@ -266,8 +263,6 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi,
       xd->left_mi = &mi_left;
 
       mb_y_in_offset += 16;
-      gld_y_in_offset += 16;
-      arf_y_in_offset += 16;
       x->mv_limits.col_min -= 16;
       x->mv_limits.col_max -= 16;
     }
@@ -276,8 +271,6 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi,
     xd->above_mi = &mi_above;
 
     mb_y_offset += buf->y_stride * 16;
-    gld_y_offset += golden_ref->y_stride * 16;
-    if (alt_ref) arf_y_offset += alt_ref->y_stride * 16;
     x->mv_limits.row_min -= 16;
     x->mv_limits.row_max -= 16;
     offset += cm->mb_cols;
@@ -295,7 +288,7 @@ static void separate_arf_mbs(VP9_COMP *cpi) {
   int *arf_not_zz;
 
   CHECK_MEM_ERROR(
-      cm, arf_not_zz,
+      &cm->error, arf_not_zz,
       vpx_calloc(cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz), 1));
 
   // We are not interested in results beyond the alt ref itself.
@@ -340,23 +333,16 @@ static void separate_arf_mbs(VP9_COMP *cpi) {
     }
   }
 
-  // Only bother with segmentation if over 10% of the MBs in static segment
-  // if ( ncnt[1] && (ncnt[0] / ncnt[1] < 10) )
-  if (1) {
-    // Note % of blocks that are marked as static
-    if (cm->MBs)
-      cpi->static_mb_pct = (ncnt[1] * 100) / (cm->mi_rows * cm->mi_cols);
+  // Note % of blocks that are marked as static
+  if (cm->MBs)
+    cpi->static_mb_pct = (ncnt[1] * 100) / (cm->mi_rows * cm->mi_cols);
 
-    // This error case should not be reachable as this function should
-    // never be called with the common data structure uninitialized.
-    else
-      cpi->static_mb_pct = 0;
-
-    vp9_enable_segmentation(&cm->seg);
-  } else {
+  // This error case should not be reachable as this function should
+  // never be called with the common data structure uninitialized.
+  else
     cpi->static_mb_pct = 0;
-    vp9_disable_segmentation(&cm->seg);
-  }
+
+  vp9_enable_segmentation(&cm->seg);
 
   // Free localy allocated storage
   vpx_free(arf_not_zz);
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index ac29f36ec1..cbe1c40290 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -77,14 +77,6 @@ int vp9_init_search_range(int size) {
   return sr;
 }
 
-static INLINE int mv_cost(const MV *mv, const int *joint_cost,
-                          int *const comp_cost[2]) {
-  assert(mv->row >= -MV_MAX && mv->row < MV_MAX);
-  assert(mv->col >= -MV_MAX && mv->col < MV_MAX);
-  return joint_cost[vp9_get_mv_joint(mv)] + comp_cost[0][mv->row] +
-         comp_cost[1][mv->col];
-}
-
 int vp9_mv_bit_cost(const MV *mv, const MV *ref, const int *mvjcost,
                     int *mvcost[2], int weight) {
   const MV diff = { mv->row - ref->row, mv->col - ref->col };
@@ -103,15 +95,6 @@ static int mv_err_cost(const MV *mv, const MV *ref, const int *mvjcost,
   }
   return 0;
 }
-
-static int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, const MV *ref,
-                          int sad_per_bit) {
-  const MV diff = { mv->row - ref->row, mv->col - ref->col };
-  return ROUND_POWER_OF_TWO(
-      (unsigned)mv_cost(&diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit,
-      VP9_PROB_COST_SHIFT);
-}
-
 void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride) {
   int len;
   int ss_count = 0;
@@ -159,59 +142,65 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
 
 #if CONFIG_VP9_HIGHBITDEPTH
 /* checks if (r, c) has better score than previous best */
-#define CHECK_BETTER(v, r, c)                                                \
-  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                    \
-    int64_t tmpmse;                                                          \
-    const MV mv = { r, c };                                                  \
-    const MV ref_mv = { rr, rc };                                            \
-    if (second_pred == NULL) {                                               \
-      thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z,  \
-                         src_stride, &sse);                                  \
-    } else {                                                                 \
-      thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
-                          src_stride, &sse, second_pred);                    \
-    }                                                                        \
-    tmpmse = thismse;                                                        \
-    tmpmse += mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit);     \
-    if (tmpmse >= INT_MAX) {                                                 \
-      v = INT_MAX;                                                           \
-    } else if ((v = (uint32_t)tmpmse) < besterr) {                           \
-      besterr = v;                                                           \
-      br = r;                                                                \
-      bc = c;                                                                \
-      *distortion = thismse;                                                 \
-      *sse1 = sse;                                                           \
-    }                                                                        \
-  } else {                                                                   \
-    v = INT_MAX;                                                             \
-  }
+#define CHECK_BETTER(v, r, c)                                                  \
+  do {                                                                         \
+    if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                    \
+      int64_t tmpmse;                                                          \
+      const MV cb_mv = { r, c };                                               \
+      const MV cb_ref_mv = { rr, rc };                                         \
+      if (second_pred == NULL) {                                               \
+        thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z,  \
+                           src_stride, &sse);                                  \
+      } else {                                                                 \
+        thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
+                            src_stride, &sse, second_pred);                    \
+      }                                                                        \
+      tmpmse = thismse;                                                        \
+      tmpmse +=                                                                \
+          mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, error_per_bit);     \
+      if (tmpmse >= INT_MAX) {                                                 \
+        v = INT_MAX;                                                           \
+      } else if ((v = (uint32_t)tmpmse) < besterr) {                           \
+        besterr = v;                                                           \
+        br = r;                                                                \
+        bc = c;                                                                \
+        *distortion = thismse;                                                 \
+        *sse1 = sse;                                                           \
+      }                                                                        \
+    } else {                                                                   \
+      v = INT_MAX;                                                             \
+    }                                                                          \
+  } while (0)
 #else
 /* checks if (r, c) has better score than previous best */
-#define CHECK_BETTER(v, r, c)                                                \
-  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                    \
-    const MV mv = { r, c };                                                  \
-    const MV ref_mv = { rr, rc };                                            \
-    if (second_pred == NULL)                                                 \
-      thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z,  \
-                         src_stride, &sse);                                  \
-    else                                                                     \
-      thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
-                          src_stride, &sse, second_pred);                    \
-    if ((v = mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit) +     \
-             thismse) < besterr) {                                           \
-      besterr = v;                                                           \
-      br = r;                                                                \
-      bc = c;                                                                \
-      *distortion = thismse;                                                 \
-      *sse1 = sse;                                                           \
-    }                                                                        \
-  } else {                                                                   \
-    v = INT_MAX;                                                             \
-  }
+#define CHECK_BETTER(v, r, c)                                                  \
+  do {                                                                         \
+    if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                    \
+      const MV cb_mv = { r, c };                                               \
+      const MV cb_ref_mv = { rr, rc };                                         \
+      if (second_pred == NULL)                                                 \
+        thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z,  \
+                           src_stride, &sse);                                  \
+      else                                                                     \
+        thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
+                            src_stride, &sse, second_pred);                    \
+      if ((v = mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost,                \
+                           error_per_bit) +                                    \
+               thismse) < besterr) {                                           \
+        besterr = v;                                                           \
+        br = r;                                                                \
+        bc = c;                                                                \
+        *distortion = thismse;                                                 \
+        *sse1 = sse;                                                           \
+      }                                                                        \
+    } else {                                                                   \
+      v = INT_MAX;                                                             \
+    }                                                                          \
+  } while (0)
 
 #endif
 #define FIRST_LEVEL_CHECKS                                       \
-  {                                                              \
+  do {                                                           \
     unsigned int left, right, up, down, diag;                    \
     CHECK_BETTER(left, tr, tc - hstep);                          \
     CHECK_BETTER(right, tr, tc + hstep);                         \
@@ -224,10 +213,10 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
       case 2: CHECK_BETTER(diag, tr + hstep, tc - hstep); break; \
       case 3: CHECK_BETTER(diag, tr + hstep, tc + hstep); break; \
     }                                                            \
-  }
+  } while (0)
 
 #define SECOND_LEVEL_CHECKS                                       \
-  {                                                               \
+  do {                                                            \
     int kr, kc;                                                   \
     unsigned int second;                                          \
     if (tr != br && tc != bc) {                                   \
@@ -256,7 +245,7 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
         case 3: CHECK_BETTER(second, tr + kr, tc - hstep); break; \
       }                                                           \
     }                                                             \
-  }
+  } while (0)
 
 #define SETUP_SUBPEL_SEARCH                                                 \
   const uint8_t *const z = x->plane[0].src.buf;                             \
@@ -290,7 +279,7 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
   maxr = subpel_mv_limits.row_max;                                          \
                                                                             \
   bestmv->row *= 8;                                                         \
-  bestmv->col *= 8;
+  bestmv->col *= 8
 
 static unsigned int setup_center_error(
     const MACROBLOCKD *xd, const MV *bestmv, const MV *ref_mv,
@@ -308,7 +297,7 @@ static unsigned int setup_center_error(
       besterr =
           vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride, sse1);
     } else {
-      DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+      DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]);
       vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
       besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
     }
@@ -323,7 +312,7 @@ static unsigned int setup_center_error(
   uint32_t besterr;
   (void)xd;
   if (second_pred != NULL) {
-    DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+    DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]);
     vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
     besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
   } else {
@@ -646,7 +635,7 @@ static int accurate_sub_pel_search(
     vp9_build_inter_predictor(pre_address, y_stride, pred, w, this_mv, sf, w, h,
                               0, kernel, MV_PRECISION_Q3, 0, 0);
     if (second_pred != NULL) {
-      DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+      DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]);
       vpx_comp_avg_pred(comp_pred, second_pred, w, h, pred, w);
       besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse);
     } else {
@@ -665,7 +654,7 @@ static int accurate_sub_pel_search(
   vp9_build_inter_predictor(pre_address, y_stride, pred, w, this_mv, sf, w, h,
                             0, kernel, MV_PRECISION_Q3, 0, 0);
   if (second_pred != NULL) {
-    DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+    DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]);
     vpx_comp_avg_pred(comp_pred, second_pred, w, h, pred, w);
     besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse);
   } else {
@@ -678,48 +667,54 @@ static int accurate_sub_pel_search(
 // TODO(yunqing): this part can be further refactored.
 #if CONFIG_VP9_HIGHBITDEPTH
 /* checks if (r, c) has better score than previous best */
-#define CHECK_BETTER1(v, r, c)                                                 \
-  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                      \
-    int64_t tmpmse;                                                            \
-    const MV mv = { r, c };                                                    \
-    const MV ref_mv = { rr, rc };                                              \
-    thismse =                                                                  \
-        accurate_sub_pel_search(xd, &mv, x->me_sf, kernel, vfp, z, src_stride, \
-                                y, y_stride, second_pred, w, h, &sse);         \
-    tmpmse = thismse;                                                          \
-    tmpmse += mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit);       \
-    if (tmpmse >= INT_MAX) {                                                   \
-      v = INT_MAX;                                                             \
-    } else if ((v = (uint32_t)tmpmse) < besterr) {                             \
-      besterr = v;                                                             \
-      br = r;                                                                  \
-      bc = c;                                                                  \
-      *distortion = thismse;                                                   \
-      *sse1 = sse;                                                             \
-    }                                                                          \
-  } else {                                                                     \
-    v = INT_MAX;                                                               \
-  }
+#define CHECK_BETTER1(v, r, c)                                                \
+  do {                                                                        \
+    if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                   \
+      int64_t tmpmse;                                                         \
+      const MV cb_mv = { r, c };                                              \
+      const MV cb_ref_mv = { rr, rc };                                        \
+      thismse = accurate_sub_pel_search(xd, &cb_mv, x->me_sf, kernel, vfp, z, \
+                                        src_stride, y, y_stride, second_pred, \
+                                        w, h, &sse);                          \
+      tmpmse = thismse;                                                       \
+      tmpmse +=                                                               \
+          mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, error_per_bit);    \
+      if (tmpmse >= INT_MAX) {                                                \
+        v = INT_MAX;                                                          \
+      } else if ((v = (uint32_t)tmpmse) < besterr) {                          \
+        besterr = v;                                                          \
+        br = r;                                                               \
+        bc = c;                                                               \
+        *distortion = thismse;                                                \
+        *sse1 = sse;                                                          \
+      }                                                                       \
+    } else {                                                                  \
+      v = INT_MAX;                                                            \
+    }                                                                         \
+  } while (0)
 #else
 /* checks if (r, c) has better score than previous best */
-#define CHECK_BETTER1(v, r, c)                                                 \
-  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                      \
-    const MV mv = { r, c };                                                    \
-    const MV ref_mv = { rr, rc };                                              \
-    thismse =                                                                  \
-        accurate_sub_pel_search(xd, &mv, x->me_sf, kernel, vfp, z, src_stride, \
-                                y, y_stride, second_pred, w, h, &sse);         \
-    if ((v = mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit) +       \
-             thismse) < besterr) {                                             \
-      besterr = v;                                                             \
-      br = r;                                                                  \
-      bc = c;                                                                  \
-      *distortion = thismse;                                                   \
-      *sse1 = sse;                                                             \
-    }                                                                          \
-  } else {                                                                     \
-    v = INT_MAX;                                                               \
-  }
+#define CHECK_BETTER1(v, r, c)                                                \
+  do {                                                                        \
+    if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                   \
+      const MV cb_mv = { r, c };                                              \
+      const MV cb_ref_mv = { rr, rc };                                        \
+      thismse = accurate_sub_pel_search(xd, &cb_mv, x->me_sf, kernel, vfp, z, \
+                                        src_stride, y, y_stride, second_pred, \
+                                        w, h, &sse);                          \
+      if ((v = mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost,               \
+                           error_per_bit) +                                   \
+               thismse) < besterr) {                                          \
+        besterr = v;                                                          \
+        br = r;                                                               \
+        bc = c;                                                               \
+        *distortion = thismse;                                                \
+        *sse1 = sse;                                                          \
+      }                                                                       \
+    } else {                                                                  \
+      v = INT_MAX;                                                            \
+    }                                                                         \
+  } while (0)
 
 #endif
 
@@ -958,7 +953,7 @@ static INLINE int is_mv_in(const MvLimits *mv_limits, const MV *mv) {
   }
 
 #define MAX_PATTERN_SCALES 11
-#define MAX_PATTERN_CANDIDATES 8  // max number of canddiates per scale
+#define MAX_PATTERN_CANDIDATES 8  // max number of candidates per scale
 #define PATTERN_CANDIDATES_REF 3  // number of refinement candidates
 
 // Calculate and return a sad+mvcost list around an integer best pel.
@@ -972,16 +967,14 @@ static INLINE void calc_int_cost_list(const MACROBLOCK *x, const MV *ref_mv,
   const MV fcenter_mv = { ref_mv->row >> 3, ref_mv->col >> 3 };
   int br = best_mv->row;
   int bc = best_mv->col;
-  MV this_mv;
+  const MV mv = { br, bc };
   int i;
   unsigned int sse;
 
-  this_mv.row = br;
-  this_mv.col = bc;
   cost_list[0] =
-      fn_ptr->vf(what->buf, what->stride, get_buf_from_mv(in_what, &this_mv),
+      fn_ptr->vf(what->buf, what->stride, get_buf_from_mv(in_what, &mv),
                  in_what->stride, &sse) +
-      mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb);
+      mvsad_err_cost(x, &mv, &fcenter_mv, sadpb);
   if (check_bounds(&x->mv_limits, br, bc, 1)) {
     for (i = 0; i < 4; i++) {
       const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col };
@@ -1041,7 +1034,7 @@ static int vp9_pattern_search(
                      in_what->stride) +
             mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
 
-  // Search all possible scales upto the search param around the center point
+  // Search all possible scales up to the search param around the center point
   // pick the scale of the point that is best as the starting scale of
   // further steps around it.
   if (do_init_search) {
@@ -1162,6 +1155,9 @@ static int vp9_pattern_search(
     } while (s--);
   }
 
+  best_mv->row = br;
+  best_mv->col = bc;
+
   // Returns the one-away integer pel sad values around the best as follows:
   // cost_list[0]: cost at the best integer pel
   // cost_list[1]: cost at delta {0, -1} (left)   from the best integer pel
@@ -1169,11 +1165,8 @@ static int vp9_pattern_search(
   // cost_list[3]: cost at delta { 0, 1} (right)  from the best integer pel
   // cost_list[4]: cost at delta {-1, 0} (top)    from the best integer pel
   if (cost_list) {
-    const MV best_mv = { br, bc };
-    calc_int_cost_list(x, &fcenter_mv, sad_per_bit, vfp, &best_mv, cost_list);
+    calc_int_cost_list(x, &fcenter_mv, sad_per_bit, vfp, best_mv, cost_list);
   }
-  best_mv->row = br;
-  best_mv->col = bc;
   return bestsad;
 }
 
@@ -1215,7 +1208,7 @@ static int vp9_pattern_search_sad(
                      in_what->stride) +
             mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
 
-  // Search all possible scales upto the search param around the center point
+  // Search all possible scales up to the search param around the center point
   // pick the scale of the point that is best as the starting scale of
   // further steps around it.
   if (do_init_search) {
@@ -1788,29 +1781,6 @@ static int64_t exhaustive_mesh_search_single_step(
   end_col = VPXMIN(center_mv->col + range, mv_limits->col_max);
   for (r = start_row; r <= end_row; r += 1) {
     c = start_col;
-    // sdx8f may not be available some block size
-    if (fn_ptr->sdx8f) {
-      while (c + 7 <= end_col) {
-        unsigned int sads[8];
-        const MV mv = { r, c };
-        const uint8_t *buf = get_buf_from_mv(pre, &mv);
-        fn_ptr->sdx8f(src->buf, src->stride, buf, pre->stride, sads);
-
-        for (i = 0; i < 8; ++i) {
-          int64_t sad = (int64_t)sads[i] << LOG2_PRECISION;
-          if (sad < best_sad) {
-            const MV mv = { r, c + i };
-            sad += lambda *
-                   vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num);
-            if (sad < best_sad) {
-              best_sad = sad;
-              *best_mv = mv;
-            }
-          }
-        }
-        c += 8;
-      }
-    }
     while (c + 3 <= end_col) {
       unsigned int sads[4];
       const uint8_t *addrs[4];
@@ -2083,9 +2053,9 @@ int vp9_prepare_nb_full_mvs(const MotionField *motion_field, int mi_row,
 #endif  // CONFIG_NON_GREEDY_MV
 
 int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg,
-                             MV *ref_mv, MV *best_mv, int search_param,
-                             int sad_per_bit, int *num00,
-                             const vp9_variance_fn_ptr_t *fn_ptr,
+                             MV *ref_mv, uint32_t start_mv_sad, MV *best_mv,
+                             int search_param, int sad_per_bit, int *num00,
+                             const vp9_sad_fn_ptr_t *sad_fn_ptr,
                              const MV *center_mv) {
   int i, j, step;
 
@@ -2096,7 +2066,7 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg,
   const int in_what_stride = xd->plane[0].pre[0].stride;
   const uint8_t *best_address;
 
-  unsigned int bestsad = INT_MAX;
+  unsigned int bestsad = start_mv_sad;
   int best_site = -1;
   int last_site = -1;
 
@@ -2114,8 +2084,6 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg,
   const int tot_steps = cfg->total_steps - search_param;
 
   const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
-  clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max,
-           x->mv_limits.row_min, x->mv_limits.row_max);
   ref_row = ref_mv->row;
   ref_col = ref_mv->col;
   *num00 = 0;
@@ -2126,10 +2094,6 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg,
   in_what = xd->plane[0].pre[0].buf + ref_row * in_what_stride + ref_col;
   best_address = in_what;
 
-  // Check the starting position
-  bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride) +
-            mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
-
   i = 0;
 
   for (step = 0; step < tot_steps; step++) {
@@ -2153,8 +2117,8 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg,
 
         for (t = 0; t < 4; t++) block_offset[t] = ss_os[i + t] + best_address;
 
-        fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride,
-                       sad_array);
+        sad_fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride,
+                           sad_array);
 
         for (t = 0; t < 4; t++, i++) {
           if (sad_array[t] < bestsad) {
@@ -2178,7 +2142,7 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg,
         if (is_mv_in(&x->mv_limits, &this_mv)) {
           const uint8_t *const check_here = ss_os[i] + best_address;
           unsigned int thissad =
-              fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
+              sad_fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
 
           if (thissad < bestsad) {
             thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
@@ -2336,17 +2300,16 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
   // TODO(jingning): Implement integral projection functions for high bit-depth
   // setting and remove this part of code.
   if (xd->bd != 8) {
-    unsigned int this_sad;
+    const unsigned int sad = cpi->fn_ptr[bsize].sdf(
+        x->plane[0].src.buf, src_stride, xd->plane[0].pre[0].buf, ref_stride);
     tmp_mv->row = 0;
     tmp_mv->col = 0;
-    this_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
-                                      xd->plane[0].pre[0].buf, ref_stride);
 
     if (scaled_ref_frame) {
       int i;
       for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
     }
-    return this_sad;
+    return sad;
   }
 #endif
 
@@ -2521,15 +2484,54 @@ int vp9_full_pixel_diamond_new(const VP9_COMP *cpi, MACROBLOCK *x,
               point as the best match, we will do a final 1-away diamond
               refining search  */
 static int full_pixel_diamond(const VP9_COMP *const cpi,
-                              const MACROBLOCK *const x, MV *mvp_full,
-                              int step_param, int sadpb, int further_steps,
-                              int do_refine, int *cost_list,
+                              const MACROBLOCK *const x, BLOCK_SIZE bsize,
+                              MV *mvp_full, int step_param, int sadpb,
+                              int further_steps, int do_refine,
+                              int use_downsampled_sad, int *cost_list,
                               const vp9_variance_fn_ptr_t *fn_ptr,
                               const MV *ref_mv, MV *dst_mv) {
   MV temp_mv;
   int thissme, n, num00 = 0;
-  int bestsme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv,
-                                        step_param, sadpb, &n, fn_ptr, ref_mv);
+  int bestsme;
+  const int src_buf_stride = x->plane[0].src.stride;
+  const uint8_t *const src_buf = x->plane[0].src.buf;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const int pred_buf_stride = xd->plane[0].pre[0].stride;
+  uint8_t *pred_buf;
+  vp9_sad_fn_ptr_t sad_fn_ptr;
+  unsigned int start_mv_sad, start_mv_sad_even_rows, start_mv_sad_odd_rows;
+  const MV ref_mv_full = { ref_mv->row >> 3, ref_mv->col >> 3 };
+  clamp_mv(mvp_full, x->mv_limits.col_min, x->mv_limits.col_max,
+           x->mv_limits.row_min, x->mv_limits.row_max);
+
+  pred_buf =
+      xd->plane[0].pre[0].buf + mvp_full->row * pred_buf_stride + mvp_full->col;
+  start_mv_sad_even_rows =
+      fn_ptr->sdsf(src_buf, src_buf_stride, pred_buf, pred_buf_stride);
+  start_mv_sad_odd_rows =
+      fn_ptr->sdsf(src_buf + src_buf_stride, src_buf_stride,
+                   pred_buf + pred_buf_stride, pred_buf_stride);
+  start_mv_sad = (start_mv_sad_even_rows + start_mv_sad_odd_rows) >> 1;
+  start_mv_sad += mvsad_err_cost(x, mvp_full, &ref_mv_full, sadpb);
+
+  sad_fn_ptr.sdf = fn_ptr->sdf;
+  sad_fn_ptr.sdx4df = fn_ptr->sdx4df;
+  if (use_downsampled_sad && num_4x4_blocks_high_lookup[bsize] >= 2) {
+    // If the absolute difference between the pred-to-src SAD of even rows and
+    // the pred-to-src SAD of odd rows is small, skip every other row in sad
+    // computation.
+    const int odd_to_even_diff_sad =
+        abs((int)start_mv_sad_even_rows - (int)start_mv_sad_odd_rows);
+    const int mult_thresh = 10;
+    if (odd_to_even_diff_sad * mult_thresh < (int)start_mv_sad_even_rows) {
+      sad_fn_ptr.sdf = fn_ptr->sdsf;
+      sad_fn_ptr.sdx4df = fn_ptr->sdsx4df;
+    }
+  }
+
+  bestsme =
+      cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, start_mv_sad, &temp_mv,
+                              step_param, sadpb, &n, &sad_fn_ptr, ref_mv);
   if (bestsme < INT_MAX)
     bestsme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
   *dst_mv = temp_mv;
@@ -2544,9 +2546,9 @@ static int full_pixel_diamond(const VP9_COMP *const cpi,
     if (num00) {
       num00--;
     } else {
-      thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv,
-                                        step_param + n, sadpb, &num00, fn_ptr,
-                                        ref_mv);
+      thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, start_mv_sad,
+                                        &temp_mv, step_param + n, sadpb, &num00,
+                                        &sad_fn_ptr, ref_mv);
       if (thissme < INT_MAX)
         thissme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
 
@@ -2564,8 +2566,8 @@ static int full_pixel_diamond(const VP9_COMP *const cpi,
   if (do_refine) {
     const int search_range = 8;
     MV best_mv = *dst_mv;
-    thissme = vp9_refining_search_sad(x, &best_mv, sadpb, search_range, fn_ptr,
-                                      ref_mv);
+    thissme = vp9_refining_search_sad(x, &best_mv, sadpb, search_range,
+                                      &sad_fn_ptr, ref_mv);
     if (thissme < INT_MAX)
       thissme = vp9_get_mvpred_var(x, &best_mv, ref_mv, fn_ptr, 1);
     if (thissme < bestsme) {
@@ -2574,6 +2576,27 @@ static int full_pixel_diamond(const VP9_COMP *const cpi,
     }
   }
 
+  if (sad_fn_ptr.sdf != fn_ptr->sdf) {
+    // If we are skipping rows when we perform the motion search, we need to
+    // check the quality of skipping. If it's bad, then we run search with
+    // skip row features off.
+    const uint8_t *best_address = get_buf_from_mv(&xd->plane[0].pre[0], dst_mv);
+    const int sad =
+        fn_ptr->sdf(src_buf, src_buf_stride, best_address, pred_buf_stride);
+    const int skip_sad =
+        fn_ptr->sdsf(src_buf, src_buf_stride, best_address, pred_buf_stride);
+    // We will keep the result of skipping rows if it's good enough.
+    const int kSADThresh =
+        1 << (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+    if (sad > kSADThresh && abs(skip_sad - sad) * 10 >= VPXMAX(sad, 1) * 9) {
+      // There is a large discrepancy between skipping and not skipping, so we
+      // need to redo the motion search.
+      return full_pixel_diamond(cpi, x, bsize, mvp_full, step_param, sadpb,
+                                further_steps, do_refine, 0, cost_list, fn_ptr,
+                                ref_mv, dst_mv);
+    }
+  }
+
   // Return cost list.
   if (cost_list) {
     calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list);
@@ -2725,7 +2748,7 @@ int64_t vp9_refining_search_sad_new(const MACROBLOCK *x, MV *best_full_mv,
 
 int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit,
                             int search_range,
-                            const vp9_variance_fn_ptr_t *fn_ptr,
+                            const vp9_sad_fn_ptr_t *sad_fn_ptr,
                             const MV *center_mv) {
   const MACROBLOCKD *const xd = &x->e_mbd;
   const MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } };
@@ -2734,7 +2757,7 @@ int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit,
   const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
   const uint8_t *best_address = get_buf_from_mv(in_what, ref_mv);
   unsigned int best_sad =
-      fn_ptr->sdf(what->buf, what->stride, best_address, in_what->stride) +
+      sad_fn_ptr->sdf(what->buf, what->stride, best_address, in_what->stride) +
       mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
   int i, j;
 
@@ -2751,7 +2774,8 @@ int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit,
                                             best_address - 1, best_address + 1,
                                             best_address + in_what->stride };
 
-      fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride, sads);
+      sad_fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride,
+                         sads);
 
       for (j = 0; j < 4; ++j) {
         if (sads[j] < best_sad) {
@@ -2771,8 +2795,8 @@ int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit,
 
         if (is_mv_in(&x->mv_limits, &mv)) {
           unsigned int sad =
-              fn_ptr->sdf(what->buf, what->stride,
-                          get_buf_from_mv(in_what, &mv), in_what->stride);
+              sad_fn_ptr->sdf(what->buf, what->stride,
+                              get_buf_from_mv(in_what, &mv), in_what->stride);
           if (sad < best_sad) {
             sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
             if (sad < best_sad) {
@@ -2889,9 +2913,10 @@ int vp9_full_pixel_search(const VP9_COMP *const cpi, const MACROBLOCK *const x,
       break;
     case NSTEP:
     case MESH:
-      var = full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
-                               MAX_MVSEARCH_STEPS - 1 - step_param, 1,
-                               cost_list, fn_ptr, ref_mv, tmp_mv);
+      var = full_pixel_diamond(
+          cpi, x, bsize, mvp_full, step_param, error_per_bit,
+          MAX_MVSEARCH_STEPS - 1 - step_param, 1,
+          cpi->sf.mv.use_downsampled_sad, cost_list, fn_ptr, ref_mv, tmp_mv);
       break;
     default: assert(0 && "Unknown search method");
   }
@@ -2962,7 +2987,7 @@ int vp9_full_pixel_search(const VP9_COMP *const cpi, const MACROBLOCK *const x,
   (void)sse;           \
   (void)thismse;       \
   (void)cost_list;     \
-  (void)use_accurate_subpel_search;
+  (void)use_accurate_subpel_search
 
 // Return the maximum MV.
 uint32_t vp9_return_max_sub_pixel_mv(
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index 0c4d8f23c6..fd6a8b9aca 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -41,6 +41,11 @@ typedef struct search_site_config {
   int total_steps;
 } search_site_config;
 
+typedef struct vp9_sad_table {
+  vpx_sad_fn_t sdf;
+  vpx_sad_multi_d_fn_t sdx4df;
+} vp9_sad_fn_ptr_t;
+
 static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf,
                                              const MV *mv) {
   return &buf->buf[mv->row * buf->stride + mv->col];
@@ -63,12 +68,13 @@ int vp9_get_mvpred_av_var(const MACROBLOCK *x, const MV *best_mv,
 
 struct VP9_COMP;
 struct SPEED_FEATURES;
+struct vp9_sad_table;
 
 int vp9_init_search_range(int size);
 
 int vp9_refining_search_sad(const struct macroblock *x, struct mv *ref_mv,
                             int error_per_bit, int search_range,
-                            const struct vp9_variance_vtable *fn_ptr,
+                            const struct vp9_sad_table *sad_fn_ptr,
                             const struct mv *center_mv);
 
 // Perform integral projection based motion estimation.
@@ -93,20 +99,10 @@ extern fractional_mv_step_fp vp9_skip_sub_pixel_tree;
 extern fractional_mv_step_fp vp9_return_max_sub_pixel_mv;
 extern fractional_mv_step_fp vp9_return_min_sub_pixel_mv;
 
-typedef int (*vp9_full_search_fn_t)(const MACROBLOCK *x, const MV *ref_mv,
-                                    int sad_per_bit, int distance,
-                                    const vp9_variance_fn_ptr_t *fn_ptr,
-                                    const MV *center_mv, MV *best_mv);
-
-typedef int (*vp9_refining_search_fn_t)(const MACROBLOCK *x, MV *ref_mv,
-                                        int sad_per_bit, int distance,
-                                        const vp9_variance_fn_ptr_t *fn_ptr,
-                                        const MV *center_mv);
-
 typedef int (*vp9_diamond_search_fn_t)(
-    const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, MV *best_mv,
-    int search_param, int sad_per_bit, int *num00,
-    const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv);
+    const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv,
+    uint32_t start_mv_sad, MV *best_mv, int search_param, int sad_per_bit,
+    int *num00, const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv);
 
 int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit,
                              int search_range,
diff --git a/vp9/encoder/vp9_multi_thread.c b/vp9/encoder/vp9_multi_thread.c
index c66c035492..0843cd97e4 100644
--- a/vp9/encoder/vp9_multi_thread.c
+++ b/vp9/encoder/vp9_multi_thread.c
@@ -36,7 +36,7 @@ void *vp9_enc_grp_get_next_job(MultiThreadHandle *multi_thread_ctxt,
   pthread_mutex_lock(mutex_handle);
 #endif
   next = job_queue_hdl->next;
-  if (NULL != next) {
+  if (next != NULL) {
     JobQueue *job_queue = (JobQueue *)next;
     job_info = &job_queue->job_info;
     // Update the next job in the queue
@@ -58,9 +58,10 @@ void vp9_row_mt_alloc_rd_thresh(VP9_COMP *const cpi,
       (mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2) + 1;
   int i;
 
-  this_tile->row_base_thresh_freq_fact =
+  CHECK_MEM_ERROR(
+      &cm->error, this_tile->row_base_thresh_freq_fact,
       (int *)vpx_calloc(sb_rows * BLOCK_SIZES * MAX_MODES,
-                        sizeof(*(this_tile->row_base_thresh_freq_fact)));
+                        sizeof(*(this_tile->row_base_thresh_freq_fact))));
   for (i = 0; i < sb_rows * BLOCK_SIZES * MAX_MODES; i++)
     this_tile->row_base_thresh_freq_fact[i] = RD_THRESH_INIT_FACT;
 }
@@ -84,8 +85,8 @@ void vp9_row_mt_mem_alloc(VP9_COMP *cpi) {
   multi_thread_ctxt->allocated_tile_rows = tile_rows;
   multi_thread_ctxt->allocated_vert_unit_rows = jobs_per_tile_col;
 
-  multi_thread_ctxt->job_queue =
-      (JobQueue *)vpx_memalign(32, total_jobs * sizeof(JobQueue));
+  CHECK_MEM_ERROR(&cm->error, multi_thread_ctxt->job_queue,
+                  (JobQueue *)vpx_memalign(32, total_jobs * sizeof(JobQueue)));
 
 #if CONFIG_MULTITHREAD
   // Create mutex for each tile
diff --git a/vp9/encoder/vp9_noise_estimate.c b/vp9/encoder/vp9_noise_estimate.c
index 9696529c50..4ee6e51ba8 100644
--- a/vp9/encoder/vp9_noise_estimate.c
+++ b/vp9/encoder/vp9_noise_estimate.c
@@ -202,7 +202,7 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
                      VPXMIN(cpi->consec_zero_mv[bl_index1],
                             VPXMIN(cpi->consec_zero_mv[bl_index2],
                                    cpi->consec_zero_mv[bl_index3])));
-          // Only consider blocks that are likely steady background. i.e, have
+          // Only consider blocks that are likely steady background. i.e., have
           // been encoded as zero/low motion x (= thresh_consec_zeromv) frames
           // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all
           // 4 sub-blocks for 16x16 block. And exclude this frame if
diff --git a/vp9/encoder/vp9_non_greedy_mv.c b/vp9/encoder/vp9_non_greedy_mv.c
index 4679d6c49c..d52801c845 100644
--- a/vp9/encoder/vp9_non_greedy_mv.c
+++ b/vp9/encoder/vp9_non_greedy_mv.c
@@ -178,6 +178,7 @@ Status vp9_alloc_motion_field_info(MotionFieldInfo *motion_field_info,
   motion_field_info->frame_num = frame_num;
   motion_field_info->motion_field_array =
       vpx_calloc(frame_num, sizeof(*motion_field_info->motion_field_array));
+  if (!motion_field_info->motion_field_array) return STATUS_FAILED;
   for (frame_idx = 0; frame_idx < frame_num; ++frame_idx) {
     for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
       for (square_block_idx = 0; square_block_idx < SQUARE_BLOCK_SIZES;
@@ -422,6 +423,7 @@ void vp9_get_smooth_motion_field(const MV *search_mf,
   int row, col;
   int bw = 4 << b_width_log2_lookup[bsize];
   int bh = 4 << b_height_log2_lookup[bsize];
+  if (!(input && output)) goto fail;
   // copy search results to input buffer
   for (idx = 0; idx < rows * cols; ++idx) {
     input[idx].row = (float)search_mf[idx].row / bh;
@@ -450,6 +452,7 @@ void vp9_get_smooth_motion_field(const MV *search_mf,
     smooth_mf[idx].row = (int)(input[idx].row * bh);
     smooth_mf[idx].col = (int)(input[idx].col * bw);
   }
+fail:
   free(input);
   free(output);
 }
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 695fd484fc..6f2524b36e 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -268,6 +268,7 @@ static void block_variance(const uint8_t *src, int src_stride,
 #endif
                            uint32_t *sse8x8, int *sum8x8, uint32_t *var8x8) {
   int i, j, k = 0;
+  uint32_t k_sqr = 0;
 
   *sse = 0;
   *sum = 0;
@@ -305,7 +306,8 @@ static void block_variance(const uint8_t *src, int src_stride,
 #endif
       *sse += sse8x8[k];
       *sum += sum8x8[k];
-      var8x8[k] = sse8x8[k] - (uint32_t)(((int64_t)sum8x8[k] * sum8x8[k]) >> 6);
+      k_sqr = (uint32_t)(((int64_t)sum8x8[k] * sum8x8[k]) >> 6);
+      var8x8[k] = sse8x8[k] > k_sqr ? sse8x8[k] - k_sqr : k_sqr - sse8x8[k];
       k++;
     }
   }
@@ -319,6 +321,7 @@ static void calculate_variance(int bw, int bh, TX_SIZE tx_size,
   const int nw = 1 << (bw - b_width_log2_lookup[unit_size]);
   const int nh = 1 << (bh - b_height_log2_lookup[unit_size]);
   int i, j, k = 0;
+  uint32_t k_sqr = 0;
 
   for (i = 0; i < nh; i += 2) {
     for (j = 0; j < nw; j += 2) {
@@ -326,9 +329,10 @@ static void calculate_variance(int bw, int bh, TX_SIZE tx_size,
                  sse_i[(i + 1) * nw + j] + sse_i[(i + 1) * nw + j + 1];
       sum_o[k] = sum_i[i * nw + j] + sum_i[i * nw + j + 1] +
                  sum_i[(i + 1) * nw + j] + sum_i[(i + 1) * nw + j + 1];
-      var_o[k] = sse_o[k] - (uint32_t)(((int64_t)sum_o[k] * sum_o[k]) >>
-                                       (b_width_log2_lookup[unit_size] +
-                                        b_height_log2_lookup[unit_size] + 6));
+      k_sqr = (uint32_t)(((int64_t)sum_o[k] * sum_o[k]) >>
+                         (b_width_log2_lookup[unit_size] +
+                          b_height_log2_lookup[unit_size] + 6));
+      var_o[k] = sse_o[k] > k_sqr ? sse_o[k] - k_sqr : k_sqr - sse_o[k];
       k++;
     }
   }
@@ -452,6 +456,7 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize,
   unsigned int var8x8[64] = { 0 };
   TX_SIZE tx_size;
   int i, k;
+  uint32_t sum_sqr;
 #if CONFIG_VP9_HIGHBITDEPTH
   const vpx_bit_depth_t bd = cpi->common.bit_depth;
 #endif
@@ -463,7 +468,8 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize,
                  cpi->common.use_highbitdepth, bd,
 #endif
                  sse8x8, sum8x8, var8x8);
-  var = sse - (unsigned int)(((int64_t)sum * sum) >> (bw + bh + 4));
+  sum_sqr = (uint32_t)((int64_t)sum * sum) >> (bw + bh + 4);
+  var = sse > sum_sqr ? sse - sum_sqr : sum_sqr - sse;
 
   *var_y = var;
   *sse_y = sse;
@@ -560,23 +566,26 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize,
 
     // Transform skipping test in UV planes.
     for (i = 1; i <= 2; i++) {
-      struct macroblock_plane *const p = &x->plane[i];
-      struct macroblockd_plane *const pd = &xd->plane[i];
-      const TX_SIZE uv_tx_size = get_uv_tx_size(xd->mi[0], pd);
+      struct macroblock_plane *const p_uv = &x->plane[i];
+      struct macroblockd_plane *const pd_uv = &xd->plane[i];
+      const TX_SIZE uv_tx_size = get_uv_tx_size(xd->mi[0], pd_uv);
       const BLOCK_SIZE unit_size = txsize_to_bsize[uv_tx_size];
-      const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd);
+      const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd_uv);
       const int uv_bw = b_width_log2_lookup[uv_bsize];
       const int uv_bh = b_height_log2_lookup[uv_bsize];
       const int sf = (uv_bw - b_width_log2_lookup[unit_size]) +
                      (uv_bh - b_height_log2_lookup[unit_size]);
-      const uint32_t uv_dc_thr = pd->dequant[0] * pd->dequant[0] >> (6 - sf);
-      const uint32_t uv_ac_thr = pd->dequant[1] * pd->dequant[1] >> (6 - sf);
+      const uint32_t uv_dc_thr =
+          pd_uv->dequant[0] * pd_uv->dequant[0] >> (6 - sf);
+      const uint32_t uv_ac_thr =
+          pd_uv->dequant[1] * pd_uv->dequant[1] >> (6 - sf);
       int j = i - 1;
 
       vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i);
       flag_preduv_computed[i - 1] = 1;
-      var_uv[j] = cpi->fn_ptr[uv_bsize].vf(
-          p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse_uv[j]);
+      var_uv[j] = cpi->fn_ptr[uv_bsize].vf(p_uv->src.buf, p_uv->src.stride,
+                                           pd_uv->dst.buf, pd_uv->dst.stride,
+                                           &sse_uv[j]);
 
       if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) &&
           (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j]))
@@ -762,7 +771,7 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc,
   for (r = 0; r < max_blocks_high; r += block_step) {
     for (c = 0; c < num_4x4_w; c += block_step) {
       if (c < max_blocks_wide) {
-        const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
+        const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size];
         tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
         tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
         tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
@@ -771,25 +780,25 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc,
         const int16_t *src_diff;
         src_diff = &p->src_diff[(r * diff_stride + c) << 2];
 
+        // skip block condition should be handled before this is called.
+        assert(!x->skip_block);
+
         switch (tx_size) {
           case TX_16X16:
             vpx_hadamard_16x16(src_diff, diff_stride, coeff);
-            vp9_quantize_fp(coeff, 256, x->skip_block, p->round_fp, p->quant_fp,
-                            qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
-                            scan_order->iscan);
+            vp9_quantize_fp(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob,
+                            scan_order);
             break;
           case TX_8X8:
             vpx_hadamard_8x8(src_diff, diff_stride, coeff);
-            vp9_quantize_fp(coeff, 64, x->skip_block, p->round_fp, p->quant_fp,
-                            qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
-                            scan_order->iscan);
+            vp9_quantize_fp(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob,
+                            scan_order);
             break;
           default:
             assert(tx_size == TX_4X4);
             x->fwd_txfm4x4(src_diff, coeff, diff_stride);
-            vp9_quantize_fp(coeff, 16, x->skip_block, p->round_fp, p->quant_fp,
-                            qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
-                            scan_order->iscan);
+            vp9_quantize_fp(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob,
+                            scan_order);
             break;
         }
         *skippable &= (*eob == 0);
@@ -1109,7 +1118,7 @@ static INLINE int rd_less_than_thresh_row_mt(int64_t best_rd, int thresh,
 }
 
 static INLINE void update_thresh_freq_fact_row_mt(
-    VP9_COMP *cpi, TileDataEnc *tile_data, int source_variance,
+    VP9_COMP *cpi, TileDataEnc *tile_data, unsigned int source_variance,
     int thresh_freq_fact_idx, MV_REFERENCE_FRAME ref_frame,
     THR_MODES best_mode_idx, PREDICTION_MODE mode) {
   THR_MODES thr_mode_idx = mode_idx[ref_frame][mode_offset(mode)];
@@ -1247,7 +1256,7 @@ static INLINE void find_predictors(
     VP9_COMP *cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame,
     int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
     int const_motion[MAX_REF_FRAMES], int *ref_frame_skip_mask,
-    const int flag_list[4], TileDataEnc *tile_data, int mi_row, int mi_col,
+    TileDataEnc *tile_data, int mi_row, int mi_col,
     struct buf_2d yv12_mb[4][MAX_MB_PLANE], BLOCK_SIZE bsize,
     int force_skip_low_temp_var, int comp_pred_allowed) {
   VP9_COMMON *const cm = &cpi->common;
@@ -1259,7 +1268,7 @@ static INLINE void find_predictors(
   frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
   frame_mv[ZEROMV][ref_frame].as_int = 0;
   // this needs various further optimizations. to be continued..
-  if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) {
+  if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) && (yv12 != NULL)) {
     int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame];
     const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
     vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf);
@@ -1386,8 +1395,8 @@ static void recheck_zeromv_after_denoising(
     RD_COST this_rdc;
     mi->mode = ZEROMV;
     mi->ref_frame[0] = LAST_FRAME;
-    mi->ref_frame[1] = NONE;
-    set_ref_ptrs(cm, xd, mi->ref_frame[0], NONE);
+    mi->ref_frame[1] = NO_REF_FRAME;
+    set_ref_ptrs(cm, xd, mi->ref_frame[0], NO_REF_FRAME);
     mi->mv[0].as_int = 0;
     mi->interp_filter = EIGHTTAP;
     if (cpi->sf.default_interp_filter == BILINEAR) mi->interp_filter = BILINEAR;
@@ -1405,7 +1414,7 @@ static void recheck_zeromv_after_denoising(
       this_rdc = *best_rdc;
       mi->mode = ctx_den->best_mode;
       mi->ref_frame[0] = ctx_den->best_ref_frame;
-      set_ref_ptrs(cm, xd, mi->ref_frame[0], NONE);
+      set_ref_ptrs(cm, xd, mi->ref_frame[0], NO_REF_FRAME);
       mi->interp_filter = ctx_den->best_pred_filter;
       if (ctx_den->best_ref_frame == INTRA_FRAME) {
         mi->mv[0].as_int = INVALID_MV;
@@ -1624,9 +1633,9 @@ static int search_new_mv(VP9_COMP *cpi, MACROBLOCK *x,
         return -1;
 
       // Exit NEWMV search if base_mv_sse is large.
-      if (sf->base_mv_aggressive && base_mv_sse > (best_sse_sofar << scale))
+      if (sf->base_mv_aggressive && (base_mv_sse >> scale) > best_sse_sofar)
         return -1;
-      if (base_mv_sse < (best_sse_sofar << 1)) {
+      if ((base_mv_sse >> 1) < best_sse_sofar) {
         // Base layer mv is good.
         // Exit NEWMV search if the base_mv is (0, 0) and sse is low, since
         // (0, 0) mode is already tested.
@@ -1669,7 +1678,7 @@ static INLINE void init_best_pickmode(BEST_PICKMODE *bp) {
   bp->best_intra_tx_size = TX_SIZES;
   bp->best_pred_filter = EIGHTTAP;
   bp->best_mode_skip_txfm = SKIP_TXFM_NONE;
-  bp->best_second_ref_frame = NONE;
+  bp->best_second_ref_frame = NO_REF_FRAME;
   bp->best_pred = NULL;
 }
 
@@ -1690,8 +1699,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
   uint8_t mode_checked[MB_MODE_COUNT][MAX_REF_FRAMES];
   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
-  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
-                                    VP9_ALT_FLAG };
   RD_COST this_rdc, best_rdc;
   // var_y and sse_y are saved to be used in skipping checking
   unsigned int var_y = UINT_MAX;
@@ -1865,8 +1872,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
   vp9_rd_cost_reset(&best_rdc);
   vp9_rd_cost_reset(rd_cost);
   mi->sb_type = bsize;
-  mi->ref_frame[0] = NONE;
-  mi->ref_frame[1] = NONE;
+  mi->ref_frame[0] = NO_REF_FRAME;
+  mi->ref_frame[1] = NO_REF_FRAME;
 
   mi->tx_size =
       VPXMIN(max_txsize_lookup[bsize], tx_mode_to_biggest_tx_size[cm->tx_mode]);
@@ -1925,16 +1932,16 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
   // constrain the inter mode to only test zero motion.
   if (cpi->use_svc && svc->force_zero_mode_spatial_ref &&
       svc->spatial_layer_id > 0 && !gf_temporal_ref) {
-    if (cpi->ref_frame_flags & flag_list[LAST_FRAME]) {
-      struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf;
-      if (vp9_is_scaled(sf)) {
+    if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
+      struct scale_factors *const ref_sf = &cm->frame_refs[LAST_FRAME - 1].sf;
+      if (vp9_is_scaled(ref_sf)) {
         svc_force_zero_mode[LAST_FRAME - 1] = 1;
         inter_layer_ref = LAST_FRAME;
       }
     }
-    if (cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) {
-      struct scale_factors *const sf = &cm->frame_refs[GOLDEN_FRAME - 1].sf;
-      if (vp9_is_scaled(sf)) {
+    if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
+      struct scale_factors *const ref_sf = &cm->frame_refs[GOLDEN_FRAME - 1].sf;
+      if (vp9_is_scaled(ref_sf)) {
         svc_force_zero_mode[GOLDEN_FRAME - 1] = 1;
         inter_layer_ref = GOLDEN_FRAME;
       }
@@ -1957,7 +1964,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
                                  cpi->rc.avg_frame_low_motion < 60))
     usable_ref_frame = LAST_FRAME;
 
-  if (!((cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) &&
+  if (!((cpi->ref_frame_flags & VP9_GOLD_FLAG) &&
         !svc_force_zero_mode[GOLDEN_FRAME - 1] && !force_skip_low_temp_var))
     use_golden_nonzeromv = 0;
 
@@ -1985,12 +1992,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
     // Skip find_predictor if the reference frame is not in the
     // ref_frame_flags (i.e., not used as a reference for this frame).
     skip_ref_find_pred[ref_frame] =
-        !(cpi->ref_frame_flags & flag_list[ref_frame]);
+        !(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame));
     if (!skip_ref_find_pred[ref_frame]) {
       find_predictors(cpi, x, ref_frame, frame_mv, const_motion,
-                      &ref_frame_skip_mask, flag_list, tile_data, mi_row,
-                      mi_col, yv12_mb, bsize, force_skip_low_temp_var,
-                      comp_modes > 0);
+                      &ref_frame_skip_mask, tile_data, mi_row, mi_col, yv12_mb,
+                      bsize, force_skip_low_temp_var, comp_modes > 0);
     }
   }
 
@@ -2014,7 +2020,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
   // than current layer: force check of GF-ZEROMV before early exit
   // due to skip flag.
   if (svc->spatial_layer_id > 0 && no_scaling &&
-      (cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) &&
+      (cpi->ref_frame_flags & VP9_GOLD_FLAG) &&
       cm->base_qindex > svc->lower_layer_qindex + 10)
     force_test_gf_zeromv = 1;
 
@@ -2045,7 +2051,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
     int comp_pred = 0;
     int force_mv_inter_layer = 0;
     PREDICTION_MODE this_mode;
-    second_ref_frame = NONE;
+    second_ref_frame = NO_REF_FRAME;
 
     if (idx < num_inter_modes) {
       this_mode = ref_mode_set[idx].pred_mode;
@@ -2094,7 +2100,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
     if (comp_pred) {
       if (!cpi->allow_comp_inter_inter) continue;
       // Skip compound inter modes if ARF is not available.
-      if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue;
+      if (!(cpi->ref_frame_flags & ref_frame_to_flag(second_ref_frame)))
+        continue;
       // Do not allow compound prediction if the segment level reference frame
       // feature is in use as in this case there can only be one reference.
       if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) continue;
@@ -2107,7 +2114,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
          (!cpi->use_svc && sse_zeromv_normalized < thresh_skip_golden)))
       continue;
 
-    if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue;
+    if (!(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) continue;
 
     // For screen content. If zero_temp_sad source is computed: skip
     // non-zero motion check for stationary blocks. If the superblock is
@@ -2190,7 +2197,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
         if (usable_ref_frame < ALTREF_FRAME) {
           if (!force_skip_low_temp_var && usable_ref_frame > LAST_FRAME) {
             i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME;
-            if ((cpi->ref_frame_flags & flag_list[i]))
+            if ((cpi->ref_frame_flags & ref_frame_to_flag(i)))
               if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1))
                 ref_frame_skip_mask |= (1 << ref_frame);
           }
@@ -2199,9 +2206,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
                      ref_frame == ALTREF_FRAME)) {
           int ref1 = (ref_frame == GOLDEN_FRAME) ? LAST_FRAME : GOLDEN_FRAME;
           int ref2 = (ref_frame == ALTREF_FRAME) ? LAST_FRAME : ALTREF_FRAME;
-          if (((cpi->ref_frame_flags & flag_list[ref1]) &&
+          if (((cpi->ref_frame_flags & ref_frame_to_flag(ref1)) &&
                (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref1] << 1))) ||
-              ((cpi->ref_frame_flags & flag_list[ref2]) &&
+              ((cpi->ref_frame_flags & ref_frame_to_flag(ref2)) &&
                (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref2] << 1))))
             ref_frame_skip_mask |= (1 << ref_frame);
         }
@@ -2488,7 +2495,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
     perform_intra_pred =
         svc->temporal_layer_id == 0 ||
         svc->layer_context[svc->temporal_layer_id].is_key_frame ||
-        !(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) ||
+        !(cpi->ref_frame_flags & VP9_GOLD_FLAG) ||
         (!svc->layer_context[svc->temporal_layer_id].is_key_frame &&
          svc_force_zero_mode[best_pickmode.best_ref_frame - 1]);
     inter_mode_thresh = (inter_mode_thresh << 1) + inter_mode_thresh;
@@ -2621,7 +2628,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
         best_pickmode.best_mode = this_mode;
         best_pickmode.best_intra_tx_size = mi->tx_size;
         best_pickmode.best_ref_frame = INTRA_FRAME;
-        best_pickmode.best_second_ref_frame = NONE;
+        best_pickmode.best_second_ref_frame = NO_REF_FRAME;
         mi->uv_mode = this_mode;
         mi->mv[0].as_int = INVALID_MV;
         mi->mv[1].as_int = INVALID_MV;
@@ -2743,12 +2750,10 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row,
   MODE_INFO *const mi = xd->mi[0];
   MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
   const struct segmentation *const seg = &cm->seg;
-  MV_REFERENCE_FRAME ref_frame, second_ref_frame = NONE;
-  MV_REFERENCE_FRAME best_ref_frame = NONE;
+  MV_REFERENCE_FRAME ref_frame, second_ref_frame = NO_REF_FRAME;
+  MV_REFERENCE_FRAME best_ref_frame = NO_REF_FRAME;
   unsigned char segment_id = mi->segment_id;
   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
-  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
-                                    VP9_ALT_FLAG };
   int64_t best_rd = INT64_MAX;
   b_mode_info bsi[MAX_REF_FRAMES][4];
   int ref_frame_skip_mask = 0;
@@ -2764,11 +2769,13 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row,
     int_mv dummy_mv[2];
     x->pred_mv_sad[ref_frame] = INT_MAX;
 
-    if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) {
+    if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) &&
+        (yv12 != NULL)) {
       int_mv *const candidates = mbmi_ext->ref_mvs[ref_frame];
-      const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
-      vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf,
-                           sf);
+      const struct scale_factors *const ref_sf =
+          &cm->frame_refs[ref_frame - 1].sf;
+      vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, ref_sf,
+                           ref_sf);
       vp9_find_mv_refs(cm, xd, xd->mi[0], ref_frame, candidates, mi_row, mi_col,
                        mbmi_ext->mode_context);
 
@@ -2783,7 +2790,7 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row,
   mi->tx_size = TX_4X4;
   mi->uv_mode = DC_PRED;
   mi->ref_frame[0] = LAST_FRAME;
-  mi->ref_frame[1] = NONE;
+  mi->ref_frame[1] = NO_REF_FRAME;
   mi->interp_filter =
       cm->interp_filter == SWITCHABLE ? EIGHTTAP : cm->interp_filter;
 
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index c996b75167..19edf166d3 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -15,6 +15,7 @@
 #include "vpx_ports/mem.h"
 
 #include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_scan.h"
 #include "vp9/common/vp9_seg_common.h"
 
 #include "vp9/encoder/vp9_encoder.h"
@@ -22,15 +23,14 @@
 #include "vp9/encoder/vp9_rd.h"
 
 void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                       int skip_block, const int16_t *round_ptr,
-                       const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
-                       tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                       uint16_t *eob_ptr, const int16_t *scan,
-                       const int16_t *iscan) {
+                       const struct macroblock_plane *const mb_plane,
+                       tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                       const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                       const struct ScanOrder *const scan_order) {
   int i, eob = -1;
-  (void)iscan;
-  (void)skip_block;
-  assert(!skip_block);
+  const int16_t *round_ptr = mb_plane->round_fp;
+  const int16_t *quant_ptr = mb_plane->quant_fp;
+  const int16_t *scan = scan_order->scan;
 
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
@@ -56,17 +56,15 @@ void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                              int skip_block, const int16_t *round_ptr,
-                              const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
-                              tran_low_t *dqcoeff_ptr,
+                              const struct macroblock_plane *const mb_plane,
+                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                              const int16_t *scan, const int16_t *iscan) {
+                              const struct ScanOrder *const scan_order) {
   int i;
   int eob = -1;
-
-  (void)iscan;
-  (void)skip_block;
-  assert(!skip_block);
+  const int16_t *round_ptr = mb_plane->round_fp;
+  const int16_t *quant_ptr = mb_plane->quant_fp;
+  const int16_t *scan = scan_order->scan;
 
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
@@ -91,15 +89,14 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
 // TODO(jingning) Refactor this file and combine functions with similar
 // operations.
 void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                             int skip_block, const int16_t *round_ptr,
-                             const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
-                             tran_low_t *dqcoeff_ptr,
+                             const struct macroblock_plane *const mb_plane,
+                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                             const int16_t *scan, const int16_t *iscan) {
+                             const struct ScanOrder *const scan_order) {
   int i, eob = -1;
-  (void)iscan;
-  (void)skip_block;
-  assert(!skip_block);
+  const int16_t *round_ptr = mb_plane->round_fp;
+  const int16_t *quant_ptr = mb_plane->quant_fp;
+  const int16_t *scan = scan_order->scan;
 
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
@@ -126,15 +123,14 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void vp9_highbd_quantize_fp_32x32_c(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
-    const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+    const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr,
     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan, const int16_t *iscan) {
+    const struct ScanOrder *const scan_order) {
   int i, eob = -1;
-
-  (void)iscan;
-  (void)skip_block;
-  assert(!skip_block);
+  const int16_t *round_ptr = mb_plane->round_fp;
+  const int16_t *quant_ptr = mb_plane->quant_fp;
+  const int16_t *scan = scan_order->scan;
 
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
@@ -159,35 +155,6 @@ void vp9_highbd_quantize_fp_32x32_c(
 }
 #endif
 
-void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
-                                const int16_t *scan, const int16_t *iscan) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  struct macroblock_plane *p = &x->plane[plane];
-  struct macroblockd_plane *pd = &xd->plane[plane];
-  tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block),
-             *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  const int n_coeffs = 4 * 4;
-
-  if (x->skip_block) {
-    memset(qcoeff, 0, n_coeffs * sizeof(*qcoeff));
-    memset(dqcoeff, 0, n_coeffs * sizeof(*dqcoeff));
-    return;
-  }
-
-#if CONFIG_VP9_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    vpx_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs,
-                          x->skip_block, p->zbin, p->round, p->quant,
-                          p->quant_shift, qcoeff, dqcoeff, pd->dequant,
-                          &p->eobs[block], scan, iscan);
-    return;
-  }
-#endif
-  vpx_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, x->skip_block,
-                 p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff,
-                 pd->dequant, &p->eobs[block], scan, iscan);
-}
-
 static void invert_quant(int16_t *quant, int16_t *shift, int d) {
   unsigned t;
   int l, m;
@@ -288,8 +255,7 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
   // Y
   x->plane[0].quant = quants->y_quant[qindex];
   x->plane[0].quant_fp = quants->y_quant_fp[qindex];
-  memcpy(x->plane[0].round_fp, quants->y_round_fp[qindex],
-         8 * sizeof(*(x->plane[0].round_fp)));
+  x->plane[0].round_fp = quants->y_round_fp[qindex];
   x->plane[0].quant_shift = quants->y_quant_shift[qindex];
   x->plane[0].zbin = quants->y_zbin[qindex];
   x->plane[0].round = quants->y_round[qindex];
@@ -301,8 +267,7 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
   for (i = 1; i < 3; i++) {
     x->plane[i].quant = quants->uv_quant[qindex];
     x->plane[i].quant_fp = quants->uv_quant_fp[qindex];
-    memcpy(x->plane[i].round_fp, quants->uv_round_fp[qindex],
-           8 * sizeof(*(x->plane[i].round_fp)));
+    x->plane[i].round_fp = quants->uv_round_fp[qindex];
     x->plane[i].quant_shift = quants->uv_quant_shift[qindex];
     x->plane[i].zbin = quants->uv_zbin[qindex];
     x->plane[i].round = quants->uv_round[qindex];
diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h
index 2e6d7da2b6..f626f06566 100644
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h
@@ -37,9 +37,6 @@ typedef struct {
   DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]);
 } QUANTS;
 
-void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
-                                const int16_t *scan, const int16_t *iscan);
-
 struct VP9_COMP;
 struct VP9Common;
 
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index bfd4e39431..6452e349df 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -39,9 +39,6 @@
 #define MAX_MB_RATE 250
 #define MAXRATE_1080P 4000000
 
-#define DEFAULT_KF_BOOST 2000
-#define DEFAULT_GF_BOOST 2000
-
 #define LIMIT_QRANGE_FOR_ALTREF_AND_KEY 1
 
 #define MIN_BPB_FACTOR 0.005
@@ -226,9 +223,10 @@ int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) {
   if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth;
 
   if (oxcf->rc_max_inter_bitrate_pct) {
-    const int max_rate =
-        rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100;
-    target = VPXMIN(target, max_rate);
+    const int64_t max_rate =
+        (int64_t)rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100;
+    // target is of type int and VPXMIN cannot evaluate to larger than target
+    target = (int)VPXMIN(target, max_rate);
   }
   return target;
 }
@@ -237,9 +235,9 @@ int vp9_rc_clamp_iframe_target_size(const VP9_COMP *const cpi, int target) {
   const RATE_CONTROL *rc = &cpi->rc;
   const VP9EncoderConfig *oxcf = &cpi->oxcf;
   if (oxcf->rc_max_intra_bitrate_pct) {
-    const int max_rate =
-        rc->avg_frame_bandwidth * oxcf->rc_max_intra_bitrate_pct / 100;
-    target = VPXMIN(target, max_rate);
+    const int64_t max_rate =
+        (int64_t)rc->avg_frame_bandwidth * oxcf->rc_max_intra_bitrate_pct / 100;
+    target = (int)VPXMIN(target, max_rate);
   }
   if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth;
   return target;
@@ -262,7 +260,7 @@ void vp9_update_buffer_level_preencode(VP9_COMP *cpi) {
 // for the layered rate control which involves cumulative buffer levels for
 // the temporal layers. Allow for using the timestamp(pts) delta for the
 // framerate when the set_ref_frame_config is used.
-static void update_buffer_level_svc_preencode(VP9_COMP *cpi) {
+void vp9_update_buffer_level_svc_preencode(VP9_COMP *cpi) {
   SVC *const svc = &cpi->svc;
   int i;
   // Set this to 1 to use timestamp delta for "framerate" under
@@ -280,9 +278,9 @@ static void update_buffer_level_svc_preencode(VP9_COMP *cpi) {
         svc->current_superframe > 0) {
       // TODO(marpan): This may need to be modified for temporal layers.
       const double framerate_pts = 10000000.0 / ts_delta;
-      lrc->bits_off_target += (int)(lc->target_bandwidth / framerate_pts);
+      lrc->bits_off_target += (int)round(lc->target_bandwidth / framerate_pts);
     } else {
-      lrc->bits_off_target += (int)(lc->target_bandwidth / lc->framerate);
+      lrc->bits_off_target += (int)round(lc->target_bandwidth / lc->framerate);
     }
     // Clip buffer level to maximum buffer size for the layer.
     lrc->bits_off_target =
@@ -329,7 +327,7 @@ static void update_buffer_level_postencode(VP9_COMP *cpi,
 
   rc->buffer_level = rc->bits_off_target;
 
-  if (is_one_pass_cbr_svc(cpi)) {
+  if (is_one_pass_svc(cpi)) {
     update_layer_buffer_level_postencode(&cpi->svc, encoded_frame_size);
   }
 }
@@ -410,6 +408,7 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
   rc->source_alt_ref_active = 0;
 
   rc->frames_till_gf_update_due = 0;
+  rc->constrain_gf_key_freq_onepass_vbr = 1;
   rc->ni_av_qi = oxcf->worst_allowed_q;
   rc->ni_tot_qi = 0;
   rc->ni_frames = 0;
@@ -441,6 +440,7 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
   rc->last_post_encode_dropped_scene_change = 0;
   rc->use_post_encode_drop = 0;
   rc->ext_use_post_encode_drop = 0;
+  rc->disable_overshoot_maxq_cbr = 0;
   rc->arf_active_best_quality_adjustment_factor = 1.0;
   rc->arf_increase_active_best_quality = 0;
   rc->preserve_arf_as_gld = 0;
@@ -680,7 +680,8 @@ static int adjust_q_cbr(const VP9_COMP *cpi, int q) {
     else
       q = qclamp;
   }
-  if (cpi->oxcf.content == VP9E_CONTENT_SCREEN)
+  if (cpi->oxcf.content == VP9E_CONTENT_SCREEN &&
+      cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
     vp9_cyclic_refresh_limit_q(cpi, &q);
   return VPXMAX(VPXMIN(q, cpi->rc.worst_quality), cpi->rc.best_quality);
 }
@@ -910,7 +911,7 @@ static int calc_active_worst_quality_one_pass_vbr(const VP9_COMP *cpi) {
     active_worst_quality =
         curr_frame == 0 ? rc->worst_quality : rc->last_q[KEY_FRAME] << 1;
   } else {
-    if (!rc->is_src_frame_alt_ref &&
+    if (!rc->is_src_frame_alt_ref && !cpi->use_svc &&
         (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
       active_worst_quality =
           curr_frame == 1
@@ -1150,8 +1151,9 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi,
   if (frame_is_intra_only(cm)) {
     if (oxcf->rc_mode == VPX_Q) {
       int qindex = cq_level;
-      double q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
-      int delta_qindex = vp9_compute_qdelta(rc, q, q * 0.25, cm->bit_depth);
+      double qstart = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+      int delta_qindex =
+          vp9_compute_qdelta(rc, qstart, qstart * 0.25, cm->bit_depth);
       active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
     } else if (rc->this_key_frame_forced) {
       // Handle the special case for key frames forced when we have reached
@@ -1195,7 +1197,7 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi,
     } else {
       q = rc->avg_frame_qindex[KEY_FRAME];
     }
-    // For constrained quality dont allow Q less than the cq level
+    // For constrained quality don't allow Q less than the cq level
     if (oxcf->rc_mode == VPX_CQ) {
       if (q < cq_level) q = cq_level;
 
@@ -1206,12 +1208,14 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi,
 
     } else if (oxcf->rc_mode == VPX_Q) {
       int qindex = cq_level;
-      double q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+      double qstart = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
       int delta_qindex;
       if (cpi->refresh_alt_ref_frame)
-        delta_qindex = vp9_compute_qdelta(rc, q, q * 0.40, cm->bit_depth);
+        delta_qindex =
+            vp9_compute_qdelta(rc, qstart, qstart * 0.40, cm->bit_depth);
       else
-        delta_qindex = vp9_compute_qdelta(rc, q, q * 0.50, cm->bit_depth);
+        delta_qindex =
+            vp9_compute_qdelta(rc, qstart, qstart * 0.50, cm->bit_depth);
       active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
     } else {
       active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth);
@@ -1219,11 +1223,12 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi,
   } else {
     if (oxcf->rc_mode == VPX_Q) {
       int qindex = cq_level;
-      double q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+      double qstart = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
       double delta_rate[FIXED_GF_INTERVAL] = { 0.50, 1.0, 0.85, 1.0,
                                                0.70, 1.0, 0.85, 1.0 };
       int delta_qindex = vp9_compute_qdelta(
-          rc, q, q * delta_rate[cm->current_video_frame % FIXED_GF_INTERVAL],
+          rc, qstart,
+          qstart * delta_rate[cm->current_video_frame % FIXED_GF_INTERVAL],
           cm->bit_depth);
       active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
     } else {
@@ -1355,7 +1360,7 @@ static void pick_kf_q_bound_two_pass(const VP9_COMP *cpi, int *bottom_index,
       active_best_quality /= 4;
     }
 
-    // Dont allow the active min to be lossless (q0) unlesss the max q
+    // Don't allow the active min to be lossless (q0) unlesss the max q
     // already indicates lossless.
     active_best_quality =
         VPXMIN(active_worst_quality, VPXMAX(1, active_best_quality));
@@ -1453,7 +1458,7 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index,
     } else {
       q = active_worst_quality;
     }
-    // For constrained quality dont allow Q less than the cq level
+    // For constrained quality don't allow Q less than the cq level
     if (oxcf->rc_mode == VPX_CQ) {
       if (q < cq_level) q = cq_level;
     }
@@ -1713,9 +1718,18 @@ void vp9_rc_set_frame_target(VP9_COMP *cpi, int target) {
 
   // Modify frame size target when down-scaling.
   if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC &&
-      rc->frame_size_selector != UNSCALED)
+      rc->frame_size_selector != UNSCALED) {
     rc->this_frame_target = (int)(rc->this_frame_target *
                                   rate_thresh_mult[rc->frame_size_selector]);
+  }
+
+#if CONFIG_RATE_CTRL
+  if (cpi->oxcf.use_simple_encode_api) {
+    if (cpi->encode_command.use_external_target_frame_bits) {
+      rc->this_frame_target = cpi->encode_command.target_frame_bits;
+    }
+  }
+#endif  // CONFIG_RATE_CTRL
 
   // Target rate per SB64 (including partial SB64s.
   rc->sb64_target_rate = (int)(((int64_t)rc->this_frame_target * 64 * 64) /
@@ -1850,8 +1864,7 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
     rc->avg_frame_qindex[KEY_FRAME] =
         ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2);
     if (cpi->use_svc) {
-      int i = 0;
-      SVC *svc = &cpi->svc;
+      int i;
       for (i = 0; i < svc->number_temporal_layers; ++i) {
         const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, i,
                                            svc->number_temporal_layers);
@@ -1862,7 +1875,7 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
       }
     }
   } else {
-    if ((cpi->use_svc && oxcf->rc_mode == VPX_CBR) ||
+    if ((cpi->use_svc) ||
         (!rc->is_src_frame_alt_ref &&
          !(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) {
       rc->last_q[INTER_FRAME] = qindex;
@@ -1979,6 +1992,7 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
   rc->last_avg_frame_bandwidth = rc->avg_frame_bandwidth;
   if (cpi->use_svc && svc->spatial_layer_id < svc->number_spatial_layers - 1)
     svc->lower_layer_qindex = cm->base_qindex;
+  cpi->deadline_mode_previous_frame = cpi->oxcf.mode;
 }
 
 void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) {
@@ -1988,6 +2002,7 @@ void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) {
   cpi->rc.rc_2_frame = 0;
   cpi->rc.rc_1_frame = 0;
   cpi->rc.last_avg_frame_bandwidth = cpi->rc.avg_frame_bandwidth;
+  cpi->rc.last_q[INTER_FRAME] = cpi->common.base_qindex;
   // For SVC on dropped frame when framedrop_mode != LAYER_DROP:
   // in this mode the whole superframe may be dropped if only a single layer
   // has buffer underflow (below threshold). Since this can then lead to
@@ -1998,9 +2013,10 @@ void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) {
     cpi->rc.buffer_level = cpi->rc.optimal_buffer_level;
     cpi->rc.bits_off_target = cpi->rc.optimal_buffer_level;
   }
+  cpi->deadline_mode_previous_frame = cpi->oxcf.mode;
 }
 
-static int calc_pframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) {
+int vp9_calc_pframe_target_size_one_pass_vbr(const VP9_COMP *cpi) {
   const RATE_CONTROL *const rc = &cpi->rc;
   const int af_ratio = rc->af_ratio_onepass_vbr;
   int64_t target =
@@ -2011,14 +2027,23 @@ static int calc_pframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) {
                 (rc->baseline_gf_interval + af_ratio - 1)
           : ((int64_t)rc->avg_frame_bandwidth * rc->baseline_gf_interval) /
                 (rc->baseline_gf_interval + af_ratio - 1);
+  // For SVC: refresh flags are used to define the pattern, so we can't
+  // use that for boosting the target size here.
+  // TODO(marpan): Consider adding internal boost on TL0 for VBR-SVC.
+  // For now just use the CBR logic for setting target size.
+  if (cpi->use_svc) target = vp9_calc_pframe_target_size_one_pass_cbr(cpi);
   if (target > INT_MAX) target = INT_MAX;
   return vp9_rc_clamp_pframe_target_size(cpi, (int)target);
 }
 
-static int calc_iframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) {
+int vp9_calc_iframe_target_size_one_pass_vbr(const VP9_COMP *cpi) {
   static const int kf_ratio = 25;
   const RATE_CONTROL *rc = &cpi->rc;
-  const int target = rc->avg_frame_bandwidth * kf_ratio;
+  int target = rc->avg_frame_bandwidth;
+  if (target > INT_MAX / kf_ratio)
+    target = INT_MAX;
+  else
+    target = rc->avg_frame_bandwidth * kf_ratio;
   return vp9_rc_clamp_iframe_target_size(cpi, target);
 }
 
@@ -2041,22 +2066,9 @@ static void adjust_gfint_frame_constraint(VP9_COMP *cpi, int frame_constraint) {
   }
 }
 
-void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) {
-  VP9_COMMON *const cm = &cpi->common;
+void vp9_set_gf_update_one_pass_vbr(VP9_COMP *const cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
-  int target;
-  if (!cpi->refresh_alt_ref_frame &&
-      (cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) ||
-       rc->frames_to_key == 0)) {
-    cm->frame_type = KEY_FRAME;
-    rc->this_key_frame_forced =
-        cm->current_video_frame != 0 && rc->frames_to_key == 0;
-    rc->frames_to_key = cpi->oxcf.key_freq;
-    rc->kf_boost = DEFAULT_KF_BOOST;
-    rc->source_alt_ref_active = 0;
-  } else {
-    cm->frame_type = INTER_FRAME;
-  }
+  VP9_COMMON *const cm = &cpi->common;
   if (rc->frames_till_gf_update_due == 0) {
     double rate_err = 1.0;
     rc->gfu_boost = DEFAULT_GF_BOOST;
@@ -2075,18 +2087,23 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) {
           rate_err > 3.5) {
         rc->baseline_gf_interval =
             VPXMIN(15, (3 * rc->baseline_gf_interval) >> 1);
-      } else if (rc->avg_frame_low_motion < 20) {
+      } else if (rc->avg_frame_low_motion > 0 &&
+                 rc->avg_frame_low_motion < 20) {
         // Decrease gf interval for high motion case.
         rc->baseline_gf_interval = VPXMAX(6, rc->baseline_gf_interval >> 1);
       }
-      // Adjust boost and af_ratio based on avg_frame_low_motion, which varies
-      // between 0 and 100 (stationary, 100% zero/small motion).
-      rc->gfu_boost =
-          VPXMAX(500, DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) /
-                          (rc->avg_frame_low_motion + 100));
+      // Adjust boost and af_ratio based on avg_frame_low_motion, which
+      // varies between 0 and 100 (stationary, 100% zero/small motion).
+      if (rc->avg_frame_low_motion > 0)
+        rc->gfu_boost =
+            VPXMAX(500, DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) /
+                            (rc->avg_frame_low_motion + 100));
+      else if (rc->avg_frame_low_motion == 0 && rate_err > 1.0)
+        rc->gfu_boost = DEFAULT_GF_BOOST >> 1;
       rc->af_ratio_onepass_vbr = VPXMIN(15, VPXMAX(5, 3 * rc->gfu_boost / 400));
     }
-    adjust_gfint_frame_constraint(cpi, rc->frames_to_key);
+    if (rc->constrain_gf_key_freq_onepass_vbr)
+      adjust_gfint_frame_constraint(cpi, rc->frames_to_key);
     rc->frames_till_gf_update_due = rc->baseline_gf_interval;
     cpi->refresh_golden_frame = 1;
     rc->source_alt_ref_pending = 0;
@@ -2096,10 +2113,30 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) {
       rc->alt_ref_gf_group = 1;
     }
   }
+}
+
+void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  int target;
+  if (!cpi->refresh_alt_ref_frame &&
+      (cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) ||
+       rc->frames_to_key == 0 ||
+       (cpi->oxcf.mode != cpi->deadline_mode_previous_frame))) {
+    cm->frame_type = KEY_FRAME;
+    rc->this_key_frame_forced =
+        cm->current_video_frame != 0 && rc->frames_to_key == 0;
+    rc->frames_to_key = cpi->oxcf.key_freq;
+    rc->kf_boost = DEFAULT_KF_BOOST;
+    rc->source_alt_ref_active = 0;
+  } else {
+    cm->frame_type = INTER_FRAME;
+  }
+  vp9_set_gf_update_one_pass_vbr(cpi);
   if (cm->frame_type == KEY_FRAME)
-    target = calc_iframe_target_size_one_pass_vbr(cpi);
+    target = vp9_calc_iframe_target_size_one_pass_vbr(cpi);
   else
-    target = calc_pframe_target_size_one_pass_vbr(cpi);
+    target = vp9_calc_pframe_target_size_one_pass_vbr(cpi);
   vp9_rc_set_frame_target(cpi, target);
   if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.pass == 0)
     vp9_cyclic_refresh_update_parameters(cpi);
@@ -2126,7 +2163,7 @@ int vp9_calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
   } else {
     target = rc->avg_frame_bandwidth;
   }
-  if (is_one_pass_cbr_svc(cpi)) {
+  if (is_one_pass_svc(cpi)) {
     // Note that for layers, avg_frame_bandwidth is the cumulative
     // per-frame-bandwidth. For the target size of this frame, use the
     // layer average frame size (i.e., non-cumulative per-frame-bw).
@@ -2139,12 +2176,12 @@ int vp9_calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
   if (diff > 0) {
     // Lower the target bandwidth for this frame.
     const int pct_low = (int)VPXMIN(diff / one_pct_bits, oxcf->under_shoot_pct);
-    target -= (target * pct_low) / 200;
+    target -= (int)(((int64_t)target * pct_low) / 200);
   } else if (diff < 0) {
     // Increase the target bandwidth for this frame.
     const int pct_high =
         (int)VPXMIN(-diff / one_pct_bits, oxcf->over_shoot_pct);
-    target += (target * pct_high) / 200;
+    target += (int)(((int64_t)target * pct_high) / 200);
   }
   if (oxcf->rc_max_inter_bitrate_pct) {
     const int max_rate =
@@ -2193,7 +2230,6 @@ static void set_intra_only_frame(VP9_COMP *cpi) {
   // only 3 reference buffers can be updated, but for temporal layers > 1
   // we generally need to use buffer slots 4 and 5.
   if ((cm->current_video_frame == 0 && svc->number_temporal_layers > 1) ||
-      svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS ||
       svc->number_spatial_layers > 3 || svc->number_temporal_layers > 3 ||
       svc->number_spatial_layers == 1)
     return;
@@ -2214,11 +2250,15 @@ static void set_intra_only_frame(VP9_COMP *cpi) {
     cpi->lst_fb_idx = -1;
     cpi->gld_fb_idx = -1;
     cpi->alt_fb_idx = -1;
+    svc->update_buffer_slot[0] = 0;
     // For intra-only frame we need to refresh all slots that were
     // being used for the base layer (fb_idx_base[i] == 1).
     // Start with assigning last first, then golden and then alt.
     for (i = 0; i < REF_FRAMES; ++i) {
-      if (svc->fb_idx_base[i] == 1) count++;
+      if (svc->fb_idx_base[i] == 1) {
+        svc->update_buffer_slot[0] |= 1 << i;
+        count++;
+      }
       if (count == 1 && cpi->lst_fb_idx == -1) cpi->lst_fb_idx = i;
       if (count == 2 && cpi->gld_fb_idx == -1) cpi->gld_fb_idx = i;
       if (count == 3 && cpi->alt_fb_idx == -1) cpi->alt_fb_idx = i;
@@ -2227,6 +2267,12 @@ static void set_intra_only_frame(VP9_COMP *cpi) {
     // to the lst_fb_idx.
     if (cpi->gld_fb_idx == -1) cpi->gld_fb_idx = cpi->lst_fb_idx;
     if (cpi->alt_fb_idx == -1) cpi->alt_fb_idx = cpi->lst_fb_idx;
+    if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
+      cpi->ext_refresh_last_frame = 0;
+      cpi->ext_refresh_golden_frame = 0;
+      cpi->ext_refresh_alt_ref_frame = 0;
+      cpi->ref_frame_flags = 0;
+    }
   }
 }
 
@@ -2242,17 +2288,18 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) {
   // Periodic key frames is based on the super-frame counter
   // (svc.current_superframe), also only base spatial layer is key frame.
   // Key frame is set for any of the following: very first frame, frame flags
-  // indicates key, superframe counter hits key frequency, or (non-intra) sync
-  // flag is set for spatial layer 0.
+  // indicates key, superframe counter hits key frequency,(non-intra) sync
+  // flag is set for spatial layer 0, or deadline mode changes.
   if ((cm->current_video_frame == 0 && !svc->previous_frame_is_intra_only) ||
       (cpi->frame_flags & FRAMEFLAGS_KEY) ||
       (cpi->oxcf.auto_key &&
        (svc->current_superframe % cpi->oxcf.key_freq == 0) &&
        !svc->previous_frame_is_intra_only && svc->spatial_layer_id == 0) ||
-      (svc->spatial_layer_sync[0] == 1 && svc->spatial_layer_id == 0)) {
+      (svc->spatial_layer_sync[0] == 1 && svc->spatial_layer_id == 0) ||
+      (cpi->oxcf.mode != cpi->deadline_mode_previous_frame)) {
     cm->frame_type = KEY_FRAME;
     rc->source_alt_ref_active = 0;
-    if (is_one_pass_cbr_svc(cpi)) {
+    if (is_one_pass_svc(cpi)) {
       if (cm->current_video_frame > 0) vp9_svc_reset_temporal_layers(cpi, 1);
       layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id,
                                svc->number_temporal_layers);
@@ -2260,11 +2307,14 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) {
       cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
       // Assumption here is that LAST_FRAME is being updated for a keyframe.
       // Thus no change in update flags.
-      target = vp9_calc_iframe_target_size_one_pass_cbr(cpi);
+      if (cpi->oxcf.rc_mode == VPX_CBR)
+        target = vp9_calc_iframe_target_size_one_pass_cbr(cpi);
+      else
+        target = vp9_calc_iframe_target_size_one_pass_vbr(cpi);
     }
   } else {
     cm->frame_type = INTER_FRAME;
-    if (is_one_pass_cbr_svc(cpi)) {
+    if (is_one_pass_svc(cpi)) {
       LAYER_CONTEXT *lc = &svc->layer_context[layer];
       // Add condition current_video_frame > 0 for the case where first frame
       // is intra only followed by overlay/copy frame. In this case we don't
@@ -2273,7 +2323,23 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) {
           (svc->spatial_layer_id == 0 && cm->current_video_frame > 0)
               ? 0
               : svc->layer_context[svc->temporal_layer_id].is_key_frame;
-      target = vp9_calc_pframe_target_size_one_pass_cbr(cpi);
+      if (cpi->oxcf.rc_mode == VPX_CBR) {
+        target = vp9_calc_pframe_target_size_one_pass_cbr(cpi);
+      } else {
+        double rate_err = 0.0;
+        rc->fac_active_worst_inter = 140;
+        rc->fac_active_worst_gf = 100;
+        if (rc->rolling_target_bits > 0) {
+          rate_err =
+              (double)rc->rolling_actual_bits / (double)rc->rolling_target_bits;
+          if (rate_err < 1.0)
+            rc->fac_active_worst_inter = 120;
+          else if (rate_err > 2.0)
+            // Increase active_worst faster if rate fluctuation is high.
+            rc->fac_active_worst_inter = 160;
+        }
+        target = vp9_calc_pframe_target_size_one_pass_vbr(cpi);
+      }
     }
   }
 
@@ -2282,7 +2348,10 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) {
         svc->layer_context[layer].is_key_frame == 1) {
       cm->frame_type = KEY_FRAME;
       cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
-      target = vp9_calc_iframe_target_size_one_pass_cbr(cpi);
+      if (cpi->oxcf.rc_mode == VPX_CBR)
+        target = vp9_calc_iframe_target_size_one_pass_cbr(cpi);
+      else
+        target = vp9_calc_iframe_target_size_one_pass_vbr(cpi);
     }
     // Set the buffer idx and refresh flags for key frames in simulcast mode.
     // Note the buffer slot for long-term reference is set below (line 2255),
@@ -2367,15 +2436,21 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) {
   }
   if (svc->set_intra_only_frame) {
     set_intra_only_frame(cpi);
-    target = vp9_calc_iframe_target_size_one_pass_cbr(cpi);
+    if (cpi->oxcf.rc_mode == VPX_CBR)
+      target = vp9_calc_iframe_target_size_one_pass_cbr(cpi);
+    else
+      target = vp9_calc_iframe_target_size_one_pass_vbr(cpi);
   }
+  // Overlay frame predicts from LAST (intra-only)
+  if (svc->previous_frame_is_intra_only) cpi->ref_frame_flags |= VP9_LAST_FLAG;
+
   // Any update/change of global cyclic refresh parameters (amount/delta-qp)
   // should be done here, before the frame qp is selected.
   if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
     vp9_cyclic_refresh_update_parameters(cpi);
 
   vp9_rc_set_frame_target(cpi, target);
-  if (cm->show_frame) update_buffer_level_svc_preencode(cpi);
+  if (cm->show_frame) vp9_update_buffer_level_svc_preencode(cpi);
 
   if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC && svc->single_layer_svc == 1 &&
       svc->spatial_layer_id == svc->first_spatial_layer_to_encode &&
@@ -2393,6 +2468,11 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) {
             cpi->resize_scale_num * lc->scaling_factor_num;
         lc->scaling_factor_den_resize =
             cpi->resize_scale_den * lc->scaling_factor_den;
+        // Reset rate control for all temporal layers.
+        lc->rc.buffer_level = lc->rc.optimal_buffer_level;
+        lc->rc.bits_off_target = lc->rc.optimal_buffer_level;
+        lc->rc.rate_correction_factors[INTER_FRAME] =
+            rc->rate_correction_factors[INTER_FRAME];
       }
       // Set the size for this current temporal layer.
       lc = &svc->layer_context[svc->spatial_layer_id *
@@ -2415,7 +2495,8 @@ void vp9_rc_get_one_pass_cbr_params(VP9_COMP *cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
   int target;
   if ((cm->current_video_frame == 0) || (cpi->frame_flags & FRAMEFLAGS_KEY) ||
-      (cpi->oxcf.auto_key && rc->frames_to_key == 0)) {
+      (cpi->oxcf.auto_key && rc->frames_to_key == 0) ||
+      (cpi->oxcf.mode != cpi->deadline_mode_previous_frame)) {
     cm->frame_type = KEY_FRAME;
     rc->frames_to_key = cpi->oxcf.key_freq;
     rc->kf_boost = DEFAULT_KF_BOOST;
@@ -2512,26 +2593,25 @@ void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi,
     rc->min_gf_interval = FIXED_GF_INTERVAL;
     rc->static_scene_max_gf_interval = FIXED_GF_INTERVAL;
   } else {
+    double framerate = cpi->framerate;
     // Set Maximum gf/arf interval
     rc->max_gf_interval = oxcf->max_gf_interval;
     rc->min_gf_interval = oxcf->min_gf_interval;
 #if CONFIG_RATE_CTRL
+    if (oxcf->use_simple_encode_api) {
+      // In this experiment, we avoid framerate being changed dynamically during
+      // encoding.
+      framerate = oxcf->init_framerate;
+    }
+#endif  // CONFIG_RATE_CTRL
     if (rc->min_gf_interval == 0) {
       rc->min_gf_interval = vp9_rc_get_default_min_gf_interval(
-          oxcf->width, oxcf->height, oxcf->init_framerate);
+          oxcf->width, oxcf->height, framerate);
     }
     if (rc->max_gf_interval == 0) {
-      rc->max_gf_interval = vp9_rc_get_default_max_gf_interval(
-          oxcf->init_framerate, rc->min_gf_interval);
+      rc->max_gf_interval =
+          vp9_rc_get_default_max_gf_interval(framerate, rc->min_gf_interval);
     }
-#else
-    if (rc->min_gf_interval == 0)
-      rc->min_gf_interval = vp9_rc_get_default_min_gf_interval(
-          oxcf->width, oxcf->height, cpi->framerate);
-    if (rc->max_gf_interval == 0)
-      rc->max_gf_interval = vp9_rc_get_default_max_gf_interval(
-          cpi->framerate, rc->min_gf_interval);
-#endif
 
     // Extended max interval for genuinely static scenes like slide shows.
     rc->static_scene_max_gf_interval = MAX_STATIC_GF_GROUP_LENGTH;
@@ -2547,13 +2627,12 @@ void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi,
       const uint32_t pic_breadth =
           VPXMAX(cpi->common.width, cpi->common.height);
       int i;
-      for (i = LEVEL_1; i < LEVEL_MAX; ++i) {
+      for (i = 0; i < VP9_LEVELS; ++i) {
         if (vp9_level_defs[i].max_luma_picture_size >= pic_size &&
             vp9_level_defs[i].max_luma_picture_breadth >= pic_breadth) {
           if (rc->min_gf_interval <=
               (int)vp9_level_defs[i].min_altref_distance) {
-            rc->min_gf_interval =
-                (int)vp9_level_defs[i].min_altref_distance + 1;
+            rc->min_gf_interval = (int)vp9_level_defs[i].min_altref_distance;
             rc->max_gf_interval =
                 VPXMAX(rc->max_gf_interval, rc->min_gf_interval);
           }
@@ -2570,7 +2649,8 @@ void vp9_rc_update_framerate(VP9_COMP *cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
   int vbr_max_bits;
 
-  rc->avg_frame_bandwidth = (int)(oxcf->target_bandwidth / cpi->framerate);
+  rc->avg_frame_bandwidth =
+      (int)VPXMIN(oxcf->target_bandwidth / cpi->framerate, INT_MAX);
   rc->min_frame_bandwidth =
       (int)(rc->avg_frame_bandwidth * oxcf->two_pass_vbrmin_section / 100);
 
@@ -2624,7 +2704,7 @@ static void vbr_rate_correction(VP9_COMP *cpi, int *this_frame_target) {
   }
 
   // Fast redistribution of bits arising from massive local undershoot.
-  // Dont do it for kf,arf,gf or overlay frames.
+  // Don't do it for kf,arf,gf or overlay frames.
   if (!frame_is_kf_gf_arf(cpi) && !rc->is_src_frame_alt_ref &&
       rc->vbr_bits_off_target_fast) {
     int one_frame_bits = VPXMAX(rc->avg_frame_bandwidth, *this_frame_target);
@@ -2667,6 +2747,7 @@ int vp9_resize_one_pass_cbr(VP9_COMP *cpi) {
   int min_width = (320 * 4) / 3;
   int min_height = (180 * 4) / 3;
   int down_size_on = 1;
+  int force_downsize_rate = 0;
   cpi->resize_scale_num = 1;
   cpi->resize_scale_den = 1;
   // Don't resize on key frame; reset the counters on key frame.
@@ -2687,11 +2768,32 @@ int vp9_resize_one_pass_cbr(VP9_COMP *cpi) {
   }
 #endif
 
+  // Force downsize based on per-frame-bandwidth, for extreme case,
+  // for HD input.
+  if (cpi->resize_state == ORIG && cm->width * cm->height >= 1280 * 720) {
+    if (rc->avg_frame_bandwidth < 300000 / 30) {
+      resize_action = DOWN_ONEHALF;
+      cpi->resize_state = ONE_HALF;
+      force_downsize_rate = 1;
+    } else if (rc->avg_frame_bandwidth < 400000 / 30) {
+      resize_action = ONEHALFONLY_RESIZE ? DOWN_ONEHALF : DOWN_THREEFOUR;
+      cpi->resize_state = ONEHALFONLY_RESIZE ? ONE_HALF : THREE_QUARTER;
+      force_downsize_rate = 1;
+    }
+  } else if (cpi->resize_state == THREE_QUARTER &&
+             cm->width * cm->height >= 960 * 540) {
+    if (rc->avg_frame_bandwidth < 300000 / 30) {
+      resize_action = DOWN_ONEHALF;
+      cpi->resize_state = ONE_HALF;
+      force_downsize_rate = 1;
+    }
+  }
+
   // Resize based on average buffer underflow and QP over some window.
   // Ignore samples close to key frame, since QP is usually high after key.
-  if (cpi->rc.frames_since_key > 2 * cpi->framerate) {
-    const int window = (int)(4 * cpi->framerate);
-    cpi->resize_avg_qp += cm->base_qindex;
+  if (!force_downsize_rate && cpi->rc.frames_since_key > cpi->framerate) {
+    const int window = VPXMIN(30, (int)(2 * cpi->framerate));
+    cpi->resize_avg_qp += rc->last_q[INTER_FRAME];
     if (cpi->rc.buffer_level < (int)(30 * rc->optimal_buffer_level / 100))
       ++cpi->resize_buffer_underflow;
     ++cpi->resize_count;
@@ -2917,7 +3019,7 @@ static void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi,
         }
       }
     }
-    target = calc_pframe_target_size_one_pass_vbr(cpi);
+    target = vp9_calc_pframe_target_size_one_pass_vbr(cpi);
     vp9_rc_set_frame_target(cpi, target);
   }
   rc->prev_avg_source_sad_lag = avg_source_sad_lag;
@@ -3127,7 +3229,7 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) {
           VPXMIN(20, VPXMAX(10, rc->baseline_gf_interval));
       adjust_gfint_frame_constraint(cpi, rc->frames_to_key);
       rc->frames_till_gf_update_due = rc->baseline_gf_interval;
-      target = calc_pframe_target_size_one_pass_vbr(cpi);
+      target = vp9_calc_pframe_target_size_one_pass_vbr(cpi);
       vp9_rc_set_frame_target(cpi, target);
       rc->count_last_scene_change = 0;
     } else {
@@ -3181,11 +3283,9 @@ int vp9_encodedframe_overshoot(VP9_COMP *cpi, int frame_size, int *q) {
       MODE_INFO **mi = cm->mi_grid_visible;
       int sum_intra_usage = 0;
       int mi_row, mi_col;
-      int tot = 0;
       for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) {
         for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
           if (mi[0]->ref_frame[0] == INTRA_FRAME) sum_intra_usage++;
-          tot++;
           mi++;
         }
         mi += 8;
@@ -3227,7 +3327,7 @@ int vp9_encodedframe_overshoot(VP9_COMP *cpi, int frame_size, int *q) {
       int tl = 0;
       int sl = 0;
       SVC *svc = &cpi->svc;
-      for (sl = 0; sl < svc->first_spatial_layer_to_encode; ++sl) {
+      for (sl = 0; sl < VPXMAX(1, svc->first_spatial_layer_to_encode); ++sl) {
         for (tl = 0; tl < svc->number_temporal_layers; ++tl) {
           const int layer =
               LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index c5ffb153c8..48c49e937e 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -27,6 +27,9 @@ extern "C" {
 // Bits Per MB at different Q (Multiplied by 512)
 #define BPER_MB_NORMBITS 9
 
+#define DEFAULT_KF_BOOST 2000
+#define DEFAULT_GF_BOOST 2000
+
 #define MIN_GF_INTERVAL 4
 #define MAX_GF_INTERVAL 16
 #define FIXED_GF_INTERVAL 8  // Used in some testing modes only
@@ -195,7 +198,8 @@ typedef struct {
   int use_post_encode_drop;
   // External flag to enable post encode frame dropping, controlled by user.
   int ext_use_post_encode_drop;
-
+  // Flag to disable CBR feature to increase Q on overshoot detection.
+  int disable_overshoot_maxq_cbr;
   int damped_adjustment[RATE_FACTOR_LEVELS];
   double arf_active_best_quality_adjustment_factor;
   int arf_increase_active_best_quality;
@@ -203,6 +207,14 @@ typedef struct {
   int preserve_arf_as_gld;
   int preserve_next_arf_as_gld;
   int show_arf_as_gld;
+
+  // Flag to constrain golden frame interval on key frame frequency for 1 pass
+  // VBR.
+  int constrain_gf_key_freq_onepass_vbr;
+
+  // The index of the current GOP. Start from zero.
+  // When a key frame is inserted, it resets to zero.
+  int gop_global_index;
 } RATE_CONTROL;
 
 struct VP9_COMP;
@@ -254,6 +266,9 @@ void vp9_rc_get_one_pass_vbr_params(struct VP9_COMP *cpi);
 void vp9_rc_get_one_pass_cbr_params(struct VP9_COMP *cpi);
 int vp9_calc_pframe_target_size_one_pass_cbr(const struct VP9_COMP *cpi);
 int vp9_calc_iframe_target_size_one_pass_cbr(const struct VP9_COMP *cpi);
+int vp9_calc_pframe_target_size_one_pass_vbr(const struct VP9_COMP *cpi);
+int vp9_calc_iframe_target_size_one_pass_vbr(const struct VP9_COMP *cpi);
+void vp9_set_gf_update_one_pass_vbr(struct VP9_COMP *const cpi);
 void vp9_update_buffer_level_preencode(struct VP9_COMP *cpi);
 void vp9_rc_get_svc_params(struct VP9_COMP *cpi);
 
@@ -335,6 +350,8 @@ void vp9_estimate_qp_gop(struct VP9_COMP *cpi);
 
 void vp9_compute_frame_low_motion(struct VP9_COMP *const cpi);
 
+void vp9_update_buffer_level_svc_preencode(struct VP9_COMP *cpi);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c
index 34c74424ce..95c95971c5 100644
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c
@@ -197,28 +197,74 @@ static const int rd_boost_factor[16] = { 64, 32, 32, 32, 24, 16, 12, 12,
 static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { 128, 144, 128,
                                                               128, 144, 144 };
 
+// Configure Vizier RD parameters.
+// Later this function will use passed in command line values.
+void vp9_init_rd_parameters(VP9_COMP *cpi) {
+  RD_CONTROL *const rdc = &cpi->rd_ctrl;
+
+  // When |use_vizier_rc_params| is 1, we expect the rd parameters have been
+  // initialized by the pass in values.
+  // Be careful that parameters below are only initialized to 1, if we do not
+  // pass values to them. It is desired to take care of each parameter when
+  // using |use_vizier_rc_params|.
+  if (cpi->twopass.use_vizier_rc_params) return;
+
+  // Make sure this function is floating point safe.
+  vpx_clear_system_state();
+
+  rdc->rd_mult_inter_qp_fac = 1.0;
+  rdc->rd_mult_arf_qp_fac = 1.0;
+  rdc->rd_mult_key_qp_fac = 1.0;
+}
+
+// Returns the default rd multiplier for inter frames for a given qindex.
+// The function here is a first pass estimate based on data from
+// a previous Vizer run
+static double def_inter_rd_multiplier(int qindex) {
+  return 4.15 + (0.001 * (double)qindex);
+}
+
+// Returns the default rd multiplier for ARF/Golden Frames for a given qindex.
+// The function here is a first pass estimate based on data from
+// a previous Vizer run
+static double def_arf_rd_multiplier(int qindex) {
+  return 4.25 + (0.001 * (double)qindex);
+}
+
+// Returns the default rd multiplier for key frames for a given qindex.
+// The function here is a first pass estimate based on data from
+// a previous Vizer run
+static double def_kf_rd_multiplier(int qindex) {
+  return 4.35 + (0.001 * (double)qindex);
+}
+
 int vp9_compute_rd_mult_based_on_qindex(const VP9_COMP *cpi, int qindex) {
-  // largest dc_quant is 21387, therefore rdmult should always fit in int32_t
+  const RD_CONTROL *rdc = &cpi->rd_ctrl;
   const int q = vp9_dc_quant(qindex, 0, cpi->common.bit_depth);
-  uint32_t rdmult = q * q;
-
-  if (cpi->common.frame_type != KEY_FRAME) {
-    if (qindex < 128)
-      rdmult = rdmult * 4;
-    else if (qindex < 190)
-      rdmult = rdmult * 4 + rdmult / 2;
-    else
-      rdmult = rdmult * 3;
+  // largest dc_quant is 21387, therefore rdmult should fit in int32_t
+  int rdmult = q * q;
+
+  if (cpi->ext_ratectrl.ready &&
+      (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0 &&
+      cpi->ext_ratectrl.ext_rdmult != VPX_DEFAULT_RDMULT) {
+    return cpi->ext_ratectrl.ext_rdmult;
+  }
+
+  // Make sure this function is floating point safe.
+  vpx_clear_system_state();
+
+  if (cpi->common.frame_type == KEY_FRAME) {
+    double def_rd_q_mult = def_kf_rd_multiplier(qindex);
+    rdmult = (int)((double)rdmult * def_rd_q_mult * rdc->rd_mult_key_qp_fac);
+  } else if (!cpi->rc.is_src_frame_alt_ref &&
+             (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+    double def_rd_q_mult = def_arf_rd_multiplier(qindex);
+    rdmult = (int)((double)rdmult * def_rd_q_mult * rdc->rd_mult_arf_qp_fac);
   } else {
-    if (qindex < 64)
-      rdmult = rdmult * 4;
-    else if (qindex <= 128)
-      rdmult = rdmult * 3 + rdmult / 2;
-    else if (qindex < 190)
-      rdmult = rdmult * 4 + rdmult / 2;
-    else
-      rdmult = rdmult * 7 + rdmult / 2;
+    double def_rd_q_mult = def_inter_rd_multiplier(qindex);
+    rdmult = (int)((double)rdmult * def_rd_q_mult * rdc->rd_mult_inter_qp_fac);
   }
+
 #if CONFIG_VP9_HIGHBITDEPTH
   switch (cpi->common.bit_depth) {
     case VPX_BITS_10: rdmult = ROUND_POWER_OF_TWO(rdmult, 4); break;
@@ -247,6 +293,11 @@ static int modulate_rdmult(const VP9_COMP *cpi, int rdmult) {
 
 int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) {
   int rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, qindex);
+  if (cpi->ext_ratectrl.ready &&
+      (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0 &&
+      cpi->ext_ratectrl.ext_rdmult != VPX_DEFAULT_RDMULT) {
+    return cpi->ext_ratectrl.ext_rdmult;
+  }
   return modulate_rdmult(cpi, rdmult);
 }
 
@@ -462,22 +513,6 @@ static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) {
   *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
 }
 
-static void model_rd_norm_vec(int xsq_q10[MAX_MB_PLANE],
-                              int r_q10[MAX_MB_PLANE],
-                              int d_q10[MAX_MB_PLANE]) {
-  int i;
-  const int one_q10 = 1 << 10;
-  for (i = 0; i < MAX_MB_PLANE; ++i) {
-    const int tmp = (xsq_q10[i] >> 2) + 8;
-    const int k = get_msb(tmp) - 3;
-    const int xq = (k << 3) + ((tmp >> k) & 0x7);
-    const int a_q10 = ((xsq_q10[i] - xsq_iq_q10[xq]) << 10) >> (2 + k);
-    const int b_q10 = one_q10 - a_q10;
-    r_q10[i] = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10;
-    d_q10[i] = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
-  }
-}
-
 static const uint32_t MAX_XSQ_Q10 = 245727;
 
 void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
@@ -503,30 +538,12 @@ void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
   }
 }
 
-// Implements a fixed length vector form of vp9_model_rd_from_var_lapndz where
-// vectors are of length MAX_MB_PLANE and all elements of var are non-zero.
-void vp9_model_rd_from_var_lapndz_vec(unsigned int var[MAX_MB_PLANE],
-                                      unsigned int n_log2[MAX_MB_PLANE],
-                                      unsigned int qstep[MAX_MB_PLANE],
-                                      int64_t *rate_sum, int64_t *dist_sum) {
-  int i;
-  int xsq_q10[MAX_MB_PLANE], d_q10[MAX_MB_PLANE], r_q10[MAX_MB_PLANE];
-  for (i = 0; i < MAX_MB_PLANE; ++i) {
-    const uint64_t xsq_q10_64 =
-        (((uint64_t)qstep[i] * qstep[i] << (n_log2[i] + 10)) + (var[i] >> 1)) /
-        var[i];
-    xsq_q10[i] = (int)VPXMIN(xsq_q10_64, MAX_XSQ_Q10);
-  }
-  model_rd_norm_vec(xsq_q10, r_q10, d_q10);
-  for (i = 0; i < MAX_MB_PLANE; ++i) {
-    int rate =
-        ROUND_POWER_OF_TWO(r_q10[i] << n_log2[i], 10 - VP9_PROB_COST_SHIFT);
-    int64_t dist = (var[i] * (int64_t)d_q10[i] + 512) >> 10;
-    *rate_sum += rate;
-    *dist_sum += dist;
-  }
-}
-
+// Disable gcc 12.2 false positive warning.
+// warning: writing 1 byte into a region of size 0 [-Wstringop-overflow=]
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstringop-overflow"
+#endif
 void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
                               const struct macroblockd_plane *pd,
                               ENTROPY_CONTEXT t_above[16],
@@ -564,6 +581,9 @@ void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
       break;
   }
 }
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
 
 void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer,
                  int ref_y_stride, int ref_frame, BLOCK_SIZE block_size) {
diff --git a/vp9/encoder/vp9_rd.h b/vp9/encoder/vp9_rd.h
index 4c04c95482..6c61ae514a 100644
--- a/vp9/encoder/vp9_rd.h
+++ b/vp9/encoder/vp9_rd.h
@@ -101,6 +101,13 @@ typedef enum {
   THR_INTRA,
 } THR_MODES_SUB8X8;
 
+typedef struct {
+  // RD multiplier control factors added for Vizier project.
+  double rd_mult_inter_qp_fac;
+  double rd_mult_arf_qp_fac;
+  double rd_mult_key_qp_fac;
+} RD_CONTROL;
+
 typedef struct RD_OPT {
   // Thresh_mult is used to set a threshold for the rd score. A higher value
   // means that we will accept the best mode so far more often. This number
@@ -114,11 +121,9 @@ typedef struct RD_OPT {
   int64_t prediction_type_threshes[MAX_REF_FRAMES][REFERENCE_MODES];
 
   int64_t filter_threshes[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS];
-#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
   int64_t prediction_type_threshes_prev[MAX_REF_FRAMES][REFERENCE_MODES];
 
   int64_t filter_threshes_prev[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS];
-#endif  // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
   int RDMULT;
   int RDDIV;
   double r0;
@@ -144,6 +149,8 @@ struct TileDataEnc;
 struct VP9_COMP;
 struct macroblock;
 
+void vp9_init_rd_parameters(struct VP9_COMP *cpi);
+
 int vp9_compute_rd_mult_based_on_qindex(const struct VP9_COMP *cpi, int qindex);
 
 int vp9_compute_rd_mult(const struct VP9_COMP *cpi, int qindex);
@@ -157,11 +164,6 @@ void vp9_initialize_me_consts(struct VP9_COMP *cpi, MACROBLOCK *x, int qindex);
 void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
                                   unsigned int qstep, int *rate, int64_t *dist);
 
-void vp9_model_rd_from_var_lapndz_vec(unsigned int var[MAX_MB_PLANE],
-                                      unsigned int n_log2[MAX_MB_PLANE],
-                                      unsigned int qstep[MAX_MB_PLANE],
-                                      int64_t *rate_sum, int64_t *dist_sum);
-
 int vp9_get_switchable_rate(const struct VP9_COMP *cpi,
                             const MACROBLOCKD *const xd);
 
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 39b99d50c4..974e43c90f 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -77,7 +77,7 @@ struct rdcost_block_args {
   int64_t best_rd;
   int exit_early;
   int use_fast_coef_costing;
-  const scan_order *so;
+  const ScanOrder *so;
   uint8_t skippable;
   struct buf_2d *this_recon;
 };
@@ -86,28 +86,28 @@ struct rdcost_block_args {
 
 #if !CONFIG_REALTIME_ONLY
 static const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
-  { NEARESTMV, { LAST_FRAME, NONE } },
-  { NEARESTMV, { ALTREF_FRAME, NONE } },
-  { NEARESTMV, { GOLDEN_FRAME, NONE } },
+  { NEARESTMV, { LAST_FRAME, NO_REF_FRAME } },
+  { NEARESTMV, { ALTREF_FRAME, NO_REF_FRAME } },
+  { NEARESTMV, { GOLDEN_FRAME, NO_REF_FRAME } },
 
-  { DC_PRED, { INTRA_FRAME, NONE } },
+  { DC_PRED, { INTRA_FRAME, NO_REF_FRAME } },
 
-  { NEWMV, { LAST_FRAME, NONE } },
-  { NEWMV, { ALTREF_FRAME, NONE } },
-  { NEWMV, { GOLDEN_FRAME, NONE } },
+  { NEWMV, { LAST_FRAME, NO_REF_FRAME } },
+  { NEWMV, { ALTREF_FRAME, NO_REF_FRAME } },
+  { NEWMV, { GOLDEN_FRAME, NO_REF_FRAME } },
 
-  { NEARMV, { LAST_FRAME, NONE } },
-  { NEARMV, { ALTREF_FRAME, NONE } },
-  { NEARMV, { GOLDEN_FRAME, NONE } },
+  { NEARMV, { LAST_FRAME, NO_REF_FRAME } },
+  { NEARMV, { ALTREF_FRAME, NO_REF_FRAME } },
+  { NEARMV, { GOLDEN_FRAME, NO_REF_FRAME } },
 
-  { ZEROMV, { LAST_FRAME, NONE } },
-  { ZEROMV, { GOLDEN_FRAME, NONE } },
-  { ZEROMV, { ALTREF_FRAME, NONE } },
+  { ZEROMV, { LAST_FRAME, NO_REF_FRAME } },
+  { ZEROMV, { GOLDEN_FRAME, NO_REF_FRAME } },
+  { ZEROMV, { ALTREF_FRAME, NO_REF_FRAME } },
 
   { NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
   { NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
 
-  { TM_PRED, { INTRA_FRAME, NONE } },
+  { TM_PRED, { INTRA_FRAME, NO_REF_FRAME } },
 
   { NEARMV, { LAST_FRAME, ALTREF_FRAME } },
   { NEWMV, { LAST_FRAME, ALTREF_FRAME } },
@@ -117,20 +117,20 @@ static const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
   { ZEROMV, { LAST_FRAME, ALTREF_FRAME } },
   { ZEROMV, { GOLDEN_FRAME, ALTREF_FRAME } },
 
-  { H_PRED, { INTRA_FRAME, NONE } },
-  { V_PRED, { INTRA_FRAME, NONE } },
-  { D135_PRED, { INTRA_FRAME, NONE } },
-  { D207_PRED, { INTRA_FRAME, NONE } },
-  { D153_PRED, { INTRA_FRAME, NONE } },
-  { D63_PRED, { INTRA_FRAME, NONE } },
-  { D117_PRED, { INTRA_FRAME, NONE } },
-  { D45_PRED, { INTRA_FRAME, NONE } },
+  { H_PRED, { INTRA_FRAME, NO_REF_FRAME } },
+  { V_PRED, { INTRA_FRAME, NO_REF_FRAME } },
+  { D135_PRED, { INTRA_FRAME, NO_REF_FRAME } },
+  { D207_PRED, { INTRA_FRAME, NO_REF_FRAME } },
+  { D153_PRED, { INTRA_FRAME, NO_REF_FRAME } },
+  { D63_PRED, { INTRA_FRAME, NO_REF_FRAME } },
+  { D117_PRED, { INTRA_FRAME, NO_REF_FRAME } },
+  { D45_PRED, { INTRA_FRAME, NO_REF_FRAME } },
 };
 
 static const REF_DEFINITION vp9_ref_order[MAX_REFS] = {
-  { { LAST_FRAME, NONE } },           { { GOLDEN_FRAME, NONE } },
-  { { ALTREF_FRAME, NONE } },         { { LAST_FRAME, ALTREF_FRAME } },
-  { { GOLDEN_FRAME, ALTREF_FRAME } }, { { INTRA_FRAME, NONE } },
+  { { LAST_FRAME, NO_REF_FRAME } },   { { GOLDEN_FRAME, NO_REF_FRAME } },
+  { { ALTREF_FRAME, NO_REF_FRAME } }, { { LAST_FRAME, ALTREF_FRAME } },
+  { { GOLDEN_FRAME, ALTREF_FRAME } }, { { INTRA_FRAME, NO_REF_FRAME } },
 };
 #endif  // !CONFIG_REALTIME_ONLY
 
@@ -160,10 +160,13 @@ static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int m, int n,
 }
 
 #if !CONFIG_REALTIME_ONLY
-static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
-                            MACROBLOCKD *xd, int *out_rate_sum,
-                            int64_t *out_dist_sum, int *skip_txfm_sb,
-                            int64_t *skip_sse_sb) {
+// Planewise build inter prediction and compute rdcost with early termination
+// option
+static int build_inter_pred_model_rd_earlyterm(
+    VP9_COMP *cpi, int mi_row, int mi_col, BLOCK_SIZE bsize, MACROBLOCK *x,
+    MACROBLOCKD *xd, int *out_rate_sum, int64_t *out_dist_sum,
+    int *skip_txfm_sb, int64_t *skip_sse_sb, int do_earlyterm,
+    int64_t best_rd) {
   // Note our transform coeffs are 8 times an orthogonal transform.
   // Hence quantizer step is also 8 times. To get effective quantizer
   // we need to divide by 8 before sending to modeling function.
@@ -176,19 +179,15 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
   int64_t total_sse = 0;
   int skip_flag = 1;
   const int shift = 6;
-  int64_t dist;
   const int dequant_shift =
 #if CONFIG_VP9_HIGHBITDEPTH
       (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 :
 #endif  // CONFIG_VP9_HIGHBITDEPTH
                                                     3;
-  unsigned int qstep_vec[MAX_MB_PLANE];
-  unsigned int nlog2_vec[MAX_MB_PLANE];
-  unsigned int sum_sse_vec[MAX_MB_PLANE];
-  int any_zero_sum_sse = 0;
 
   x->pred_sse[ref] = 0;
 
+  // Build prediction signal, compute stats and RD cost on per-plane basis
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     struct macroblock_plane *const p = &x->plane[i];
     struct macroblockd_plane *const pd = &xd->plane[i];
@@ -207,7 +206,14 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
     int idx, idy;
     int lw = b_width_log2_lookup[unit_size] + 2;
     int lh = b_height_log2_lookup[unit_size] + 2;
+    unsigned int qstep;
+    unsigned int nlog2;
+    int64_t dist = 0;
 
+    // Build inter predictor
+    vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i);
+
+    // Compute useful stats
     for (idy = 0; idy < bh; ++idy) {
       for (idx = 0; idx < bw; ++idx) {
         uint8_t *src = p->src.buf + (idy * p->src.stride << lh) + (idx << lw);
@@ -243,46 +249,36 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
     }
 
     total_sse += sum_sse;
-    sum_sse_vec[i] = sum_sse;
-    any_zero_sum_sse = any_zero_sum_sse || (sum_sse == 0);
-    qstep_vec[i] = pd->dequant[1] >> dequant_shift;
-    nlog2_vec[i] = num_pels_log2_lookup[bs];
-  }
+    qstep = pd->dequant[1] >> dequant_shift;
+    nlog2 = num_pels_log2_lookup[bs];
 
-  // Fast approximate the modelling function.
-  if (cpi->sf.simple_model_rd_from_var) {
-    for (i = 0; i < MAX_MB_PLANE; ++i) {
+    // Fast approximate the modelling function.
+    if (cpi->sf.simple_model_rd_from_var) {
       int64_t rate;
-      const int64_t square_error = sum_sse_vec[i];
-      int quantizer = qstep_vec[i];
-
-      if (quantizer < 120)
-        rate = (square_error * (280 - quantizer)) >> (16 - VP9_PROB_COST_SHIFT);
+      if (qstep < 120)
+        rate = ((int64_t)sum_sse * (280 - qstep)) >> (16 - VP9_PROB_COST_SHIFT);
       else
         rate = 0;
-      dist = (square_error * quantizer) >> 8;
+      dist = ((int64_t)sum_sse * qstep) >> 8;
       rate_sum += rate;
-      dist_sum += dist;
-    }
-  } else {
-    if (any_zero_sum_sse) {
-      for (i = 0; i < MAX_MB_PLANE; ++i) {
-        int rate;
-        vp9_model_rd_from_var_lapndz(sum_sse_vec[i], nlog2_vec[i], qstep_vec[i],
-                                     &rate, &dist);
-        rate_sum += rate;
-        dist_sum += dist;
-      }
     } else {
-      vp9_model_rd_from_var_lapndz_vec(sum_sse_vec, nlog2_vec, qstep_vec,
-                                       &rate_sum, &dist_sum);
+      int rate;
+      vp9_model_rd_from_var_lapndz(sum_sse, nlog2, qstep, &rate, &dist);
+      rate_sum += rate;
+    }
+    dist_sum += dist;
+    if (do_earlyterm) {
+      if (RDCOST(x->rdmult, x->rddiv, rate_sum,
+                 dist_sum << VP9_DIST_SCALE_LOG2) >= best_rd)
+        return 1;
     }
   }
-
   *skip_txfm_sb = skip_flag;
   *skip_sse_sb = total_sse << VP9_DIST_SCALE_LOG2;
   *out_rate_sum = (int)rate_sum;
   *out_dist_sum = dist_sum << VP9_DIST_SCALE_LOG2;
+
+  return 0;
 }
 #endif  // !CONFIG_REALTIME_ONLY
 
@@ -462,11 +458,6 @@ static int cost_coeffs(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size,
   return cost;
 }
 
-static INLINE int num_4x4_to_edge(int plane_4x4_dim, int mb_to_edge_dim,
-                                  int subsampling_dim, int blk_dim) {
-  return plane_4x4_dim + (mb_to_edge_dim >> (5 + subsampling_dim)) - blk_dim;
-}
-
 // Copy all visible 4x4s in the transform block.
 static void copy_block_visible(const MACROBLOCKD *xd,
                                const struct macroblockd_plane *const pd,
@@ -567,47 +558,11 @@ static unsigned pixel_sse(const VP9_COMP *const cpi, const MACROBLOCKD *xd,
   return sse;
 }
 
-// Compute the squares sum squares on all visible 4x4s in the transform block.
-static int64_t sum_squares_visible(const MACROBLOCKD *xd,
-                                   const struct macroblockd_plane *const pd,
-                                   const int16_t *diff, const int diff_stride,
-                                   int blk_row, int blk_col,
-                                   const BLOCK_SIZE plane_bsize,
-                                   const BLOCK_SIZE tx_bsize) {
-  int64_t sse;
-  const int plane_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
-  const int plane_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
-  const int tx_4x4_w = num_4x4_blocks_wide_lookup[tx_bsize];
-  const int tx_4x4_h = num_4x4_blocks_high_lookup[tx_bsize];
-  int b4x4s_to_right_edge = num_4x4_to_edge(plane_4x4_w, xd->mb_to_right_edge,
-                                            pd->subsampling_x, blk_col);
-  int b4x4s_to_bottom_edge = num_4x4_to_edge(plane_4x4_h, xd->mb_to_bottom_edge,
-                                             pd->subsampling_y, blk_row);
-  if (tx_bsize == BLOCK_4X4 ||
-      (b4x4s_to_right_edge >= tx_4x4_w && b4x4s_to_bottom_edge >= tx_4x4_h)) {
-    assert(tx_4x4_w == tx_4x4_h);
-    sse = (int64_t)vpx_sum_squares_2d_i16(diff, diff_stride, tx_4x4_w << 2);
-  } else {
-    int r, c;
-    int max_r = VPXMIN(b4x4s_to_bottom_edge, tx_4x4_h);
-    int max_c = VPXMIN(b4x4s_to_right_edge, tx_4x4_w);
-    sse = 0;
-    // if we are in the unrestricted motion border.
-    for (r = 0; r < max_r; ++r) {
-      // Skip visiting the sub blocks that are wholly within the UMV.
-      for (c = 0; c < max_c; ++c) {
-        sse += (int64_t)vpx_sum_squares_2d_i16(
-            diff + r * diff_stride * 4 + c * 4, diff_stride, 4);
-      }
-    }
-  }
-  return sse;
-}
-
 static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane,
                        BLOCK_SIZE plane_bsize, int block, int blk_row,
                        int blk_col, TX_SIZE tx_size, int64_t *out_dist,
-                       int64_t *out_sse, struct buf_2d *out_recon) {
+                       int64_t *out_sse, struct buf_2d *out_recon,
+                       int sse_calc_done) {
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &xd->plane[plane];
@@ -633,15 +588,15 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane,
 
     if (x->skip_encode && !is_inter_block(xd->mi[0])) {
       // TODO(jingning): tune the model to better capture the distortion.
-      const int64_t p =
+      const int64_t mean_quant_error =
           (pd->dequant[1] * pd->dequant[1] * (1 << ss_txfrm_size)) >>
 #if CONFIG_VP9_HIGHBITDEPTH
           (shift + 2 + (bd - 8) * 2);
 #else
           (shift + 2);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-      *out_dist += (p >> 4);
-      *out_sse += p;
+      *out_dist += (mean_quant_error >> 4);
+      *out_sse += mean_quant_error;
     }
   } else {
     const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
@@ -657,8 +612,12 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane,
     const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
     unsigned int tmp;
 
-    tmp = pixel_sse(cpi, xd, pd, src, src_stride, dst, dst_stride, blk_row,
-                    blk_col, plane_bsize, tx_bsize);
+    if (sse_calc_done) {
+      tmp = (unsigned int)(*out_sse);
+    } else {
+      tmp = pixel_sse(cpi, xd, pd, src, src_stride, dst, dst_stride, blk_row,
+                      blk_col, plane_bsize, tx_bsize);
+    }
     *out_sse = (int64_t)tmp * 16;
     if (out_recon) {
       const int out_recon_idx = 4 * (blk_row * out_recon->stride + blk_col);
@@ -745,8 +704,8 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
   MODE_INFO *const mi = xd->mi[0];
   int64_t rd1, rd2, rd;
   int rate;
-  int64_t dist;
-  int64_t sse;
+  int64_t dist = INT64_MAX;
+  int64_t sse = INT64_MAX;
   const int coeff_ctx =
       combine_entropy_contexts(args->t_left[blk_row], args->t_above[blk_col]);
   struct buf_2d *recon = args->this_recon;
@@ -754,20 +713,29 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   const int dst_stride = pd->dst.stride;
   const uint8_t *dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)];
+  const int enable_trellis_opt = args->cpi->sf.trellis_opt_tx_rd.method;
+  const double trellis_opt_thresh = args->cpi->sf.trellis_opt_tx_rd.thresh;
+  int sse_calc_done = 0;
+#if CONFIG_MISMATCH_DEBUG
+  struct encode_b_args encode_b_arg = {
+    x,    enable_trellis_opt, trellis_opt_thresh, &sse_calc_done,
+    &sse, args->t_above,      args->t_left,       &mi->skip,
+    0,  // mi_row
+    0,  // mi_col
+    0   // output_enabled
+  };
+#else
+  struct encode_b_args encode_b_arg = {
+    x,    enable_trellis_opt, trellis_opt_thresh, &sse_calc_done,
+    &sse, args->t_above,      args->t_left,       &mi->skip
+  };
+#endif
 
   if (args->exit_early) return;
 
   if (!is_inter_block(mi)) {
-#if CONFIG_MISMATCH_DEBUG
-    struct encode_b_args intra_arg = {
-      x, x->block_qcoeff_opt, args->t_above, args->t_left, &mi->skip, 0, 0, 0
-    };
-#else
-    struct encode_b_args intra_arg = { x, x->block_qcoeff_opt, args->t_above,
-                                       args->t_left, &mi->skip };
-#endif
     vp9_encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size,
-                           &intra_arg);
+                           &encode_b_arg);
     if (recon) {
       uint8_t *rec_ptr = &recon->buf[4 * (blk_row * recon->stride + blk_col)];
       copy_block_visible(xd, pd, dst, dst_stride, rec_ptr, recon->stride,
@@ -775,16 +743,21 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
     }
     if (x->block_tx_domain) {
       dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col,
-                 tx_size, &dist, &sse, /*recon =*/0);
+                 tx_size, &dist, &sse, /*out_recon=*/NULL, sse_calc_done);
     } else {
       const struct macroblock_plane *const p = &x->plane[plane];
       const int src_stride = p->src.stride;
-      const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
       const uint8_t *src = &p->src.buf[4 * (blk_row * src_stride + blk_col)];
-      const int16_t *diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
       unsigned int tmp;
-      sse = sum_squares_visible(xd, pd, diff, diff_stride, blk_row, blk_col,
-                                plane_bsize, tx_bsize);
+      if (!sse_calc_done) {
+        const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+        const int16_t *diff =
+            &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+        int visible_width, visible_height;
+        sse = sum_squares_visible(xd, pd, diff, diff_stride, blk_row, blk_col,
+                                  plane_bsize, tx_bsize, &visible_width,
+                                  &visible_height);
+      }
 #if CONFIG_VP9_HIGHBITDEPTH
       if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && (xd->bd > 8))
         sse = ROUND64_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
@@ -799,14 +772,27 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
     if (max_txsize_lookup[plane_bsize] == tx_size)
       skip_txfm_flag = x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))];
 
+    // This reduces the risk of bad perceptual quality due to bad prediction.
+    // We always force the encoder to perform transform and quantization.
+    if (!args->cpi->sf.allow_skip_txfm_ac_dc &&
+        skip_txfm_flag == SKIP_TXFM_AC_DC) {
+      skip_txfm_flag = SKIP_TXFM_NONE;
+    }
+
     if (skip_txfm_flag == SKIP_TXFM_NONE ||
         (recon && skip_txfm_flag == SKIP_TXFM_AC_ONLY)) {
+      const struct macroblock_plane *const p = &x->plane[plane];
+      const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+      const int16_t *const diff =
+          &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+      const int use_trellis_opt =
+          do_trellis_opt(pd, diff, diff_stride, blk_row, blk_col, plane_bsize,
+                         tx_size, &encode_b_arg);
       // full forward transform and quantization
       vp9_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size);
-      if (x->block_qcoeff_opt)
-        vp9_optimize_b(x, plane, block, tx_size, coeff_ctx);
+      if (use_trellis_opt) vp9_optimize_b(x, plane, block, tx_size, coeff_ctx);
       dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col,
-                 tx_size, &dist, &sse, recon);
+                 tx_size, &dist, &sse, recon, sse_calc_done);
     } else if (skip_txfm_flag == SKIP_TXFM_AC_ONLY) {
       // compute DC coefficient
       tran_low_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block);
@@ -827,17 +813,7 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
         dist = VPXMAX(0, sse - dc_correct);
       }
     } else {
-      // SKIP_TXFM_AC_DC
-      // skip forward transform. Because this is handled here, the quantization
-      // does not need to do it.
-      x->plane[plane].eobs[block] = 0;
-      sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
-      dist = sse;
-      if (recon) {
-        uint8_t *rec_ptr = &recon->buf[4 * (blk_row * recon->stride + blk_col)];
-        copy_block_visible(xd, pd, dst, dst_stride, rec_ptr, recon->stride,
-                           blk_row, blk_col, plane_bsize, tx_bsize);
-      }
+      assert(0 && "allow_skip_txfm_ac_dc does not allow SKIP_TXFM_AC_DC.");
     }
   }
 
@@ -1111,6 +1087,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
 
   xd->mi[0]->tx_size = TX_4X4;
 
+  assert(!x->skip_block);
+
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
@@ -1138,7 +1116,10 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
           uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
           int16_t *const src_diff =
               vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
-          tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
+          tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+          tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+          tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+          uint16_t *const eob = &p->eobs[block];
           xd->mi[0]->bmi[block].as_mode = mode;
           vp9_predict_intra_block(xd, 1, TX_4X4, mode,
                                   x->skip_encode ? src : dst,
@@ -1147,11 +1128,12 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
           vpx_highbd_subtract_block(4, 4, src_diff, 8, src, src_stride, dst,
                                     dst_stride, xd->bd);
           if (xd->lossless) {
-            const scan_order *so = &vp9_default_scan_orders[TX_4X4];
+            const ScanOrder *so = &vp9_default_scan_orders[TX_4X4];
             const int coeff_ctx =
                 combine_entropy_contexts(tempa[idx], templ[idy]);
             vp9_highbd_fwht4x4(src_diff, coeff, 8);
-            vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+            vpx_highbd_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant,
+                                  eob, so);
             ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan,
                                  so->neighbors, cpi->sf.use_fast_coef_costing);
             tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0 ? 1 : 0);
@@ -1162,14 +1144,15 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
           } else {
             int64_t unused;
             const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block);
-            const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type];
+            const ScanOrder *so = &vp9_scan_orders[TX_4X4][tx_type];
             const int coeff_ctx =
                 combine_entropy_contexts(tempa[idx], templ[idy]);
             if (tx_type == DCT_DCT)
               vpx_highbd_fdct4x4(src_diff, coeff, 8);
             else
               vp9_highbd_fht4x4(src_diff, coeff, 8, tx_type);
-            vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+            vpx_highbd_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant,
+                                  eob, so);
             ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan,
                                  so->neighbors, cpi->sf.use_fast_coef_costing);
             distortion += vp9_highbd_block_error_dispatch(
@@ -1239,7 +1222,10 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
         uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
         int16_t *const src_diff =
             vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
-        tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
+        tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+        tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+        tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+        uint16_t *const eob = &p->eobs[block];
         xd->mi[0]->bmi[block].as_mode = mode;
         vp9_predict_intra_block(xd, 1, TX_4X4, mode, x->skip_encode ? src : dst,
                                 x->skip_encode ? src_stride : dst_stride, dst,
@@ -1247,11 +1233,12 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
         vpx_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride);
 
         if (xd->lossless) {
-          const scan_order *so = &vp9_default_scan_orders[TX_4X4];
+          const ScanOrder *so = &vp9_default_scan_orders[TX_4X4];
           const int coeff_ctx =
               combine_entropy_contexts(tempa[idx], templ[idy]);
           vp9_fwht4x4(src_diff, coeff, 8);
-          vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+          vpx_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, eob,
+                         so);
           ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan,
                                so->neighbors, cpi->sf.use_fast_coef_costing);
           tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 1 : 0;
@@ -1262,11 +1249,12 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
         } else {
           int64_t unused;
           const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block);
-          const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type];
+          const ScanOrder *so = &vp9_scan_orders[TX_4X4][tx_type];
           const int coeff_ctx =
               combine_entropy_contexts(tempa[idx], templ[idy]);
           vp9_fht4x4(src_diff, coeff, 8, tx_type);
-          vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+          vpx_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, eob,
+                         so);
           ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan,
                                so->neighbors, cpi->sf.use_fast_coef_costing);
           tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 1 : 0;
@@ -1403,7 +1391,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
     mic->mode = mode;
 
     super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL,
-                    bsize, best_rd, /*recon = */ 0);
+                    bsize, best_rd, /*recon=*/NULL);
 
     if (this_rate_tokenonly == INT_MAX) continue;
 
@@ -1443,7 +1431,6 @@ static int super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x, int *rate,
   if (ref_best_rd < 0) is_cost_valid = 0;
 
   if (is_inter_block(mi) && is_cost_valid) {
-    int plane;
     for (plane = 1; plane < MAX_MB_PLANE; ++plane)
       vp9_subtract_plane(x, bsize, plane);
   }
@@ -1456,7 +1443,7 @@ static int super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x, int *rate,
   for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
     txfm_rd_in_plane(cpi, x, &pnrate, &pndist, &pnskip, &pnsse, ref_best_rd,
                      plane, bsize, uv_tx_size, cpi->sf.use_fast_coef_costing,
-                     /*recon = */ 0);
+                     /*recon=*/NULL);
     if (pnrate == INT_MAX) {
       is_cost_valid = 0;
       break;
@@ -1639,10 +1626,12 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x,
       &pd->dst.buf[vp9_raster_block_offset(BLOCK_8X8, i, pd->dst.stride)];
   int64_t thisdistortion = 0, thissse = 0;
   int thisrate = 0, ref;
-  const scan_order *so = &vp9_default_scan_orders[TX_4X4];
+  const ScanOrder *so = &vp9_default_scan_orders[TX_4X4];
   const int is_compound = has_second_ref(mi);
   const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter];
 
+  assert(!x->skip_block);
+
   for (ref = 0; ref < 1 + is_compound; ++ref) {
     const int bw = b_width_log2_lookup[BLOCK_8X8];
     const int h = 4 * (i >> bw);
@@ -1704,18 +1693,25 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x,
       const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;
 #endif
       int64_t ssz, rd, rd1, rd2;
-      tran_low_t *coeff;
+      tran_low_t *coeff, *qcoeff, *dqcoeff;
+      uint16_t *eob;
       int coeff_ctx;
       k += (idy * 2 + idx);
       coeff_ctx = combine_entropy_contexts(ta[k & 1], tl[k >> 1]);
       coeff = BLOCK_OFFSET(p->coeff, k);
+      qcoeff = BLOCK_OFFSET(p->qcoeff, k);
+      dqcoeff = BLOCK_OFFSET(pd->dqcoeff, k);
+      eob = &p->eobs[k];
+
       x->fwd_txfm4x4(vp9_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
                      coeff, 8);
-      vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan);
 #if CONFIG_VP9_HIGHBITDEPTH
+      vpx_highbd_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, eob,
+                            so);
       thisdistortion += vp9_highbd_block_error_dispatch(
           coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz, bd);
 #else
+      vpx_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, eob, so);
       thisdistortion +=
           vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
@@ -1809,7 +1805,7 @@ static int check_best_zero_mv(const VP9_COMP *cpi,
                               const MV_REFERENCE_FRAME ref_frames[2]) {
   if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
       frame_mv[this_mode][ref_frames[0]].as_int == 0 &&
-      (ref_frames[1] == NONE ||
+      (ref_frames[1] == NO_REF_FRAME ||
        frame_mv[this_mode][ref_frames[1]].as_int == 0)) {
     int rfc = mode_context[ref_frames[0]];
     int c1 = cost_mv_ref(cpi, NEARMV, rfc);
@@ -1822,7 +1818,7 @@ static int check_best_zero_mv(const VP9_COMP *cpi,
       if (c2 > c3) return 0;
     } else {
       assert(this_mode == ZEROMV);
-      if (ref_frames[1] == NONE) {
+      if (ref_frames[1] == NO_REF_FRAME) {
         if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0) ||
             (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0))
           return 0;
@@ -1838,10 +1834,80 @@ static int check_best_zero_mv(const VP9_COMP *cpi,
   return 1;
 }
 
+static INLINE int skip_iters(const int_mv iter_mvs[][2], int ite, int id) {
+  if (ite >= 2 && iter_mvs[ite - 2][!id].as_int == iter_mvs[ite][!id].as_int) {
+    int_mv cur_fullpel_mv, prev_fullpel_mv;
+    cur_fullpel_mv.as_mv.row = iter_mvs[ite][id].as_mv.row >> 3;
+    cur_fullpel_mv.as_mv.col = iter_mvs[ite][id].as_mv.col >> 3;
+    prev_fullpel_mv.as_mv.row = iter_mvs[ite - 2][id].as_mv.row >> 3;
+    prev_fullpel_mv.as_mv.col = iter_mvs[ite - 2][id].as_mv.col >> 3;
+    if (cur_fullpel_mv.as_int == prev_fullpel_mv.as_int) return 1;
+  }
+  return 0;
+}
+
+// Compares motion vector and mode rate of current mode and given mode.
+static INLINE int compare_mv_mode_rate(MV this_mv, MV mode_mv,
+                                       int this_mode_rate, int mode_rate,
+                                       int mv_thresh) {
+  const int mv_diff =
+      abs(mode_mv.col - this_mv.col) + abs(mode_mv.row - this_mv.row);
+  if (mv_diff <= mv_thresh && mode_rate < this_mode_rate) return 1;
+  return 0;
+}
+
+// Skips single reference inter modes NEARMV and ZEROMV based on motion vector
+// difference and mode rate.
+static INLINE int skip_single_mode_based_on_mode_rate(
+    int_mv (*mode_mv)[MAX_REF_FRAMES], int *single_mode_rate, int this_mode,
+    int ref0, int this_mode_rate, int best_mode_index) {
+  MV this_mv = mode_mv[this_mode][ref0].as_mv;
+  const int mv_thresh = 3;
+
+  // Pruning is not applicable for NEARESTMV or NEWMV modes.
+  if (this_mode == NEARESTMV || this_mode == NEWMV) return 0;
+  // Pruning is not done when reference frame of the mode is same as best
+  // reference so far.
+  if (best_mode_index > 0 &&
+      ref0 == vp9_mode_order[best_mode_index].ref_frame[0])
+    return 0;
+
+  // Check absolute mv difference and mode rate of current mode w.r.t NEARESTMV
+  if (compare_mv_mode_rate(
+          this_mv, mode_mv[NEARESTMV][ref0].as_mv, this_mode_rate,
+          single_mode_rate[INTER_OFFSET(NEARESTMV)], mv_thresh))
+    return 1;
+
+  // Check absolute mv difference and mode rate of current mode w.r.t NEWMV
+  if (compare_mv_mode_rate(this_mv, mode_mv[NEWMV][ref0].as_mv, this_mode_rate,
+                           single_mode_rate[INTER_OFFSET(NEWMV)], mv_thresh))
+    return 1;
+
+  // Pruning w.r.t NEARMV is applicable only for ZEROMV mode
+  if (this_mode == NEARMV) return 0;
+  // Check absolute mv difference and mode rate of current mode w.r.t NEARMV
+  if (compare_mv_mode_rate(this_mv, mode_mv[NEARMV][ref0].as_mv, this_mode_rate,
+                           single_mode_rate[INTER_OFFSET(NEARMV)], mv_thresh))
+    return 1;
+  return 0;
+}
+
+#define MAX_JOINT_MV_SEARCH_ITERS 4
+static INLINE int get_joint_search_iters(int sf_level, BLOCK_SIZE bsize) {
+  int num_iters = MAX_JOINT_MV_SEARCH_ITERS;  // sf_level = 0
+  if (sf_level >= 2)
+    num_iters = 0;
+  else if (sf_level >= 1)
+    num_iters = bsize < BLOCK_8X8
+                    ? 0
+                    : (bsize <= BLOCK_16X16 ? 2 : MAX_JOINT_MV_SEARCH_ITERS);
+  return num_iters;
+}
+
 static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
                                 int_mv *frame_mv, int mi_row, int mi_col,
                                 int_mv single_newmv[MAX_REF_FRAMES],
-                                int *rate_mv) {
+                                int *rate_mv, int num_iters) {
   const VP9_COMMON *const cm = &cpi->common;
   const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
   const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
@@ -1850,6 +1916,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
   const int refs[2] = { mi->ref_frame[0],
                         mi->ref_frame[1] < 0 ? 0 : mi->ref_frame[1] };
   int_mv ref_mv[2];
+  int_mv iter_mvs[MAX_JOINT_MV_SEARCH_ITERS][2];
   int ite, ref;
   const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter];
   struct scale_factors sf;
@@ -1864,12 +1931,15 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
 
 // Prediction buffer from second frame.
 #if CONFIG_VP9_HIGHBITDEPTH
-  DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[64 * 64]);
+  DECLARE_ALIGNED(32, uint16_t, second_pred_alloc_16[64 * 64]);
   uint8_t *second_pred;
 #else
-  DECLARE_ALIGNED(16, uint8_t, second_pred[64 * 64]);
+  DECLARE_ALIGNED(32, uint8_t, second_pred[64 * 64]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
+  // Check number of iterations do not exceed the max
+  assert(num_iters <= MAX_JOINT_MV_SEARCH_ITERS);
+
   for (ref = 0; ref < 2; ++ref) {
     ref_mv[ref] = x->mbmi_ext->ref_mvs[refs[ref]][0];
 
@@ -1885,6 +1955,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
     }
 
     frame_mv[refs[ref]].as_int = single_newmv[refs[ref]].as_int;
+    iter_mvs[0][ref].as_int = single_newmv[refs[ref]].as_int;
   }
 
 // Since we have scaled the reference frames to match the size of the current
@@ -1899,7 +1970,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
 
   // Allow joint search multiple times iteratively for each reference frame
   // and break out of the search loop if it couldn't find a better mv.
-  for (ite = 0; ite < 4; ite++) {
+  for (ite = 0; ite < num_iters; ite++) {
     struct buf_2d ref_yv12[2];
     uint32_t bestsme = UINT_MAX;
     int sadpb = x->sadperbit16;
@@ -1911,6 +1982,11 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
                        // odd iterations search in the second. The predictor
                        // found for the 'other' reference frame is factored in.
 
+    // Skip further iterations of search if in the previous iteration, the
+    // motion vector of the searched ref frame is unchanged, and the other ref
+    // frame's full-pixel mv is unchanged.
+    if (skip_iters(iter_mvs, ite, id)) break;
+
     // Initialized here because of compiler problem in Visual Studio.
     ref_yv12[0] = xd->plane[0].pre[0];
     ref_yv12[1] = xd->plane[0].pre[1];
@@ -1976,6 +2052,10 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
     } else {
       break;
     }
+    if (ite < num_iters - 1) {
+      iter_mvs[ite + 1][0].as_int = frame_mv[refs[0]].as_int;
+      iter_mvs[ite + 1][1].as_int = frame_mv[refs[1]].as_int;
+    }
   }
 
   *rate_mv = 0;
@@ -1996,7 +2076,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
 
 static int64_t rd_pick_best_sub8x8_mode(
     VP9_COMP *cpi, MACROBLOCK *x, int_mv *best_ref_mv,
-    int_mv *second_best_ref_mv, int64_t best_rd, int *returntotrate,
+    int_mv *second_best_ref_mv, int64_t best_rd_so_far, int *returntotrate,
     int *returnyrate, int64_t *returndistortion, int *skippable, int64_t *psse,
     int mvthresh, int_mv seg_mvs[4][MAX_REF_FRAMES], BEST_SEG_INFO *bsi_buf,
     int filter_idx, int mi_row, int mi_col) {
@@ -2029,7 +2109,7 @@ static int64_t rd_pick_best_sub8x8_mode(
 
   vp9_zero(*bsi);
 
-  bsi->segment_rd = best_rd;
+  bsi->segment_rd = best_rd_so_far;
   bsi->ref_mv[0] = best_ref_mv;
   bsi->ref_mv[1] = second_best_ref_mv;
   bsi->mvp.as_int = best_ref_mv->as_int;
@@ -2055,14 +2135,14 @@ static int64_t rd_pick_best_sub8x8_mode(
       int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
       PREDICTION_MODE mode_selected = ZEROMV;
       int64_t best_rd = INT64_MAX;
-      const int i = idy * 2 + idx;
+      const int block = idy * 2 + idx;
       int ref;
 
       for (ref = 0; ref < 1 + has_second_rf; ++ref) {
         const MV_REFERENCE_FRAME frame = mi->ref_frame[ref];
         frame_mv[ZEROMV][frame].as_int = 0;
         vp9_append_sub8x8_mvs_for_idx(
-            cm, xd, i, ref, mi_row, mi_col, &frame_mv[NEARESTMV][frame],
+            cm, xd, block, ref, mi_row, mi_col, &frame_mv[NEARESTMV][frame],
             &frame_mv[NEARMV][frame], mbmi_ext->mode_context);
       }
 
@@ -2072,7 +2152,7 @@ static int64_t rd_pick_best_sub8x8_mode(
         struct buf_2d orig_pre[2];
 
         mode_idx = INTER_OFFSET(this_mode);
-        bsi->rdstat[i][mode_idx].brdcost = INT64_MAX;
+        bsi->rdstat[block][mode_idx].brdcost = INT64_MAX;
         if (!(inter_mode_mask & (1 << this_mode))) continue;
 
         if (!check_best_zero_mv(cpi, mbmi_ext->mode_context, frame_mv,
@@ -2080,14 +2160,14 @@ static int64_t rd_pick_best_sub8x8_mode(
           continue;
 
         memcpy(orig_pre, pd->pre, sizeof(orig_pre));
-        memcpy(bsi->rdstat[i][mode_idx].ta, t_above,
-               sizeof(bsi->rdstat[i][mode_idx].ta));
-        memcpy(bsi->rdstat[i][mode_idx].tl, t_left,
-               sizeof(bsi->rdstat[i][mode_idx].tl));
+        memcpy(bsi->rdstat[block][mode_idx].ta, t_above,
+               sizeof(bsi->rdstat[block][mode_idx].ta));
+        memcpy(bsi->rdstat[block][mode_idx].tl, t_left,
+               sizeof(bsi->rdstat[block][mode_idx].tl));
 
         // motion search for newmv (single predictor case only)
         if (!has_second_rf && this_mode == NEWMV &&
-            seg_mvs[i][mi->ref_frame[0]].as_int == INVALID_MV) {
+            seg_mvs[block][mi->ref_frame[0]].as_int == INVALID_MV) {
           MV *const new_mv = &mode_mv[NEWMV][0].as_mv;
           int step_param = 0;
           uint32_t bestsme = UINT_MAX;
@@ -2097,18 +2177,19 @@ static int64_t rd_pick_best_sub8x8_mode(
           int cost_list[5];
           const MvLimits tmp_mv_limits = x->mv_limits;
 
-          /* Is the best so far sufficiently good that we cant justify doing
+          /* Is the best so far sufficiently good that we can't justify doing
            * and new motion search. */
           if (best_rd < label_mv_thresh) break;
 
           if (cpi->oxcf.mode != BEST) {
             // use previous block's result as next block's MV predictor.
-            if (i > 0) {
-              bsi->mvp.as_int = mi->bmi[i - 1].as_mv[0].as_int;
-              if (i == 2) bsi->mvp.as_int = mi->bmi[i - 2].as_mv[0].as_int;
+            if (block > 0) {
+              bsi->mvp.as_int = mi->bmi[block - 1].as_mv[0].as_int;
+              if (block == 2)
+                bsi->mvp.as_int = mi->bmi[block - 2].as_mv[0].as_int;
             }
           }
-          if (i == 0)
+          if (block == 0)
             max_mv = x->max_mv_context[mi->ref_frame[0]];
           else
             max_mv =
@@ -2137,7 +2218,7 @@ static int64_t rd_pick_best_sub8x8_mode(
           }
 
           // adjust src pointer for this block
-          mi_buf_shift(x, i);
+          mi_buf_shift(x, block);
 
           vp9_set_mv_search_range(&x->mv_limits, &bsi->ref_mv[0]->as_mv);
 
@@ -2160,7 +2241,7 @@ static int64_t rd_pick_best_sub8x8_mode(
                 cpi->sf.use_accurate_subpel_search);
 
             // save motion search result for use in compound prediction
-            seg_mvs[i][mi->ref_frame[0]].as_mv = *new_mv;
+            seg_mvs[block][mi->ref_frame[0]].as_mv = *new_mv;
           }
 
           x->pred_mv[mi->ref_frame[0]] = *new_mv;
@@ -2170,40 +2251,44 @@ static int64_t rd_pick_best_sub8x8_mode(
         }
 
         if (has_second_rf) {
-          if (seg_mvs[i][mi->ref_frame[1]].as_int == INVALID_MV ||
-              seg_mvs[i][mi->ref_frame[0]].as_int == INVALID_MV)
+          if (seg_mvs[block][mi->ref_frame[1]].as_int == INVALID_MV ||
+              seg_mvs[block][mi->ref_frame[0]].as_int == INVALID_MV)
             continue;
         }
 
         if (has_second_rf && this_mode == NEWMV &&
             mi->interp_filter == EIGHTTAP) {
+          // Decide number of joint motion search iterations
+          const int num_joint_search_iters = get_joint_search_iters(
+              cpi->sf.comp_inter_joint_search_iter_level, bsize);
           // adjust src pointers
-          mi_buf_shift(x, i);
-          if (sf->comp_inter_joint_search_thresh <= bsize) {
+          mi_buf_shift(x, block);
+          if (num_joint_search_iters) {
             int rate_mv;
             joint_motion_search(cpi, x, bsize, frame_mv[this_mode], mi_row,
-                                mi_col, seg_mvs[i], &rate_mv);
-            seg_mvs[i][mi->ref_frame[0]].as_int =
+                                mi_col, seg_mvs[block], &rate_mv,
+                                num_joint_search_iters);
+            seg_mvs[block][mi->ref_frame[0]].as_int =
                 frame_mv[this_mode][mi->ref_frame[0]].as_int;
-            seg_mvs[i][mi->ref_frame[1]].as_int =
+            seg_mvs[block][mi->ref_frame[1]].as_int =
                 frame_mv[this_mode][mi->ref_frame[1]].as_int;
           }
           // restore src pointers
           mi_buf_restore(x, orig_src, orig_pre);
         }
 
-        bsi->rdstat[i][mode_idx].brate = set_and_cost_bmi_mvs(
-            cpi, x, xd, i, this_mode, mode_mv[this_mode], frame_mv, seg_mvs[i],
-            bsi->ref_mv, x->nmvjointcost, x->mvcost);
+        bsi->rdstat[block][mode_idx].brate = set_and_cost_bmi_mvs(
+            cpi, x, xd, block, this_mode, mode_mv[this_mode], frame_mv,
+            seg_mvs[block], bsi->ref_mv, x->nmvjointcost, x->mvcost);
 
         for (ref = 0; ref < 1 + has_second_rf; ++ref) {
-          bsi->rdstat[i][mode_idx].mvs[ref].as_int =
+          bsi->rdstat[block][mode_idx].mvs[ref].as_int =
               mode_mv[this_mode][ref].as_int;
           if (num_4x4_blocks_wide > 1)
-            bsi->rdstat[i + 1][mode_idx].mvs[ref].as_int =
+            bsi->rdstat[block + 1][mode_idx].mvs[ref].as_int =
                 mode_mv[this_mode][ref].as_int;
           if (num_4x4_blocks_high > 1)
-            bsi->rdstat[i + 2][mode_idx].mvs[ref].as_int =
+            bsi->rdstat[block + 2][mode_idx].mvs[ref].as_int =
                 mode_mv[this_mode][ref].as_int;
         }
 
@@ -2221,7 +2306,7 @@ static int64_t rd_pick_best_sub8x8_mode(
           for (ref = 0; ref < 1 + has_second_rf; ++ref) {
             subpelmv |= mv_has_subpel(&mode_mv[this_mode][ref].as_mv);
             have_ref &= mode_mv[this_mode][ref].as_int ==
-                        ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int;
+                        ref_bsi->rdstat[block][mode_idx].mvs[ref].as_int;
           }
 
           if (filter_idx > 1 && !subpelmv && !have_ref) {
@@ -2229,53 +2314,55 @@ static int64_t rd_pick_best_sub8x8_mode(
             have_ref = 1;
             for (ref = 0; ref < 1 + has_second_rf; ++ref)
               have_ref &= mode_mv[this_mode][ref].as_int ==
-                          ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int;
+                          ref_bsi->rdstat[block][mode_idx].mvs[ref].as_int;
           }
 
           if (!subpelmv && have_ref &&
-              ref_bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
-            memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx],
-                   sizeof(SEG_RDSTAT));
+              ref_bsi->rdstat[block][mode_idx].brdcost < INT64_MAX) {
+            memcpy(&bsi->rdstat[block][mode_idx],
+                   &ref_bsi->rdstat[block][mode_idx], sizeof(SEG_RDSTAT));
             if (num_4x4_blocks_wide > 1)
-              bsi->rdstat[i + 1][mode_idx].eobs =
-                  ref_bsi->rdstat[i + 1][mode_idx].eobs;
+              bsi->rdstat[block + 1][mode_idx].eobs =
+                  ref_bsi->rdstat[block + 1][mode_idx].eobs;
             if (num_4x4_blocks_high > 1)
-              bsi->rdstat[i + 2][mode_idx].eobs =
-                  ref_bsi->rdstat[i + 2][mode_idx].eobs;
+              bsi->rdstat[block + 2][mode_idx].eobs =
+                  ref_bsi->rdstat[block + 2][mode_idx].eobs;
 
-            if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
+            if (bsi->rdstat[block][mode_idx].brdcost < best_rd) {
               mode_selected = this_mode;
-              best_rd = bsi->rdstat[i][mode_idx].brdcost;
+              best_rd = bsi->rdstat[block][mode_idx].brdcost;
             }
             continue;
           }
         }
 
-        bsi->rdstat[i][mode_idx].brdcost = encode_inter_mb_segment(
-            cpi, x, bsi->segment_rd - this_segment_rd, i,
-            &bsi->rdstat[i][mode_idx].byrate, &bsi->rdstat[i][mode_idx].bdist,
-            &bsi->rdstat[i][mode_idx].bsse, bsi->rdstat[i][mode_idx].ta,
-            bsi->rdstat[i][mode_idx].tl, mi_row, mi_col);
-        if (bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
-          bsi->rdstat[i][mode_idx].brdcost +=
-              RDCOST(x->rdmult, x->rddiv, bsi->rdstat[i][mode_idx].brate, 0);
-          bsi->rdstat[i][mode_idx].brate += bsi->rdstat[i][mode_idx].byrate;
-          bsi->rdstat[i][mode_idx].eobs = p->eobs[i];
+        bsi->rdstat[block][mode_idx].brdcost = encode_inter_mb_segment(
+            cpi, x, bsi->segment_rd - this_segment_rd, block,
+            &bsi->rdstat[block][mode_idx].byrate,
+            &bsi->rdstat[block][mode_idx].bdist,
+            &bsi->rdstat[block][mode_idx].bsse, bsi->rdstat[block][mode_idx].ta,
+            bsi->rdstat[block][mode_idx].tl, mi_row, mi_col);
+        if (bsi->rdstat[block][mode_idx].brdcost < INT64_MAX) {
+          bsi->rdstat[block][mode_idx].brdcost += RDCOST(
+              x->rdmult, x->rddiv, bsi->rdstat[block][mode_idx].brate, 0);
+          bsi->rdstat[block][mode_idx].brate +=
+              bsi->rdstat[block][mode_idx].byrate;
+          bsi->rdstat[block][mode_idx].eobs = p->eobs[block];
           if (num_4x4_blocks_wide > 1)
-            bsi->rdstat[i + 1][mode_idx].eobs = p->eobs[i + 1];
+            bsi->rdstat[block + 1][mode_idx].eobs = p->eobs[block + 1];
           if (num_4x4_blocks_high > 1)
-            bsi->rdstat[i + 2][mode_idx].eobs = p->eobs[i + 2];
+            bsi->rdstat[block + 2][mode_idx].eobs = p->eobs[block + 2];
         }
 
-        if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
+        if (bsi->rdstat[block][mode_idx].brdcost < best_rd) {
           mode_selected = this_mode;
-          best_rd = bsi->rdstat[i][mode_idx].brdcost;
+          best_rd = bsi->rdstat[block][mode_idx].brdcost;
         }
       } /*for each 4x4 mode*/
 
       if (best_rd == INT64_MAX) {
         int iy, midx;
-        for (iy = i + 1; iy < 4; ++iy)
+        for (iy = block + 1; iy < 4; ++iy)
           for (midx = 0; midx < INTER_MODES; ++midx)
             bsi->rdstat[iy][midx].brdcost = INT64_MAX;
         bsi->segment_rd = INT64_MAX;
@@ -2283,22 +2370,22 @@ static int64_t rd_pick_best_sub8x8_mode(
       }
 
       mode_idx = INTER_OFFSET(mode_selected);
-      memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above));
-      memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left));
+      memcpy(t_above, bsi->rdstat[block][mode_idx].ta, sizeof(t_above));
+      memcpy(t_left, bsi->rdstat[block][mode_idx].tl, sizeof(t_left));
 
-      set_and_cost_bmi_mvs(cpi, x, xd, i, mode_selected, mode_mv[mode_selected],
-                           frame_mv, seg_mvs[i], bsi->ref_mv, x->nmvjointcost,
-                           x->mvcost);
+      set_and_cost_bmi_mvs(cpi, x, xd, block, mode_selected,
+                           mode_mv[mode_selected], frame_mv, seg_mvs[block],
+                           bsi->ref_mv, x->nmvjointcost, x->mvcost);
 
-      br += bsi->rdstat[i][mode_idx].brate;
-      bd += bsi->rdstat[i][mode_idx].bdist;
-      block_sse += bsi->rdstat[i][mode_idx].bsse;
-      segmentyrate += bsi->rdstat[i][mode_idx].byrate;
-      this_segment_rd += bsi->rdstat[i][mode_idx].brdcost;
+      br += bsi->rdstat[block][mode_idx].brate;
+      bd += bsi->rdstat[block][mode_idx].bdist;
+      block_sse += bsi->rdstat[block][mode_idx].bsse;
+      segmentyrate += bsi->rdstat[block][mode_idx].byrate;
+      this_segment_rd += bsi->rdstat[block][mode_idx].brdcost;
 
       if (this_segment_rd > bsi->segment_rd) {
         int iy, midx;
-        for (iy = i + 1; iy < 4; ++iy)
+        for (iy = block + 1; iy < 4; ++iy)
           for (midx = 0; midx < INTER_MODES; ++midx)
             bsi->rdstat[iy][midx].brdcost = INT64_MAX;
         bsi->segment_rd = INT64_MAX;
@@ -2316,7 +2403,7 @@ static int64_t rd_pick_best_sub8x8_mode(
   // update the coding decisions
   for (k = 0; k < 4; ++k) bsi->modes[k] = mi->bmi[k].as_mode;
 
-  if (bsi->segment_rd > best_rd) return INT64_MAX;
+  if (bsi->segment_rd > best_rd_so_far) return INT64_MAX;
   /* set it to the best */
   for (i = 0; i < 4; i++) {
     mode_idx = INTER_OFFSET(bsi->modes[i]);
@@ -2561,9 +2648,9 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
           tmp_mv->as_int = INVALID_MV;
 
           if (scaled_ref_frame) {
-            int i;
-            for (i = 0; i < MAX_MB_PLANE; ++i)
-              xd->plane[i].pre[0] = backup_yv12[i];
+            int j;
+            for (j = 0; j < MAX_MB_PLANE; ++j)
+              xd->plane[j].pre[0] = backup_yv12[j];
           }
           return;
         }
@@ -2728,8 +2815,9 @@ static int64_t handle_inter_mode(
     struct buf_2d *recon, int *disable_skip, int_mv (*mode_mv)[MAX_REF_FRAMES],
     int mi_row, int mi_col, int_mv single_newmv[MAX_REF_FRAMES],
     INTERP_FILTER (*single_filter)[MAX_REF_FRAMES],
-    int (*single_skippable)[MAX_REF_FRAMES], int64_t *psse,
-    const int64_t ref_best_rd, int64_t *mask_filter, int64_t filter_cache[]) {
+    int (*single_skippable)[MAX_REF_FRAMES], int *single_mode_rate,
+    int64_t *psse, const int64_t ref_best_rd, int64_t *mask_filter,
+    int64_t filter_cache[], int best_mode_index) {
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   MODE_INFO *mi = xd->mi[0];
@@ -2747,9 +2835,8 @@ static int64_t handle_inter_mode(
 #else
   DECLARE_ALIGNED(16, uint8_t, tmp_buf[MAX_MB_PLANE * 64 * 64]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-  int pred_exists = 0;
   int intpel_mv;
-  int64_t rd, tmp_rd, best_rd = INT64_MAX;
+  int64_t rd, tmp_rd = INT64_MAX, best_rd = INT64_MAX;
   int best_needs_copy = 0;
   uint8_t *orig_dst[MAX_MB_PLANE];
   int orig_dst_stride[MAX_MB_PLANE];
@@ -2758,13 +2845,12 @@ static int64_t handle_inter_mode(
   uint8_t skip_txfm[MAX_MB_PLANE << 2] = { 0 };
   int64_t bsse[MAX_MB_PLANE << 2] = { 0 };
 
-  int bsl = mi_width_log2_lookup[bsize];
-  int pred_filter_search =
-      cpi->sf.cb_pred_filter_search
-          ? (((mi_row + mi_col) >> bsl) +
-             get_chessboard_index(cm->current_video_frame)) &
-                0x1
-          : 0;
+  const int bsl = mi_width_log2_lookup[bsize];
+  const int blk_parity = (((mi_row + mi_col) >> bsl) +
+                          get_chessboard_index(cm->current_video_frame)) &
+                         0x1;
+  const int pred_filter_search =
+      (cpi->sf.cb_pred_filter_search >= 2) && blk_parity;
 
   int skip_txfm_sb = 0;
   int64_t skip_sse_sb = INT64_MAX;
@@ -2803,13 +2889,23 @@ static int64_t handle_inter_mode(
   if (this_mode == NEWMV) {
     int rate_mv;
     if (is_comp_pred) {
+      // Decide number of joint motion search iterations
+      const int num_joint_search_iters = get_joint_search_iters(
+          cpi->sf.comp_inter_joint_search_iter_level, bsize);
+
       // Initialize mv using single prediction mode result.
       frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
       frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
 
-      if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
+      if (num_joint_search_iters) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+        start_timing(cpi, joint_motion_search_time);
+#endif
         joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col,
-                            single_newmv, &rate_mv);
+                            single_newmv, &rate_mv, num_joint_search_iters);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+        end_timing(cpi, joint_motion_search_time);
+#endif
       } else {
         rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv,
                                   &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,
@@ -2821,7 +2917,13 @@ static int64_t handle_inter_mode(
       *rate2 += rate_mv;
     } else {
       int_mv tmp_mv;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      start_timing(cpi, single_motion_search_time);
+#endif
       single_motion_search(cpi, x, bsize, mi_row, mi_col, &tmp_mv, &rate_mv);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      end_timing(cpi, single_motion_search_time);
+#endif
       if (tmp_mv.as_int == INVALID_MV) return INT64_MAX;
 
       frame_mv[refs[0]].as_int = xd->mi[0]->bmi[0].as_mv[0].as_int =
@@ -2875,23 +2977,45 @@ static int64_t handle_inter_mode(
     *rate2 += cost_mv_ref(cpi, this_mode, mbmi_ext->mode_context[refs[0]]);
   }
 
+  if (!is_comp_pred && cpi->sf.prune_single_mode_based_on_mv_diff_mode_rate) {
+    single_mode_rate[INTER_OFFSET(this_mode)] = *rate2;
+    // Prune NEARMV and ZEROMV modes based on motion vector difference and mode
+    // rate.
+    if (skip_single_mode_based_on_mode_rate(mode_mv, single_mode_rate,
+                                            this_mode, refs[0], *rate2,
+                                            best_mode_index)) {
+      // Check when the single inter mode is pruned, NEARESTMV or NEWMV modes
+      // are not early terminated. This ensures all single modes are not getting
+      // skipped when the speed feature is enabled.
+      assert(single_mode_rate[INTER_OFFSET(NEARESTMV)] != INT_MAX ||
+             single_mode_rate[INTER_OFFSET(NEWMV)] != INT_MAX);
+      return INT64_MAX;
+    }
+  }
   if (RDCOST(x->rdmult, x->rddiv, *rate2, 0) > ref_best_rd &&
       mi->mode != NEARESTMV)
     return INT64_MAX;
 
-  pred_exists = 0;
   // Are all MVs integer pel for Y and UV
   intpel_mv = !mv_has_subpel(&mi->mv[0].as_mv);
   if (is_comp_pred) intpel_mv &= !mv_has_subpel(&mi->mv[1].as_mv);
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, interp_filter_time);
+#endif
   // Search for best switchable filter by checking the variance of
   // pred error irrespective of whether the filter will be used
   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) filter_cache[i] = INT64_MAX;
 
   if (cm->interp_filter != BILINEAR) {
+    // Use cb pattern for filter eval when filter is not switchable
+    const int enable_interp_search =
+        (cpi->sf.cb_pred_filter_search && cm->interp_filter != SWITCHABLE)
+            ? blk_parity
+            : 1;
     if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) {
       best_filter = EIGHTTAP;
-    } else if (best_filter == SWITCHABLE) {
+    } else if (best_filter == SWITCHABLE && enable_interp_search) {
       int newbest;
       int tmp_rate_sum = 0;
       int64_t tmp_dist_sum = 0;
@@ -2901,6 +3025,9 @@ static int64_t handle_inter_mode(
         int64_t rs_rd;
         int tmp_skip_sb = 0;
         int64_t tmp_skip_sse = INT64_MAX;
+        const int enable_earlyterm =
+            cpi->sf.early_term_interp_search_plane_rd && cm->interp_filter != i;
+        int64_t filt_best_rd;
 
         mi->interp_filter = i;
         rs = vp9_get_switchable_rate(cpi, xd);
@@ -2934,9 +3061,16 @@ static int64_t handle_inter_mode(
               xd->plane[j].dst.stride = 64;
             }
           }
-          vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
-          model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum, &tmp_skip_sb,
-                          &tmp_skip_sse);
+
+          filt_best_rd =
+              cm->interp_filter == SWITCHABLE ? (best_rd - rs_rd) : best_rd;
+          if (build_inter_pred_model_rd_earlyterm(
+                  cpi, mi_row, mi_col, bsize, x, xd, &rate_sum, &dist_sum,
+                  &tmp_skip_sb, &tmp_skip_sse, enable_earlyterm,
+                  filt_best_rd)) {
+            filter_cache[i] = INT64_MAX;
+            continue;
+          }
 
           rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum);
           filter_cache[i] = rd;
@@ -2969,7 +3103,6 @@ static int64_t handle_inter_mode(
         if ((cm->interp_filter == SWITCHABLE && newbest) ||
             (cm->interp_filter != SWITCHABLE &&
              cm->interp_filter == mi->interp_filter)) {
-          pred_exists = 1;
           tmp_rd = best_rd;
 
           skip_txfm_sb = tmp_skip_sb;
@@ -2981,12 +3114,15 @@ static int64_t handle_inter_mode(
       restore_dst_buf(xd, orig_dst, orig_dst_stride);
     }
   }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, interp_filter_time);
+#endif
   // Set the appropriate filter
   mi->interp_filter =
       cm->interp_filter != SWITCHABLE ? cm->interp_filter : best_filter;
   rs = cm->interp_filter == SWITCHABLE ? vp9_get_switchable_rate(cpi, xd) : 0;
 
-  if (pred_exists) {
+  if (tmp_rd != INT64_MAX) {
     if (best_needs_copy) {
       // again temporarily set the buffers to local memory to prevent a memcpy
       for (i = 0; i < MAX_MB_PLANE; i++) {
@@ -3001,9 +3137,9 @@ static int64_t handle_inter_mode(
     // Handles the special case when a filter that is not in the
     // switchable list (ex. bilinear) is indicated at the frame level, or
     // skip condition holds.
-    vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
-    model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist, &skip_txfm_sb,
-                    &skip_sse_sb);
+    build_inter_pred_model_rd_earlyterm(
+        cpi, mi_row, mi_col, bsize, x, xd, &tmp_rate, &tmp_dist, &skip_txfm_sb,
+        &skip_sse_sb, 0 /*do_earlyterm*/, INT64_MAX);
     rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
     memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
     memcpy(bsse, x->bsse, sizeof(bsse));
@@ -3096,7 +3232,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost,
   x->skip_encode = 0;
   ctx->skip = 0;
   xd->mi[0]->ref_frame[0] = INTRA_FRAME;
-  xd->mi[0]->ref_frame[1] = NONE;
+  xd->mi[0]->ref_frame[1] = NO_REF_FRAME;
   // Initialize interp_filter here so we do not have to check for inter block
   // modes in get_pred_context_switchable_interp()
   xd->mi[0]->interp_filter = SWITCHABLE_FILTERS;
@@ -3245,6 +3381,7 @@ int vp9_active_h_edge(VP9_COMP *cpi, int mi_row, int mi_step) {
   // For two pass account for any formatting bars detected.
   if (cpi->oxcf.pass == 2) {
     TWO_PASS *twopass = &cpi->twopass;
+    vpx_clear_system_state();
 
     // The inactive region is specified in MBs not mi units.
     // The image edge is in the following MB row.
@@ -3272,6 +3409,7 @@ int vp9_active_v_edge(VP9_COMP *cpi, int mi_col, int mi_step) {
   // For two pass account for any formatting bars detected.
   if (cpi->oxcf.pass == 2) {
     TWO_PASS *twopass = &cpi->twopass;
+    vpx_clear_system_state();
 
     // The inactive region is specified in MBs not mi units.
     // The image edge is in the following MB row.
@@ -3318,8 +3456,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
   int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } };
   INTERP_FILTER single_inter_filter[MB_MODE_COUNT][MAX_REF_FRAMES];
   int single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES];
-  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
-                                    VP9_ALT_FLAG };
+  int single_mode_rate[MAX_REF_FRAMES][INTER_MODES];
   int64_t best_rd = best_rd_so_far;
   int64_t best_pred_diff[REFERENCE_MODES];
   int64_t best_pred_rd[REFERENCE_MODES];
@@ -3395,7 +3532,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     x->pred_mv_sad[ref_frame] = INT_MAX;
-    if ((cpi->ref_frame_flags & flag_list[ref_frame]) &&
+    if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) &&
         !(is_rect_partition && (ctx->skip_ref_frame_mask & (1 << ref_frame)))) {
       assert(get_ref_frame_buffer(cpi, ref_frame) != NULL);
       setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
@@ -3406,7 +3543,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
   }
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    if (!(cpi->ref_frame_flags & flag_list[ref_frame])) {
+    if (!(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) {
       // Skip checking missing references in both single and compound reference
       // modes. Note that a mode will be skipped if both reference frames
       // are masked out.
@@ -3469,13 +3606,13 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
         mode_skip_mask[GOLDEN_FRAME] |= INTER_ALL;
   }
 
-  if (bsize > sf->max_intra_bsize) {
+  if (bsize > sf->max_intra_bsize && cpi->ref_frame_flags != 0) {
     ref_frame_skip_mask[0] |= (1 << INTRA_FRAME);
     ref_frame_skip_mask[1] |= (1 << INTRA_FRAME);
   }
 
   mode_skip_mask[INTRA_FRAME] |=
-      ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]);
+      (uint16_t) ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]);
 
   for (i = 0; i <= LAST_NEW_MV_INDEX; ++i) mode_threshold[i] = 0;
 
@@ -3518,6 +3655,10 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
     second_ref_frame = vp9_mode_order[mode_index].ref_frame[1];
 
     vp9_zero(x->sum_y_eobs);
+    comp_pred = second_ref_frame > INTRA_FRAME;
+    if (!comp_pred && ref_frame != INTRA_FRAME &&
+        sf->prune_single_mode_based_on_mv_diff_mode_rate)
+      single_mode_rate[ref_frame][INTER_OFFSET(this_mode)] = INT_MAX;
 
     if (is_rect_partition) {
       if (ctx->skip_ref_frame_mask & (1 << ref_frame)) continue;
@@ -3536,7 +3677,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
           ref_frame_skip_mask[0] |= GOLDEN_FRAME_MODE_MASK;
           break;
         case ALTREF_FRAME: ref_frame_skip_mask[0] |= ALT_REF_MODE_MASK; break;
-        case NONE:
+        case NO_REF_FRAME:
         case MAX_REF_FRAMES: assert(0 && "Invalid Reference frame"); break;
       }
     }
@@ -3569,7 +3710,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
       MODE_INFO *ref_mi;
       int const_motion = 1;
       int skip_ref_frame = !cb_partition_search_ctrl;
-      MV_REFERENCE_FRAME rf = NONE;
+      MV_REFERENCE_FRAME rf = NO_REF_FRAME;
       int_mv ref_mv;
       ref_mv.as_int = INVALID_MV;
 
@@ -3586,7 +3727,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
 
       if ((mi_col - 1) >= tile_info->mi_col_start) {
         if (ref_mv.as_int == INVALID_MV) ref_mv = xd->mi[-1]->mv[0];
-        if (rf == NONE) rf = xd->mi[-1]->ref_frame[0];
+        if (rf == NO_REF_FRAME) rf = xd->mi[-1]->ref_frame[0];
         for (i = 0; i < mi_height; ++i) {
           ref_mi = xd->mi[i * xd->mi_stride - 1];
           const_motion &= (ref_mv.as_int == ref_mi->mv[0].as_int) &&
@@ -3603,7 +3744,6 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
         if (this_mode == NEARMV || this_mode == ZEROMV) continue;
     }
 
-    comp_pred = second_ref_frame > INTRA_FRAME;
     if (comp_pred) {
       if (!cpi->allow_comp_inter_inter) continue;
 
@@ -3612,7 +3752,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
         continue;
 
       // Skip compound inter modes if ARF is not available.
-      if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue;
+      if (!(cpi->ref_frame_flags & ref_frame_to_flag(second_ref_frame)))
+        continue;
 
       // Do not allow compound prediction if the segment level reference frame
       // feature is in use as in this case there can only be one reference.
@@ -3682,19 +3823,30 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
     if (ref_frame == INTRA_FRAME) {
       TX_SIZE uv_tx;
       struct macroblockd_plane *const pd = &xd->plane[1];
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      start_timing(cpi, intra_mode_search_time);
+#endif
       memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
       super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL, bsize,
                       best_rd, recon);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      end_timing(cpi, intra_mode_search_time);
+#endif
       if (rate_y == INT_MAX) continue;
 
       uv_tx = uv_txsize_lookup[bsize][mi->tx_size][pd->subsampling_x]
                               [pd->subsampling_y];
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      start_timing(cpi, intra_mode_search_time);
+#endif
       if (rate_uv_intra[uv_tx] == INT_MAX) {
         choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx, &rate_uv_intra[uv_tx],
                              &rate_uv_tokenonly[uv_tx], &dist_uv[uv_tx],
                              &skip_uv[uv_tx], &mode_uv[uv_tx]);
       }
-
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      end_timing(cpi, intra_mode_search_time);
+#endif
       rate_uv = rate_uv_tokenonly[uv_tx];
       distortion_uv = dist_uv[uv_tx];
       skippable = skippable && skip_uv[uv_tx];
@@ -3705,11 +3857,18 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
         rate2 += intra_cost_penalty;
       distortion2 = distortion_y + distortion_uv;
     } else {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      start_timing(cpi, handle_inter_mode_time);
+#endif
       this_rd = handle_inter_mode(
           cpi, x, bsize, &rate2, &distortion2, &skippable, &rate_y, &rate_uv,
           recon, &disable_skip, frame_mv, mi_row, mi_col, single_newmv,
-          single_inter_filter, single_skippable, &total_sse, best_rd,
-          &mask_filter, filter_cache);
+          single_inter_filter, single_skippable,
+          &single_mode_rate[ref_frame][0], &total_sse, best_rd, &mask_filter,
+          filter_cache, best_mode_index);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      end_timing(cpi, handle_inter_mode_time);
+#endif
       if (this_rd == INT64_MAX) continue;
 
       compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred);
@@ -3945,13 +4104,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
   }
 
   if (best_mode_index < 0 || best_rd >= best_rd_so_far) {
-// If adaptive interp filter is enabled, then the current leaf node of 8x8
-// data is needed for sub8x8. Hence preserve the context.
-#if CONFIG_CONSISTENT_RECODE
+    // If adaptive interp filter is enabled, then the current leaf node of 8x8
+    // data is needed for sub8x8. Hence preserve the context.
     if (bsize == BLOCK_8X8) ctx->mic = *xd->mi[0];
-#else
-    if (cpi->row_mt && bsize == BLOCK_8X8) ctx->mic = *xd->mi[0];
-#endif
     rd_cost->rate = INT_MAX;
     rd_cost->rdcost = INT64_MAX;
     return;
@@ -4066,7 +4221,7 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, TileDataEnc *tile_data,
   mi->mode = ZEROMV;
   mi->uv_mode = DC_PRED;
   mi->ref_frame[0] = LAST_FRAME;
-  mi->ref_frame[1] = NONE;
+  mi->ref_frame[1] = NO_REF_FRAME;
   mi->mv[0].as_int = 0;
   x->skip = 1;
 
@@ -4143,8 +4298,6 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
   int comp_pred, i;
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
-  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
-                                    VP9_ALT_FLAG };
   int64_t best_rd = best_rd_so_far;
   int64_t best_yrd = best_rd_so_far;  // FIXME(rbultje) more precise
   int64_t best_pred_diff[REFERENCE_MODES];
@@ -4194,7 +4347,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
   rd_cost->rate = INT_MAX;
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
-    if (cpi->ref_frame_flags & flag_list[ref_frame]) {
+    if (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) {
       setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
                          frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
     } else {
@@ -4213,7 +4366,6 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
     int rate2 = 0, rate_y = 0, rate_uv = 0;
     int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
     int skippable = 0;
-    int i;
     int this_skip2 = 0;
     int64_t total_sse = INT_MAX;
     int early_term = 0;
@@ -4251,7 +4403,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
           case ALTREF_FRAME:
             ref_frame_skip_mask[0] |= (1 << GOLDEN_FRAME) | (1 << LAST_FRAME);
             break;
-          case NONE:
+          case NO_REF_FRAME:
           case MAX_REF_FRAMES: assert(0 && "Invalid Reference frame"); break;
         }
       }
@@ -4279,7 +4431,8 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
           cm->ref_frame_sign_bias[second_ref_frame])
         continue;
 
-      if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue;
+      if (!(cpi->ref_frame_flags & ref_frame_to_flag(second_ref_frame)))
+        continue;
       // Do not allow compound prediction if the segment level reference frame
       // feature is in use as in this case there can only be one reference.
       if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) continue;
@@ -4373,7 +4526,6 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
                 : NULL;
 
         if (scaled_ref_frame[ref]) {
-          int i;
           // Swap out the reference frame for a version that's been scaled to
           // match the resolution of the current frame, allowing the existing
           // motion search code to be used without additional modifications.
@@ -4443,6 +4595,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
               tmp_best_sse = total_sse;
               tmp_best_skippable = skippable;
               tmp_best_mbmode = *mi;
+              x->sum_y_eobs[TX_4X4] = 0;
               for (i = 0; i < 4; i++) {
                 tmp_best_bmodes[i] = xd->mi[0]->bmi[i];
                 x->zcoeff_blk[TX_4X4][i] = !x->plane[0].eobs[i];
@@ -4476,6 +4629,11 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
             &rate, &rate_y, &distortion, &skippable, &total_sse,
             (int)this_rd_thresh, seg_mvs, bsi, 0, mi_row, mi_col);
         if (tmp_rd == INT64_MAX) continue;
+        x->sum_y_eobs[TX_4X4] = 0;
+        for (i = 0; i < 4; i++) {
+          x->zcoeff_blk[TX_4X4][i] = !x->plane[0].eobs[i];
+          x->sum_y_eobs[TX_4X4] += x->plane[0].eobs[i];
+        }
       } else {
         total_sse = tmp_best_sse;
         rate = tmp_best_rate;
@@ -4504,14 +4662,13 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
 
       if (tmp_best_rdu > 0) {
         // If even the 'Y' rd value of split is higher than best so far
-        // then dont bother looking at UV
+        // then don't bother looking at UV
         vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col, BLOCK_8X8);
         memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
         if (!super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
                               &uv_sse, BLOCK_8X8, tmp_best_rdu)) {
           for (ref = 0; ref < 2; ++ref) {
             if (scaled_ref_frame[ref]) {
-              int i;
               for (i = 0; i < MAX_MB_PLANE; ++i)
                 xd->plane[i].pre[ref] = backup_yv12[ref][i];
             }
@@ -4528,7 +4685,6 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
       for (ref = 0; ref < 2; ++ref) {
         if (scaled_ref_frame[ref]) {
           // Restore the prediction frame pointers to their unscaled versions.
-          int i;
           for (i = 0; i < MAX_MB_PLANE; ++i)
             xd->plane[i].pre[ref] = backup_yv12[ref][i];
         }
@@ -4734,7 +4890,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
     mi->mv[1].as_int = xd->mi[0]->bmi[3].as_mv[1].as_int;
   }
   // If the second reference does not exist, set the corresponding mv to zero.
-  if (mi->ref_frame[1] == NONE) {
+  if (mi->ref_frame[1] == NO_REF_FRAME) {
     mi->mv[1].as_int = 0;
     for (i = 0; i < 4; ++i) {
       mi->bmi[i].as_mv[1].as_int = 0;
diff --git a/vp9/encoder/vp9_resize.c b/vp9/encoder/vp9_resize.c
index 7486dee25b..ca55ec9886 100644
--- a/vp9/encoder/vp9_resize.c
+++ b/vp9/encoder/vp9_resize.c
@@ -360,6 +360,12 @@ static int get_down2_steps(int in_length, int out_length) {
   while ((proj_in_length = get_down2_length(in_length, 1)) >= out_length) {
     ++steps;
     in_length = proj_in_length;
+    if (in_length == 1) {
+      // Special case: we break because any further calls to get_down2_length()
+      // with be with length == 1, which return 1, resulting in an infinite
+      // loop.
+      break;
+    }
   }
   return steps;
 }
diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c
index a163297e6e..d75488a8e6 100644
--- a/vp9/encoder/vp9_segmentation.c
+++ b/vp9/encoder/vp9_segmentation.c
@@ -39,7 +39,7 @@ void vp9_set_segment_data(struct segmentation *seg, signed char *feature_data,
 }
 void vp9_disable_segfeature(struct segmentation *seg, int segment_id,
                             SEG_LVL_FEATURES feature_id) {
-  seg->feature_mask[segment_id] &= ~(1 << feature_id);
+  seg->feature_mask[segment_id] &= ~(1u << feature_id);
 }
 
 void vp9_clear_segdata(struct segmentation *seg, int segment_id,
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 5bbe549236..56fb5f94f4 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -16,8 +16,11 @@
 #include "vpx_dsp/vpx_dsp_common.h"
 
 // Mesh search patters for various speed settings
-static MESH_PATTERN best_quality_mesh_pattern[MAX_MESH_STEP] = {
-  { 64, 4 }, { 28, 2 }, { 15, 1 }, { 7, 1 }
+// Define 2 mesh density levels for FC_GRAPHICS_ANIMATION content type and non
+// FC_GRAPHICS_ANIMATION content type.
+static MESH_PATTERN best_quality_mesh_pattern[2][MAX_MESH_STEP] = {
+  { { 64, 4 }, { 28, 2 }, { 15, 1 }, { 7, 1 } },
+  { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } },
 };
 
 #if !CONFIG_REALTIME_ONLY
@@ -39,7 +42,7 @@ static int frame_is_boosted(const VP9_COMP *cpi) {
 // Sets a partition size down to which the auto partition code will always
 // search (can go lower), based on the image dimensions. The logic here
 // is that the extent to which ringing artefacts are offensive, depends
-// partly on the screen area that over which they propogate. Propogation is
+// partly on the screen area that over which they propagate. Propagation is
 // limited by transform block size but the screen area take up by a given block
 // size will be larger for a small image format stretched to full screen.
 static BLOCK_SIZE set_partition_min_limit(VP9_COMMON *const cm) {
@@ -67,6 +70,7 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi,
   const int is_720p_or_larger = min_frame_size >= 720;
   const int is_1080p_or_larger = min_frame_size >= 1080;
   const int is_2160p_or_larger = min_frame_size >= 2160;
+  const int boosted = frame_is_boosted(cpi);
 
   // speed 0 features
   sf->partition_search_breakout_thr.dist = (1 << 20);
@@ -78,9 +82,13 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi,
     // Currently, the machine-learning based partition search early termination
     // is only used while VPXMIN(cm->width, cm->height) >= 480 and speed = 0.
     sf->rd_ml_partition.search_early_termination = 1;
+    sf->recode_tolerance_high = 45;
   } else {
     sf->use_square_only_thresh_high = BLOCK_32X32;
   }
+  if (is_720p_or_larger) {
+    sf->alt_ref_search_fp = 1;
+  }
 
   if (!is_1080p_or_larger) {
     sf->rd_ml_partition.search_breakout = 1;
@@ -95,6 +103,13 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi,
     }
   }
 
+  if (!is_720p_or_larger) {
+    if (is_480p_or_larger)
+      sf->prune_single_mode_based_on_mv_diff_mode_rate = boosted ? 0 : 1;
+    else
+      sf->prune_single_mode_based_on_mv_diff_mode_rate = 1;
+  }
+
   if (speed >= 1) {
     sf->rd_ml_partition.search_early_termination = 0;
     sf->rd_ml_partition.search_breakout = 1;
@@ -152,7 +167,7 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi,
       sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
       sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC;
       sf->alt_ref_search_fp = 1;
-      sf->cb_pred_filter_search = 1;
+      sf->cb_pred_filter_search = 2;
       sf->adaptive_interp_filter_search = 1;
       sf->disable_split_mask = DISABLE_ALL_SPLIT;
     }
@@ -209,15 +224,32 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
   const int boosted = frame_is_boosted(cpi);
   int i;
 
-  sf->tx_size_search_breakout = 1;
+  sf->adaptive_interp_filter_search = 1;
+  sf->adaptive_pred_interp_filter = 1;
   sf->adaptive_rd_thresh = 1;
   sf->adaptive_rd_thresh_row_mt = 0;
   sf->allow_skip_recode = 1;
   sf->less_rectangular_check = 1;
-  sf->use_square_partition_only = !boosted;
+  sf->mv.auto_mv_step_size = 1;
+  sf->mv.use_downsampled_sad = 1;
   sf->prune_ref_frame_for_rect_partitions = 1;
-  sf->rd_ml_partition.var_pruning = 1;
+  sf->temporal_filter_search_method = NSTEP;
+  sf->tx_size_search_breakout = 1;
+  sf->use_square_partition_only = !boosted;
+  sf->early_term_interp_search_plane_rd = 1;
+  sf->cb_pred_filter_search = 1;
+  sf->trellis_opt_tx_rd.method = sf->optimize_coefficients
+                                     ? ENABLE_TRELLIS_OPT_TX_RD_RESIDUAL_MSE
+                                     : DISABLE_TRELLIS_OPT;
+  sf->trellis_opt_tx_rd.thresh = boosted ? 4.0 : 3.0;
+
+  sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+  sf->comp_inter_joint_search_iter_level = 1;
+
+  // Reference masking is not supported in dynamic scaling mode.
+  sf->reference_masking = oxcf->resize_mode != RESIZE_DYNAMIC;
 
+  sf->rd_ml_partition.var_pruning = 1;
   sf->rd_ml_partition.prune_rect_thresh[0] = -1;
   sf->rd_ml_partition.prune_rect_thresh[1] = 350;
   sf->rd_ml_partition.prune_rect_thresh[2] = 325;
@@ -238,7 +270,6 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
   }
 
   if (speed >= 1) {
-    sf->temporal_filter_search_method = NSTEP;
     sf->rd_ml_partition.var_pruning = !boosted;
     sf->rd_ml_partition.prune_rect_thresh[1] = 225;
     sf->rd_ml_partition.prune_rect_thresh[2] = 225;
@@ -258,19 +289,18 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
 
     sf->allow_txfm_domain_distortion = 1;
     sf->tx_domain_thresh = tx_dom_thresholds[(speed < 6) ? speed : 5];
-    sf->allow_quant_coeff_opt = sf->optimize_coefficients;
-    sf->quant_opt_thresh = qopt_thresholds[(speed < 6) ? speed : 5];
+    sf->trellis_opt_tx_rd.method = sf->optimize_coefficients
+                                       ? ENABLE_TRELLIS_OPT_TX_RD_SRC_VAR
+                                       : DISABLE_TRELLIS_OPT;
+    sf->trellis_opt_tx_rd.thresh = qopt_thresholds[(speed < 6) ? speed : 5];
     sf->less_rectangular_check = 1;
     sf->use_rd_breakout = 1;
     sf->adaptive_motion_search = 1;
-    sf->mv.auto_mv_step_size = 1;
     sf->adaptive_rd_thresh = 2;
     sf->mv.subpel_search_level = 1;
     if (cpi->oxcf.content != VP9E_CONTENT_FILM) sf->mode_skip_start = 10;
-    sf->adaptive_pred_interp_filter = 1;
     sf->allow_acl = 0;
 
-    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
     sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
     if (cpi->oxcf.content != VP9E_CONTENT_FILM) {
       sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
@@ -296,18 +326,14 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
     sf->tx_size_search_method =
         frame_is_boosted(cpi) ? USE_FULL_RD : USE_LARGESTALL;
 
-    // Reference masking is not supported in dynamic scaling mode.
-    sf->reference_masking = oxcf->resize_mode != RESIZE_DYNAMIC ? 1 : 0;
-
     sf->mode_search_skip_flags =
         (cm->frame_type == KEY_FRAME)
             ? 0
             : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER |
                   FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR;
     sf->disable_filter_search_var_thresh = 100;
-    sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
+    sf->comp_inter_joint_search_iter_level = 2;
     sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
-    sf->recode_tolerance_low = 15;
     sf->recode_tolerance_high = 45;
     sf->enhanced_full_pixel_motion_search = 0;
     sf->prune_ref_frame_for_rect_partitions = 0;
@@ -337,15 +363,13 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
     sf->adaptive_pred_interp_filter = 0;
     sf->adaptive_mode_search = 1;
     sf->cb_partition_search = !boosted;
-    sf->cb_pred_filter_search = 1;
+    sf->cb_pred_filter_search = 2;
     sf->alt_ref_search_fp = 1;
     sf->recode_loop = ALLOW_RECODE_KFMAXBW;
     sf->adaptive_rd_thresh = 3;
     sf->mode_skip_start = 6;
     sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
     sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC;
-    sf->adaptive_interp_filter_search = 1;
-    sf->allow_partition_search_skip = 1;
 
     if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) {
       for (i = 0; i < MAX_MESH_STEP; ++i) {
@@ -374,7 +398,6 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
   }
 
   if (speed >= 5) {
-    int i;
     sf->optimize_coefficients = 0;
     sf->mv.search_method = HEX;
     sf->disable_filter_search_var_thresh = 500;
@@ -462,8 +485,8 @@ static void set_rt_speed_feature_framesize_independent(
   if (speed >= 1) {
     sf->allow_txfm_domain_distortion = 1;
     sf->tx_domain_thresh = 0.0;
-    sf->allow_quant_coeff_opt = 0;
-    sf->quant_opt_thresh = 0.0;
+    sf->trellis_opt_tx_rd.method = DISABLE_TRELLIS_OPT;
+    sf->trellis_opt_tx_rd.thresh = 0.0;
     sf->use_square_partition_only = !frame_is_intra_only(cm);
     sf->less_rectangular_check = 1;
     sf->tx_size_search_method =
@@ -496,11 +519,10 @@ static void set_rt_speed_feature_framesize_independent(
         (cpi->external_resize == 1 ||
          cpi->oxcf.resize_mode == RESIZE_DYNAMIC)) {
       MV_REFERENCE_FRAME ref_frame;
-      static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
-                                        VP9_ALT_FLAG };
       for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
         const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
-        if (yv12 != NULL && (cpi->ref_frame_flags & flag_list[ref_frame])) {
+        if (yv12 != NULL &&
+            (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) {
           const struct scale_factors *const scale_fac =
               &cm->frame_refs[ref_frame - 1].sf;
           if (vp9_is_scaled(scale_fac)) sf->reference_masking = 0;
@@ -509,7 +531,7 @@ static void set_rt_speed_feature_framesize_independent(
     }
 
     sf->disable_filter_search_var_thresh = 50;
-    sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
+    sf->comp_inter_joint_search_iter_level = 2;
     sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
     sf->lf_motion_threshold = LOW_MOTION_THRESHOLD;
     sf->adjust_partitioning_from_last_frame = 1;
@@ -621,7 +643,7 @@ static void set_rt_speed_feature_framesize_independent(
     // increase in encoding time.
     if (cpi->use_svc && svc->spatial_layer_id > 0) sf->nonrd_keyframe = 1;
     if (cm->frame_type != KEY_FRAME && cpi->resize_state == ORIG &&
-        cpi->oxcf.rc_mode == VPX_CBR) {
+        cpi->oxcf.rc_mode == VPX_CBR && !cpi->rc.disable_overshoot_maxq_cbr) {
       if (cm->width * cm->height <= 352 * 288 && !cpi->use_svc &&
           cpi->oxcf.content != VP9E_CONTENT_SCREEN)
         sf->overshoot_detection_cbr_rt = RE_ENCODE_MAXQ;
@@ -633,7 +655,7 @@ static void set_rt_speed_feature_framesize_independent(
       sf->use_altref_onepass = 1;
       sf->use_compound_nonrd_pickmode = 1;
     }
-    if (cm->width * cm->height > 1280 * 720) sf->cb_pred_filter_search = 1;
+    if (cm->width * cm->height > 1280 * 720) sf->cb_pred_filter_search = 2;
     if (!cpi->external_resize) sf->use_source_sad = 1;
   }
 
@@ -654,8 +676,10 @@ static void set_rt_speed_feature_framesize_independent(
       if (cpi->content_state_sb_fd == NULL &&
           (!cpi->use_svc ||
            svc->spatial_layer_id == svc->number_spatial_layers - 1)) {
-        cpi->content_state_sb_fd = (uint8_t *)vpx_calloc(
-            (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(uint8_t));
+        CHECK_MEM_ERROR(&cm->error, cpi->content_state_sb_fd,
+                        (uint8_t *)vpx_calloc(
+                            (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
+                            sizeof(uint8_t)));
       }
     }
     if (cpi->oxcf.rc_mode == VPX_CBR && content != VP9E_CONTENT_SCREEN) {
@@ -668,7 +692,7 @@ static void set_rt_speed_feature_framesize_independent(
       sf->base_mv_aggressive = 1;
     }
     if (cm->frame_type != KEY_FRAME && cpi->resize_state == ORIG &&
-        cpi->oxcf.rc_mode == VPX_CBR)
+        cpi->oxcf.rc_mode == VPX_CBR && !cpi->rc.disable_overshoot_maxq_cbr)
       sf->overshoot_detection_cbr_rt = FAST_DETECTION_MAXQ;
   }
 
@@ -721,7 +745,7 @@ static void set_rt_speed_feature_framesize_independent(
     if (cpi->use_svc && svc->use_gf_temporal_ref_current_layer &&
         svc->temporal_layer_id > 0)
       cpi->ref_frame_flags &= (~VP9_GOLD_FLAG);
-    if (cm->width * cm->height > 640 * 480) sf->cb_pred_filter_search = 1;
+    if (cm->width * cm->height > 640 * 480) sf->cb_pred_filter_search = 2;
   }
 
   if (speed >= 8) {
@@ -765,7 +789,7 @@ static void set_rt_speed_feature_framesize_independent(
     }
     sf->limit_newmv_early_exit = 0;
     sf->use_simple_block_yrd = 1;
-    if (cm->width * cm->height > 352 * 288) sf->cb_pred_filter_search = 1;
+    if (cm->width * cm->height > 352 * 288) sf->cb_pred_filter_search = 2;
   }
 
   if (speed >= 9) {
@@ -775,7 +799,7 @@ static void set_rt_speed_feature_framesize_independent(
       for (i = 0; i < BLOCK_SIZES; ++i)
         sf->intra_y_mode_bsize_mask[i] = INTRA_DC;
     }
-    sf->cb_pred_filter_search = 1;
+    sf->cb_pred_filter_search = 2;
     sf->mv.enable_adaptive_subpel_force_stop = 1;
     sf->mv.adapt_subpel_force_stop.mv_thresh = 1;
     sf->mv.adapt_subpel_force_stop.force_stop_below = QUARTER_PEL;
@@ -806,14 +830,17 @@ static void set_rt_speed_feature_framesize_independent(
       sf->partition_search_type = FIXED_PARTITION;
       sf->always_this_block_size = BLOCK_64X64;
     }
-    if (cpi->count_arf_frame_usage == NULL)
-      cpi->count_arf_frame_usage =
+    if (cpi->count_arf_frame_usage == NULL) {
+      CHECK_MEM_ERROR(
+          &cm->error, cpi->count_arf_frame_usage,
           (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
-                                sizeof(*cpi->count_arf_frame_usage));
+                                sizeof(*cpi->count_arf_frame_usage)));
+    }
     if (cpi->count_lastgolden_frame_usage == NULL)
-      cpi->count_lastgolden_frame_usage =
+      CHECK_MEM_ERROR(
+          &cm->error, cpi->count_lastgolden_frame_usage,
           (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
-                                sizeof(*cpi->count_lastgolden_frame_usage));
+                                sizeof(*cpi->count_lastgolden_frame_usage)));
   }
   if (svc->previous_frame_is_intra_only) {
     sf->partition_search_type = FIXED_PARTITION;
@@ -832,6 +859,11 @@ static void set_rt_speed_feature_framesize_independent(
   // off for now.
   if (speed <= 3 && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
     cpi->oxcf.aq_mode = 0;
+  // For all speeds for rt mode: if the deadline mode changed (was good/best
+  // quality on previous frame and now is realtime) set nonrd_keyframe to 1 to
+  // avoid entering rd pickmode. This causes issues, such as: b/310663186.
+  if (cpi->oxcf.mode != cpi->deadline_mode_previous_frame)
+    sf->nonrd_keyframe = 1;
 }
 
 void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi, int speed) {
@@ -901,14 +933,17 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) {
   sf->coeff_prob_appx_step = 1;
   sf->mv.auto_mv_step_size = 0;
   sf->mv.fullpel_search_step_param = 6;
-  sf->comp_inter_joint_search_thresh = BLOCK_4X4;
+  sf->mv.use_downsampled_sad = 0;
+  sf->comp_inter_joint_search_iter_level = 0;
   sf->tx_size_search_method = USE_FULL_RD;
   sf->use_lp32x32fdct = 0;
   sf->adaptive_motion_search = 0;
   sf->enhanced_full_pixel_motion_search = 1;
   sf->adaptive_pred_interp_filter = 0;
   sf->adaptive_mode_search = 0;
+  sf->prune_single_mode_based_on_mv_diff_mode_rate = 0;
   sf->cb_pred_filter_search = 0;
+  sf->early_term_interp_search_plane_rd = 0;
   sf->cb_partition_search = 0;
   sf->motion_field_mode_search = 0;
   sf->alt_ref_search_fp = 0;
@@ -931,15 +966,16 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) {
   sf->max_delta_qindex = 0;
   sf->disable_filter_search_var_thresh = 0;
   sf->adaptive_interp_filter_search = 0;
-  sf->allow_partition_search_skip = 0;
   sf->allow_txfm_domain_distortion = 0;
   sf->tx_domain_thresh = 99.0;
-  sf->allow_quant_coeff_opt = sf->optimize_coefficients;
-  sf->quant_opt_thresh = 99.0;
+  sf->trellis_opt_tx_rd.method =
+      sf->optimize_coefficients ? ENABLE_TRELLIS_OPT : DISABLE_TRELLIS_OPT;
+  sf->trellis_opt_tx_rd.thresh = 99.0;
   sf->allow_acl = 1;
   sf->enable_tpl_model = oxcf->enable_tpl_model;
   sf->prune_ref_frame_for_rect_partitions = 0;
   sf->temporal_filter_search_method = MESH;
+  sf->allow_skip_txfm_ac_dc = 0;
 
   for (i = 0; i < TX_SIZES; i++) {
     sf->intra_y_mode_mask[i] = INTRA_ALL;
@@ -988,10 +1024,14 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) {
   sf->exhaustive_searches_thresh =
       (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? (1 << 20)
                                                               : INT_MAX;
-  if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) {
+  {
+    const int mesh_density_level =
+        (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? 0 : 1;
     for (i = 0; i < MAX_MESH_STEP; ++i) {
-      sf->mesh_patterns[i].range = best_quality_mesh_pattern[i].range;
-      sf->mesh_patterns[i].interval = best_quality_mesh_pattern[i].interval;
+      sf->mesh_patterns[i].range =
+          best_quality_mesh_pattern[mesh_density_level][i].range;
+      sf->mesh_patterns[i].interval =
+          best_quality_mesh_pattern[mesh_density_level][i].interval;
     }
   }
 
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index ca284ded82..941de639ac 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -210,6 +210,10 @@ typedef struct MV_SPEED_FEATURES {
 
   // This variable sets the step_param used in full pel motion search.
   int fullpel_search_step_param;
+
+  // Whether to downsample the rows in sad calculation during motion search.
+  // This is only active when there are at least 8 rows.
+  int use_downsampled_sad;
 } MV_SPEED_FEATURES;
 
 typedef struct PARTITION_SEARCH_BREAKOUT_THR {
@@ -246,6 +250,24 @@ typedef enum {
   USE_8_TAPS_SHARP,
 } SUBPEL_SEARCH_TYPE;
 
+typedef enum {
+  // Disable trellis coefficient optimization
+  DISABLE_TRELLIS_OPT,
+  // Enable trellis coefficient optimization
+  ENABLE_TRELLIS_OPT,
+  // Enable trellis coefficient optimization based on source variance of the
+  // prediction block during transform RD
+  ENABLE_TRELLIS_OPT_TX_RD_SRC_VAR,
+  // Enable trellis coefficient optimization based on residual mse of the
+  // transform block during transform RD
+  ENABLE_TRELLIS_OPT_TX_RD_RESIDUAL_MSE,
+} ENABLE_TRELLIS_OPT_METHOD;
+
+typedef struct TRELLIS_OPT_CONTROL {
+  ENABLE_TRELLIS_OPT_METHOD method;
+  double thresh;
+} TRELLIS_OPT_CONTROL;
+
 typedef struct SPEED_FEATURES {
   MV_SPEED_FEATURES mv;
 
@@ -264,11 +286,20 @@ typedef struct SPEED_FEATURES {
   // adds overhead.
   int static_segmentation;
 
-  // If 1 we iterate finding a best reference for 2 ref frames together - via
-  // a log search that iterates 4 times (check around mv for last for best
-  // error of combined predictor then check around mv for alt). If 0 we
-  // we just use the best motion vector found for each frame by itself.
-  BLOCK_SIZE comp_inter_joint_search_thresh;
+  // The best compound predictor is found using an iterative log search process
+  // that searches for best ref0 mv using error of combined predictor and then
+  // searches for best ref1 mv. This sf determines the number of iterations of
+  // this process based on block size. The sf becomes more aggressive from level
+  // 0 to 2. The following table indicates the number of iterations w.r.t bsize:
+  //  -----------------------------------------------
+  // |sf (level)|bsize < 8X8| [8X8, 16X16] | > 16X16 |
+  // |    0     |     4     |      4       |    4    |
+  // |    1     |     0     |      2       |    4    |
+  // |    2     |     0     |      0       |    0    |
+  //  -----------------------------------------------
+  // Here, 0 iterations indicate using the best single motion vector selected
+  // for each ref frame without any iterative refinement.
+  int comp_inter_joint_search_iter_level;
 
   // This variable is used to cap the maximum number of times we skip testing a
   // mode to be evaluated. A high value means we will be faster.
@@ -292,8 +323,8 @@ typedef struct SPEED_FEATURES {
   int coeff_prob_appx_step;
 
   // Enable uniform quantizer followed by trellis coefficient optimization
-  int allow_quant_coeff_opt;
-  double quant_opt_thresh;
+  // during transform RD
+  TRELLIS_OPT_CONTROL trellis_opt_tx_rd;
 
   // Enable asymptotic closed-loop encoding decision for key frame and
   // alternate reference frames.
@@ -399,9 +430,21 @@ typedef struct SPEED_FEATURES {
   // Adaptive prediction mode search
   int adaptive_mode_search;
 
-  // Chessboard pattern prediction filter type search
+  // Prune NEAREST and ZEROMV single reference modes based on motion vector
+  // difference and mode rate
+  int prune_single_mode_based_on_mv_diff_mode_rate;
+
+  // Chessboard pattern prediction for interp filter. Aggressiveness increases
+  // with levels.
+  // 0: disable
+  // 1: cb pattern in eval when filter is not switchable
+  // 2: cb pattern prediction for filter search
   int cb_pred_filter_search;
 
+  // This variable enables an early termination of interpolation filter eval
+  // based on the current rd cost after processing each plane
+  int early_term_interp_search_plane_rd;
+
   int cb_partition_search;
 
   int motion_field_mode_search;
@@ -525,9 +568,6 @@ typedef struct SPEED_FEATURES {
     int prune_rect_thresh[4];
   } rd_ml_partition;
 
-  // Allow skipping partition search for still image frame
-  int allow_partition_search_skip;
-
   // Fast approximation of vp9_model_rd_from_var_lapndz
   int simple_model_rd_from_var;
 
@@ -603,7 +643,7 @@ typedef struct SPEED_FEATURES {
   // Use machine learning based partition search.
   int nonrd_use_ml_partition;
 
-  // Multiplier for base thresold for variance partitioning.
+  // Multiplier for base threshold for variance partitioning.
   int variance_part_thresh_mult;
 
   // Force subpel motion filter to always use SMOOTH_FILTER.
@@ -612,6 +652,12 @@ typedef struct SPEED_FEATURES {
   // For real-time mode: force DC only under intra search when content
   // does not have high souce SAD.
   int rt_intra_dc_only_low_content;
+
+  // The encoder has a feature that skips forward transform and quantization
+  // based on a model rd estimation to reduce encoding time.
+  // However, this feature is dangerous since it could lead to bad perceptual
+  // quality. This flag is added to guard the feature.
+  int allow_skip_txfm_ac_dc;
 } SPEED_FEATURES;
 
 struct VP9_COMP;
diff --git a/vp9/encoder/vp9_subexp.c b/vp9/encoder/vp9_subexp.c
index 19bbd5373f..3953253dbb 100644
--- a/vp9/encoder/vp9_subexp.c
+++ b/vp9/encoder/vp9_subexp.c
@@ -114,19 +114,20 @@ void vp9_write_prob_diff_update(vpx_writer *w, vpx_prob newp, vpx_prob oldp) {
   encode_term_subexp(w, delp);
 }
 
-int vp9_prob_diff_update_savings_search(const unsigned int *ct, vpx_prob oldp,
-                                        vpx_prob *bestp, vpx_prob upd) {
-  const int old_b = cost_branch256(ct, oldp);
-  int bestsavings = 0;
+int64_t vp9_prob_diff_update_savings_search(const unsigned int *ct,
+                                            vpx_prob oldp, vpx_prob *bestp,
+                                            vpx_prob upd) {
+  const int64_t old_b = cost_branch256(ct, oldp);
+  int64_t bestsavings = 0;
   vpx_prob newp, bestnewp = oldp;
   const int step = *bestp > oldp ? -1 : 1;
   const int upd_cost = vp9_cost_one(upd) - vp9_cost_zero(upd);
 
   if (old_b > upd_cost + (MIN_DELP_BITS << VP9_PROB_COST_SHIFT)) {
     for (newp = *bestp; newp != oldp; newp += step) {
-      const int new_b = cost_branch256(ct, newp);
-      const int update_b = prob_diff_update_cost(newp, oldp) + upd_cost;
-      const int savings = old_b - new_b - update_b;
+      const int64_t new_b = cost_branch256(ct, newp);
+      const int64_t update_b = prob_diff_update_cost(newp, oldp) + upd_cost;
+      const int64_t savings = old_b - new_b - update_b;
       if (savings > bestsavings) {
         bestsavings = savings;
         bestnewp = newp;
@@ -137,15 +138,15 @@ int vp9_prob_diff_update_savings_search(const unsigned int *ct, vpx_prob oldp,
   return bestsavings;
 }
 
-int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
-                                              const vpx_prob oldp,
-                                              vpx_prob *bestp, vpx_prob upd,
-                                              int stepsize) {
-  int i, old_b, new_b, update_b, savings, bestsavings;
-  int newp;
-  const int step_sign = *bestp > oldp ? -1 : 1;
-  const int step = stepsize * step_sign;
-  const int upd_cost = vp9_cost_one(upd) - vp9_cost_zero(upd);
+int64_t vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
+                                                  const vpx_prob oldp,
+                                                  vpx_prob *bestp, vpx_prob upd,
+                                                  int stepsize) {
+  int64_t i, old_b, new_b, update_b, savings, bestsavings;
+  int64_t newp;
+  const int64_t step_sign = *bestp > oldp ? -1 : 1;
+  const int64_t step = stepsize * step_sign;
+  const int64_t upd_cost = vp9_cost_one(upd) - vp9_cost_zero(upd);
   const vpx_prob *newplist, *oldplist;
   vpx_prob bestnewp;
   oldplist = vp9_pareto8_full[oldp - 1];
@@ -162,14 +163,14 @@ int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
     for (newp = *bestp; (newp - oldp) * step_sign < 0; newp += step) {
       if (newp < 1 || newp > 255) continue;
       newplist = vp9_pareto8_full[newp - 1];
-      new_b = cost_branch256(ct + 2 * PIVOT_NODE, newp);
+      new_b = cost_branch256(ct + 2 * PIVOT_NODE, (vpx_prob)newp);
       for (i = UNCONSTRAINED_NODES; i < ENTROPY_NODES; ++i)
         new_b += cost_branch256(ct + 2 * i, newplist[i - UNCONSTRAINED_NODES]);
-      update_b = prob_diff_update_cost(newp, oldp) + upd_cost;
+      update_b = prob_diff_update_cost((vpx_prob)newp, oldp) + upd_cost;
       savings = old_b - new_b - update_b;
       if (savings > bestsavings) {
         bestsavings = savings;
-        bestnewp = newp;
+        bestnewp = (vpx_prob)newp;
       }
     }
   }
@@ -182,7 +183,7 @@ void vp9_cond_prob_diff_update(vpx_writer *w, vpx_prob *oldp,
                                const unsigned int ct[2]) {
   const vpx_prob upd = DIFF_UPDATE_PROB;
   vpx_prob newp = get_binary_prob(ct[0], ct[1]);
-  const int savings =
+  const int64_t savings =
       vp9_prob_diff_update_savings_search(ct, *oldp, &newp, upd);
   assert(newp >= 1);
   if (savings > 0) {
diff --git a/vp9/encoder/vp9_subexp.h b/vp9/encoder/vp9_subexp.h
index f0d544b527..2d016d24c5 100644
--- a/vp9/encoder/vp9_subexp.h
+++ b/vp9/encoder/vp9_subexp.h
@@ -25,13 +25,14 @@ void vp9_write_prob_diff_update(struct vpx_writer *w, vpx_prob newp,
 void vp9_cond_prob_diff_update(struct vpx_writer *w, vpx_prob *oldp,
                                const unsigned int ct[2]);
 
-int vp9_prob_diff_update_savings_search(const unsigned int *ct, vpx_prob oldp,
-                                        vpx_prob *bestp, vpx_prob upd);
-
-int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
-                                              const vpx_prob oldp,
-                                              vpx_prob *bestp, vpx_prob upd,
-                                              int stepsize);
+int64_t vp9_prob_diff_update_savings_search(const unsigned int *ct,
+                                            vpx_prob oldp, vpx_prob *bestp,
+                                            vpx_prob upd);
+
+int64_t vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
+                                                  const vpx_prob oldp,
+                                                  vpx_prob *bestp, vpx_prob upd,
+                                                  int stepsize);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index d85b3632cd..fff6d25de0 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -107,7 +107,6 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
       int layer = LAYER_IDS_TO_IDX(sl, tl, oxcf->ts_number_layers);
       LAYER_CONTEXT *const lc = &svc->layer_context[layer];
       RATE_CONTROL *const lrc = &lc->rc;
-      int i;
       lc->current_video_frame_in_layer = 0;
       lc->layer_size = 0;
       lc->frames_from_key_frame = 0;
@@ -164,17 +163,17 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
         lc->actual_num_seg1_blocks = 0;
         lc->actual_num_seg2_blocks = 0;
         lc->counter_encode_maxq_scene_change = 0;
-        CHECK_MEM_ERROR(cm, lc->map,
+        CHECK_MEM_ERROR(&cm->error, lc->map,
                         vpx_malloc(mi_rows * mi_cols * sizeof(*lc->map)));
         memset(lc->map, 0, mi_rows * mi_cols);
         last_coded_q_map_size =
             mi_rows * mi_cols * sizeof(*lc->last_coded_q_map);
-        CHECK_MEM_ERROR(cm, lc->last_coded_q_map,
+        CHECK_MEM_ERROR(&cm->error, lc->last_coded_q_map,
                         vpx_malloc(last_coded_q_map_size));
         assert(MAXQ <= 255);
         memset(lc->last_coded_q_map, MAXQ, last_coded_q_map_size);
         consec_zero_mv_size = mi_rows * mi_cols * sizeof(*lc->consec_zero_mv);
-        CHECK_MEM_ERROR(cm, lc->consec_zero_mv,
+        CHECK_MEM_ERROR(&cm->error, lc->consec_zero_mv,
                         vpx_malloc(consec_zero_mv_size));
         memset(lc->consec_zero_mv, 0, consec_zero_mv_size);
       }
@@ -220,18 +219,21 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi,
         RATE_CONTROL *const lrc = &lc->rc;
 
         lc->spatial_layer_target_bandwidth = spatial_layer_target;
-        bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth;
+        if (target_bandwidth != 0) {
+          bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth;
+        }
         lrc->starting_buffer_level =
-            (int64_t)(rc->starting_buffer_level * bitrate_alloc);
+            (int64_t)(rc->starting_buffer_level * bitrate_alloc + 0.5);
         lrc->optimal_buffer_level =
-            (int64_t)(rc->optimal_buffer_level * bitrate_alloc);
+            (int64_t)(rc->optimal_buffer_level * bitrate_alloc + 0.5);
         lrc->maximum_buffer_size =
-            (int64_t)(rc->maximum_buffer_size * bitrate_alloc);
+            (int64_t)(rc->maximum_buffer_size * bitrate_alloc + 0.5);
         lrc->bits_off_target =
             VPXMIN(lrc->bits_off_target, lrc->maximum_buffer_size);
         lrc->buffer_level = VPXMIN(lrc->buffer_level, lrc->maximum_buffer_size);
         lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[tl];
-        lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
+        lrc->avg_frame_bandwidth =
+            (int)VPXMIN(lc->target_bandwidth / lc->framerate, INT_MAX);
         lrc->max_frame_bandwidth = rc->max_frame_bandwidth;
         lrc->worst_quality = rc->worst_quality;
         lrc->best_quality = rc->best_quality;
@@ -252,7 +254,9 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi,
 
       lc->target_bandwidth = oxcf->layer_target_bitrate[layer];
 
-      bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth;
+      if (target_bandwidth != 0) {
+        bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth;
+      }
       // Update buffer-related quantities.
       lrc->starting_buffer_level =
           (int64_t)(rc->starting_buffer_level * bitrate_alloc);
@@ -269,7 +273,8 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi,
       } else {
         lc->framerate = cpi->framerate;
       }
-      lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
+      lrc->avg_frame_bandwidth =
+          (int)VPXMIN(lc->target_bandwidth / lc->framerate, INT_MAX);
       lrc->max_frame_bandwidth = rc->max_frame_bandwidth;
       // Update qp-related quantities.
       lrc->worst_quality = rc->worst_quality;
@@ -290,7 +295,7 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi,
 }
 
 static LAYER_CONTEXT *get_layer_context(VP9_COMP *const cpi) {
-  if (is_one_pass_cbr_svc(cpi))
+  if (is_one_pass_svc(cpi))
     return &cpi->svc.layer_context[cpi->svc.spatial_layer_id *
                                        cpi->svc.number_temporal_layers +
                                    cpi->svc.temporal_layer_id];
@@ -311,7 +316,8 @@ void vp9_update_temporal_layer_framerate(VP9_COMP *const cpi) {
   const int tl = svc->temporal_layer_id;
 
   lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[tl];
-  lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
+  lrc->avg_frame_bandwidth =
+      (int)VPXMIN(lc->target_bandwidth / lc->framerate, INT_MAX);
   lrc->max_frame_bandwidth = cpi->rc.max_frame_bandwidth;
   // Update the average layer frame size (non-cumulative per-frame-bw).
   if (tl == 0) {
@@ -322,8 +328,8 @@ void vp9_update_temporal_layer_framerate(VP9_COMP *const cpi) {
     const int prev_layer_target_bandwidth =
         oxcf->layer_target_bitrate[st_idx - 1];
     lc->avg_frame_size =
-        (int)((lc->target_bandwidth - prev_layer_target_bandwidth) /
-              (lc->framerate - prev_layer_framerate));
+        (int)round((lc->target_bandwidth - prev_layer_target_bandwidth) /
+                   (lc->framerate - prev_layer_framerate));
   }
 }
 
@@ -333,7 +339,8 @@ void vp9_update_spatial_layer_framerate(VP9_COMP *const cpi, double framerate) {
   RATE_CONTROL *const lrc = &lc->rc;
 
   lc->framerate = framerate;
-  lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
+  lrc->avg_frame_bandwidth =
+      (int)VPXMIN(lc->target_bandwidth / lc->framerate, INT_MAX);
   lrc->min_frame_bandwidth =
       (int)(lrc->avg_frame_bandwidth * oxcf->two_pass_vbrmin_section / 100);
   lrc->max_frame_bandwidth = (int)(((int64_t)lrc->avg_frame_bandwidth *
@@ -354,9 +361,10 @@ void vp9_restore_layer_context(VP9_COMP *const cpi) {
   cpi->alt_ref_source = lc->alt_ref_source;
   // Check if it is one_pass_cbr_svc mode and lc->speed > 0 (real-time mode
   // does not use speed = 0).
-  if (is_one_pass_cbr_svc(cpi) && lc->speed > 0) {
+  if (is_one_pass_svc(cpi) && lc->speed > 0) {
     cpi->oxcf.speed = lc->speed;
   }
+  cpi->loopfilter_ctrl = lc->loopfilter_ctrl;
   // Reset the frames_since_key and frames_to_key counters to their values
   // before the layer restore. Keep these defined for the stream (not layer).
   if (cpi->svc.number_temporal_layers > 1 ||
@@ -388,6 +396,8 @@ void vp9_save_layer_context(VP9_COMP *const cpi) {
   lc->twopass = cpi->twopass;
   lc->target_bandwidth = (int)oxcf->target_bandwidth;
   lc->alt_ref_source = cpi->alt_ref_source;
+  lc->frame_qp = cpi->common.base_qindex;
+  lc->MBs = cpi->common.MBs;
 
   // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers,
   // for the base temporal layer.
@@ -407,6 +417,9 @@ void vp9_save_layer_context(VP9_COMP *const cpi) {
     lc->actual_num_seg1_blocks = cr->actual_num_seg1_blocks;
     lc->actual_num_seg2_blocks = cr->actual_num_seg2_blocks;
     lc->counter_encode_maxq_scene_change = cr->counter_encode_maxq_scene_change;
+    lc->qindex_delta[0] = cr->qindex_delta[0];
+    lc->qindex_delta[1] = cr->qindex_delta[1];
+    lc->qindex_delta[2] = cr->qindex_delta[2];
   }
 }
 
@@ -461,30 +474,27 @@ static void reset_fb_idx_unused(VP9_COMP *const cpi) {
   // fb_idx for that reference to the first one used/referenced.
   // This is to avoid setting fb_idx for a reference to a slot that is not
   // used/needed (i.e., since that reference is not referenced or refreshed).
-  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
-                                    VP9_ALT_FLAG };
   MV_REFERENCE_FRAME ref_frame;
   MV_REFERENCE_FRAME first_ref = 0;
   int first_fb_idx = 0;
   int fb_idx[3] = { cpi->lst_fb_idx, cpi->gld_fb_idx, cpi->alt_fb_idx };
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
-    if (cpi->ref_frame_flags & flag_list[ref_frame]) {
+    if (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) {
       first_ref = ref_frame;
       first_fb_idx = fb_idx[ref_frame - 1];
       break;
     }
   }
   if (first_ref > 0) {
-    if (first_ref != LAST_FRAME &&
-        !(cpi->ref_frame_flags & flag_list[LAST_FRAME]) &&
+    if (first_ref != LAST_FRAME && !(cpi->ref_frame_flags & VP9_LAST_FLAG) &&
         !cpi->ext_refresh_last_frame)
       cpi->lst_fb_idx = first_fb_idx;
     else if (first_ref != GOLDEN_FRAME &&
-             !(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) &&
+             !(cpi->ref_frame_flags & VP9_GOLD_FLAG) &&
              !cpi->ext_refresh_golden_frame)
       cpi->gld_fb_idx = first_fb_idx;
     else if (first_ref != ALTREF_FRAME &&
-             !(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]) &&
+             !(cpi->ref_frame_flags & VP9_ALT_FLAG) &&
              !cpi->ext_refresh_alt_ref_frame)
       cpi->alt_fb_idx = first_fb_idx;
   }
@@ -729,8 +739,6 @@ static void set_flags_and_fb_idx_bypass_via_set_ref_frame_config(
 
 void vp9_copy_flags_ref_update_idx(VP9_COMP *const cpi) {
   SVC *const svc = &cpi->svc;
-  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
-                                    VP9_ALT_FLAG };
   int sl = svc->spatial_layer_id;
   svc->lst_fb_idx[sl] = cpi->lst_fb_idx;
   svc->gld_fb_idx[sl] = cpi->gld_fb_idx;
@@ -753,15 +761,12 @@ void vp9_copy_flags_ref_update_idx(VP9_COMP *const cpi) {
   svc->update_golden[sl] = (uint8_t)cpi->refresh_golden_frame;
   svc->update_altref[sl] = (uint8_t)cpi->refresh_alt_ref_frame;
 
-  svc->reference_last[sl] =
-      (uint8_t)(cpi->ref_frame_flags & flag_list[LAST_FRAME]);
-  svc->reference_golden[sl] =
-      (uint8_t)(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]);
-  svc->reference_altref[sl] =
-      (uint8_t)(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]);
+  svc->reference_last[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_LAST_FLAG);
+  svc->reference_golden[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_GOLD_FLAG);
+  svc->reference_altref[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_ALT_FLAG);
 }
 
-int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
+int vp9_one_pass_svc_start_layer(VP9_COMP *const cpi) {
   int width = 0, height = 0;
   SVC *const svc = &cpi->svc;
   LAYER_CONTEXT *lc = NULL;
@@ -797,9 +802,9 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
       for (sl = svc->number_spatial_layers - 1;
            sl >= svc->first_spatial_layer_to_encode; sl--) {
         int layer = sl * svc->number_temporal_layers + svc->temporal_layer_id;
-        LAYER_CONTEXT *const lc = &svc->layer_context[layer];
-        cpi->rc = lc->rc;
-        cpi->oxcf.target_bandwidth = lc->target_bandwidth;
+        LAYER_CONTEXT *const sl_lc = &svc->layer_context[layer];
+        cpi->rc = sl_lc->rc;
+        cpi->oxcf.target_bandwidth = sl_lc->target_bandwidth;
         if (vp9_test_drop(cpi)) {
           int sl2;
           // Set flag to force drop in encoding for this mode.
@@ -901,6 +906,10 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
     RATE_CONTROL *const lrc = &lc->rc;
     lrc->worst_quality = vp9_quantizer_to_qindex(lc->max_q);
     lrc->best_quality = vp9_quantizer_to_qindex(lc->min_q);
+    if (cpi->fixed_qp_onepass) {
+      lrc->worst_quality = cpi->rc.worst_quality;
+      lrc->best_quality = cpi->rc.best_quality;
+    }
   }
 
   if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC && svc->single_layer_svc == 1 &&
@@ -955,7 +964,7 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
   if (cpi->common.frame_type != KEY_FRAME && !cpi->ext_refresh_last_frame &&
       !cpi->ext_refresh_golden_frame && !cpi->ext_refresh_alt_ref_frame)
     svc->non_reference_frame = 1;
-  // For non-flexible mode, where update_buffer_slot is used, need to check if
+  // For flexible mode, where update_buffer_slot is used, need to check if
   // all buffer slots are not refreshed.
   if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
     if (svc->update_buffer_slot[svc->spatial_layer_id] != 0)
@@ -1044,17 +1053,17 @@ void vp9_svc_check_reset_layer_rc_flag(VP9_COMP *const cpi) {
   int sl, tl;
   for (sl = 0; sl < svc->number_spatial_layers; ++sl) {
     // Check for reset based on avg_frame_bandwidth for spatial layer sl.
-    int layer = LAYER_IDS_TO_IDX(sl, svc->number_temporal_layers - 1,
-                                 svc->number_temporal_layers);
-    LAYER_CONTEXT *lc = &svc->layer_context[layer];
+    const int spatial_layer_idx = LAYER_IDS_TO_IDX(
+        sl, svc->number_temporal_layers - 1, svc->number_temporal_layers);
+    LAYER_CONTEXT *lc = &svc->layer_context[spatial_layer_idx];
     RATE_CONTROL *lrc = &lc->rc;
     if (lrc->avg_frame_bandwidth > (3 * lrc->last_avg_frame_bandwidth >> 1) ||
         lrc->avg_frame_bandwidth < (lrc->last_avg_frame_bandwidth >> 1)) {
       // Reset for all temporal layers with spatial layer sl.
       for (tl = 0; tl < svc->number_temporal_layers; ++tl) {
-        int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
-        LAYER_CONTEXT *lc = &svc->layer_context[layer];
-        RATE_CONTROL *lrc = &lc->rc;
+        int temporal_layer_idx =
+            LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+        lrc = &svc->layer_context[temporal_layer_idx].rc;
         lrc->rc_1_frame = 0;
         lrc->rc_2_frame = 0;
         lrc->bits_off_target = lrc->optimal_buffer_level;
@@ -1079,15 +1088,14 @@ void vp9_svc_constrain_inter_layer_pred(VP9_COMP *const cpi) {
       svc->disable_inter_layer_pred == INTER_LAYER_PRED_OFF ||
       svc->drop_spatial_layer[sl - 1]) {
     MV_REFERENCE_FRAME ref_frame;
-    static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
-                                      VP9_ALT_FLAG };
     for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
       const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
-      if (yv12 != NULL && (cpi->ref_frame_flags & flag_list[ref_frame])) {
+      if (yv12 != NULL &&
+          (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) {
         const struct scale_factors *const scale_fac =
             &cm->frame_refs[ref_frame - 1].sf;
         if (vp9_is_scaled(scale_fac)) {
-          cpi->ref_frame_flags &= (~flag_list[ref_frame]);
+          cpi->ref_frame_flags &= (~ref_frame_to_flag(ref_frame));
           // Point golden/altref frame buffer index to last.
           if (!svc->simulcast_mode) {
             if (ref_frame == GOLDEN_FRAME)
@@ -1141,7 +1149,7 @@ void vp9_svc_constrain_inter_layer_pred(VP9_COMP *const cpi) {
 void vp9_svc_assert_constraints_pattern(VP9_COMP *const cpi) {
   SVC *const svc = &cpi->svc;
   // For fixed/non-flexible mode, the following constraint are expected,
-  // when inter-layer prediciton is on (default).
+  // when inter-layer prediction is on (default).
   if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS &&
       svc->disable_inter_layer_pred == INTER_LAYER_PRED_ON &&
       svc->framedrop_mode != LAYER_DROP) {
@@ -1242,6 +1250,7 @@ void vp9_svc_check_spatial_layer_sync(VP9_COMP *const cpi) {
 
 void vp9_svc_update_ref_frame_buffer_idx(VP9_COMP *const cpi) {
   SVC *const svc = &cpi->svc;
+  int i = 0;
   // Update the usage of frame buffer index for base spatial layers.
   if (svc->spatial_layer_id == 0) {
     if ((cpi->ref_frame_flags & VP9_LAST_FLAG) || cpi->refresh_last_frame)
@@ -1250,6 +1259,11 @@ void vp9_svc_update_ref_frame_buffer_idx(VP9_COMP *const cpi) {
       svc->fb_idx_base[cpi->gld_fb_idx] = 1;
     if ((cpi->ref_frame_flags & VP9_ALT_FLAG) || cpi->refresh_alt_ref_frame)
       svc->fb_idx_base[cpi->alt_fb_idx] = 1;
+    // For bypass/flexible mode: check for refresh slots.
+    if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
+      for (i = 0; i < REF_FRAMES; ++i)
+        if (svc->update_buffer_slot[0] & (1 << i)) svc->fb_idx_base[i] = 1;
+    }
   }
 }
 
@@ -1336,3 +1350,27 @@ void vp9_svc_adjust_avg_frame_qindex(VP9_COMP *const cpi) {
     }
   }
 }
+
+// SVC: skip encoding of enhancement layer if the layer target bandwidth = 0.
+// No need to set svc.skip_enhancement_layer if whole superframe will be
+// dropped.
+int vp9_svc_check_skip_enhancement_layer(VP9_COMP *const cpi) {
+  if (cpi->use_svc && cpi->svc.spatial_layer_id > 0 &&
+      cpi->oxcf.target_bandwidth == 0 &&
+      !(cpi->svc.framedrop_mode != LAYER_DROP &&
+        (cpi->svc.framedrop_mode != CONSTRAINED_FROM_ABOVE_DROP ||
+         cpi->svc
+             .force_drop_constrained_from_above[cpi->svc.number_spatial_layers -
+                                                1]) &&
+        cpi->svc.drop_spatial_layer[0])) {
+    cpi->svc.skip_enhancement_layer = 1;
+    vp9_rc_postencode_update_drop_frame(cpi);
+    cpi->ext_refresh_frame_flags_pending = 0;
+    cpi->last_frame_dropped = 1;
+    cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = 1;
+    cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id] = 1;
+    vp9_inc_frame_in_layer(cpi);
+    return 1;
+  }
+  return 0;
+}
diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h
index e7d9712aae..388a02789d 100644
--- a/vp9/encoder/vp9_svc_layercontext.h
+++ b/vp9/encoder/vp9_svc_layercontext.h
@@ -70,7 +70,11 @@ typedef struct {
   int actual_num_seg1_blocks;
   int actual_num_seg2_blocks;
   int counter_encode_maxq_scene_change;
+  int qindex_delta[3];
   uint8_t speed;
+  int loopfilter_ctrl;
+  int frame_qp;
+  int MBs;
 } LAYER_CONTEXT;
 
 typedef struct SVC {
@@ -172,6 +176,8 @@ typedef struct SVC {
   uint8_t fb_idx_temporal_layer_id[REF_FRAMES];
 
   int spatial_layer_sync[VPX_SS_MAX_LAYERS];
+  // Quantizer for each spatial layer.
+  int base_qindex[VPX_SS_MAX_LAYERS];
   uint8_t set_intra_only_frame;
   uint8_t previous_frame_is_intra_only;
   uint8_t superframe_has_layer_sync;
@@ -252,7 +258,7 @@ int vp9_denoise_svc_non_key(struct VP9_COMP *const cpi);
 
 void vp9_copy_flags_ref_update_idx(struct VP9_COMP *const cpi);
 
-int vp9_one_pass_cbr_svc_start_layer(struct VP9_COMP *const cpi);
+int vp9_one_pass_svc_start_layer(struct VP9_COMP *const cpi);
 
 void vp9_free_svc_cyclic_refresh(struct VP9_COMP *const cpi);
 
@@ -275,6 +281,8 @@ void vp9_svc_update_ref_frame(struct VP9_COMP *const cpi);
 void vp9_svc_adjust_frame_rate(struct VP9_COMP *const cpi);
 
 void vp9_svc_adjust_avg_frame_qindex(struct VP9_COMP *const cpi);
+
+int vp9_svc_check_skip_enhancement_layer(struct VP9_COMP *const cpi);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 701bb89287..986553a4a8 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -450,8 +450,6 @@ void vp9_highbd_apply_temporal_filter_c(
   // Apply the filter to luma
   for (row = 0; row < (int)block_height; row++) {
     for (col = 0; col < (int)block_width; col++) {
-      const int uv_row = row >> ss_y;
-      const int uv_col = col >> ss_x;
       const int filter_weight = get_filter_weight(
           row, col, block_height, block_width, blk_fw, use_32x32);
 
@@ -476,6 +474,8 @@ void vp9_highbd_apply_temporal_filter_c(
 
       // Sum the corresponding uv pixels to the current y modifier
       // Note we are rounding down instead of rounding to the nearest pixel.
+      uv_row = row >> ss_y;
+      uv_col = col >> ss_x;
       y_mod += u_diff_sse[uv_row * uv_diff_stride + uv_col];
       y_mod += v_diff_sse[uv_row * uv_diff_stride + uv_col];
 
@@ -777,16 +777,16 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td,
           // Assign higher weight to matching MB if it's error
           // score is lower. If not applying MC default behavior
           // is to weight all MBs equal.
-          blk_fw[0] = err < (thresh_low << THR_SHIFT)
-                          ? 2
-                          : err < (thresh_high << THR_SHIFT) ? 1 : 0;
+          blk_fw[0] = err < (thresh_low << THR_SHIFT)    ? 2
+                      : err < (thresh_high << THR_SHIFT) ? 1
+                                                         : 0;
           blk_fw[1] = blk_fw[2] = blk_fw[3] = blk_fw[0];
         } else {
           use_32x32 = 0;
           for (k = 0; k < 4; k++)
-            blk_fw[k] = blk_bestsme[k] < thresh_low
-                            ? 2
-                            : blk_bestsme[k] < thresh_high ? 1 : 0;
+            blk_fw[k] = blk_bestsme[k] < thresh_low    ? 2
+                        : blk_bestsme[k] < thresh_high ? 1
+                                                       : 0;
         }
 
         for (k = 0; k < 4; k++) {
diff --git a/vp9/encoder/x86/temporal_filter_constants.h b/vp9/encoder/vp9_temporal_filter_constants.h
similarity index 98%
rename from vp9/encoder/x86/temporal_filter_constants.h
rename to vp9/encoder/vp9_temporal_filter_constants.h
index 7dcedda192..8776dfc068 100644
--- a/vp9/encoder/x86/temporal_filter_constants.h
+++ b/vp9/encoder/vp9_temporal_filter_constants.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2019 The WebM project authors. All Rights Reserved.
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_VP9_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_
-#define VPX_VP9_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_
+#ifndef VPX_VP9_ENCODER_TEMPORAL_FILTER_CONSTANTS_H_
+#define VPX_VP9_ENCODER_TEMPORAL_FILTER_CONSTANTS_H_
 #include "./vpx_config.h"
 
 // Division using multiplication and shifting. The C implementation does:
@@ -407,4 +407,4 @@ static const uint32_t
 
 #define DIST_STRIDE ((BW) + 2)
 
-#endif  // VPX_VP9_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_
+#endif  // VPX_VP9_ENCODER_TEMPORAL_FILTER_CONSTANTS_H_
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 814d769be3..6c6c04493f 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -364,7 +364,7 @@ static void tokenize_b(int plane, int block, int row, int col,
   const PLANE_TYPE type = get_plane_type(plane);
   const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   const int16_t *scan, *nb;
-  const scan_order *so;
+  const ScanOrder *so;
   const int ref = is_inter_block(mi);
   unsigned int(*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
       td->rd_counts.coef_counts[tx_size][type][ref];
diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c
new file mode 100644
index 0000000000..b8910370e0
--- /dev/null
+++ b/vp9/encoder/vp9_tpl_model.c
@@ -0,0 +1,1541 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+
+#include "./vpx_dsp_rtcd.h"
+#if CONFIG_NON_GREEDY_MV
+#include "vp9/common/vp9_mvref_common.h"
+#endif
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_tpl_model.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx/vpx_codec.h"
+
+static int init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture,
+                           const GF_GROUP *gf_group, int *tpl_group_frames) {
+  VP9_COMMON *cm = &cpi->common;
+  int frame_idx = 0;
+  int i;
+  int gld_index = -1;
+  int alt_index = -1;
+  int lst_index = -1;
+  int arf_index_stack[MAX_ARF_LAYERS];
+  int arf_stack_size = 0;
+  int extend_frame_count = 0;
+  int pframe_qindex = cpi->tpl_stats[2].base_qindex;
+  int frame_gop_offset = 0;
+
+  RefCntBuffer *frame_bufs = cm->buffer_pool->frame_bufs;
+  int8_t recon_frame_index[REFS_PER_FRAME + MAX_ARF_LAYERS];
+
+  memset(recon_frame_index, -1, sizeof(recon_frame_index));
+  stack_init(arf_index_stack, MAX_ARF_LAYERS);
+
+  for (i = 0; i < FRAME_BUFFERS; ++i) {
+    if (frame_bufs[i].ref_count == 0) {
+      alloc_frame_mvs(cm, i);
+      if (vpx_realloc_frame_buffer(&frame_bufs[i].buf, cm->width, cm->height,
+                                   cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                   cm->use_highbitdepth,
+#endif
+                                   VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                                   NULL, NULL, NULL))
+        vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                           "Failed to allocate frame buffer");
+
+      recon_frame_index[frame_idx] = i;
+      ++frame_idx;
+
+      if (frame_idx >= REFS_PER_FRAME + cpi->oxcf.enable_auto_arf) break;
+    }
+  }
+
+  for (i = 0; i < REFS_PER_FRAME + 1; ++i) {
+    assert(recon_frame_index[i] >= 0);
+    cpi->tpl_recon_frames[i] = &frame_bufs[recon_frame_index[i]].buf;
+  }
+
+  *tpl_group_frames = 0;
+
+  // Initialize Golden reference frame.
+  gf_picture[0].frame = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+  for (i = 0; i < 3; ++i) gf_picture[0].ref_frame[i] = -1;
+  gf_picture[0].update_type = gf_group->update_type[0];
+  gld_index = 0;
+  ++*tpl_group_frames;
+
+  // Initialize base layer ARF frame
+  gf_picture[1].frame = cpi->Source;
+  gf_picture[1].ref_frame[0] = gld_index;
+  gf_picture[1].ref_frame[1] = lst_index;
+  gf_picture[1].ref_frame[2] = alt_index;
+  gf_picture[1].update_type = gf_group->update_type[1];
+  alt_index = 1;
+  ++*tpl_group_frames;
+
+  // Initialize P frames
+  for (frame_idx = 2; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) {
+    struct lookahead_entry *buf;
+    frame_gop_offset = gf_group->frame_gop_index[frame_idx];
+    buf = vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1);
+
+    if (buf == NULL) break;
+
+    gf_picture[frame_idx].frame = &buf->img;
+    gf_picture[frame_idx].ref_frame[0] = gld_index;
+    gf_picture[frame_idx].ref_frame[1] = lst_index;
+    gf_picture[frame_idx].ref_frame[2] = alt_index;
+    gf_picture[frame_idx].update_type = gf_group->update_type[frame_idx];
+
+    switch (gf_group->update_type[frame_idx]) {
+      case ARF_UPDATE:
+        stack_push(arf_index_stack, alt_index, arf_stack_size);
+        ++arf_stack_size;
+        alt_index = frame_idx;
+        break;
+      case LF_UPDATE: lst_index = frame_idx; break;
+      case OVERLAY_UPDATE:
+        gld_index = frame_idx;
+        alt_index = stack_pop(arf_index_stack, arf_stack_size);
+        --arf_stack_size;
+        break;
+      case USE_BUF_FRAME:
+        lst_index = alt_index;
+        alt_index = stack_pop(arf_index_stack, arf_stack_size);
+        --arf_stack_size;
+        break;
+      default: break;
+    }
+
+    ++*tpl_group_frames;
+
+    // The length of group of pictures is baseline_gf_interval, plus the
+    // beginning golden frame from last GOP, plus the last overlay frame in
+    // the same GOP.
+    if (frame_idx == gf_group->gf_group_size) break;
+  }
+
+  alt_index = -1;
+  ++frame_idx;
+  ++frame_gop_offset;
+
+  // Extend two frames outside the current gf group.
+  for (; frame_idx < MAX_LAG_BUFFERS && extend_frame_count < 2; ++frame_idx) {
+    struct lookahead_entry *buf =
+        vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1);
+
+    if (buf == NULL) break;
+
+    cpi->tpl_stats[frame_idx].base_qindex = pframe_qindex;
+
+    gf_picture[frame_idx].frame = &buf->img;
+    gf_picture[frame_idx].ref_frame[0] = gld_index;
+    gf_picture[frame_idx].ref_frame[1] = lst_index;
+    gf_picture[frame_idx].ref_frame[2] = alt_index;
+    gf_picture[frame_idx].update_type = LF_UPDATE;
+    lst_index = frame_idx;
+    ++*tpl_group_frames;
+    ++extend_frame_count;
+    ++frame_gop_offset;
+  }
+
+  return extend_frame_count;
+}
+
+static void init_tpl_stats(VP9_COMP *cpi) {
+  int frame_idx;
+  for (frame_idx = 0; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) {
+    TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
+    memset(tpl_frame->tpl_stats_ptr, 0,
+           tpl_frame->height * tpl_frame->width *
+               sizeof(*tpl_frame->tpl_stats_ptr));
+    tpl_frame->is_valid = 0;
+  }
+}
+
+static void free_tpl_frame_stats_list(VpxTplGopStats *tpl_gop_stats) {
+  int frame_idx;
+  for (frame_idx = 0; frame_idx < tpl_gop_stats->size; ++frame_idx) {
+    vpx_free(tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list);
+  }
+  vpx_free(tpl_gop_stats->frame_stats_list);
+}
+
+static void init_tpl_stats_before_propagation(
+    struct vpx_internal_error_info *error_info, VpxTplGopStats *tpl_gop_stats,
+    TplDepFrame *tpl_stats, int tpl_gop_frames, int frame_width,
+    int frame_height) {
+  int frame_idx;
+  free_tpl_frame_stats_list(tpl_gop_stats);
+  CHECK_MEM_ERROR(
+      error_info, tpl_gop_stats->frame_stats_list,
+      vpx_calloc(tpl_gop_frames, sizeof(*tpl_gop_stats->frame_stats_list)));
+  tpl_gop_stats->size = tpl_gop_frames;
+  for (frame_idx = 0; frame_idx < tpl_gop_frames; ++frame_idx) {
+    const int mi_rows = tpl_stats[frame_idx].height;
+    const int mi_cols = tpl_stats[frame_idx].width;
+    CHECK_MEM_ERROR(
+        error_info, tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list,
+        vpx_calloc(
+            mi_rows * mi_cols,
+            sizeof(
+                *tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list)));
+    tpl_gop_stats->frame_stats_list[frame_idx].num_blocks = mi_rows * mi_cols;
+    tpl_gop_stats->frame_stats_list[frame_idx].frame_width = frame_width;
+    tpl_gop_stats->frame_stats_list[frame_idx].frame_height = frame_height;
+  }
+}
+
+#if CONFIG_NON_GREEDY_MV
+static uint32_t full_pixel_motion_search(VP9_COMP *cpi, ThreadData *td,
+                                         MotionField *motion_field,
+                                         int frame_idx, uint8_t *cur_frame_buf,
+                                         uint8_t *ref_frame_buf, int stride,
+                                         BLOCK_SIZE bsize, int mi_row,
+                                         int mi_col, MV *mv) {
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
+  int step_param;
+  uint32_t bestsme = UINT_MAX;
+  const MvLimits tmp_mv_limits = x->mv_limits;
+  // lambda is used to adjust the importance of motion vector consistency.
+  // TODO(angiebird): Figure out lambda's proper value.
+  const int lambda = cpi->tpl_stats[frame_idx].lambda;
+  int_mv nb_full_mvs[NB_MVS_NUM];
+  int nb_full_mv_num;
+
+  MV best_ref_mv1 = { 0, 0 };
+  MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
+
+  best_ref_mv1_full.col = best_ref_mv1.col >> 3;
+  best_ref_mv1_full.row = best_ref_mv1.row >> 3;
+
+  // Setup frame pointers
+  x->plane[0].src.buf = cur_frame_buf;
+  x->plane[0].src.stride = stride;
+  xd->plane[0].pre[0].buf = ref_frame_buf;
+  xd->plane[0].pre[0].stride = stride;
+
+  step_param = mv_sf->reduce_first_step_size;
+  step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2);
+
+  vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1);
+
+  nb_full_mv_num =
+      vp9_prepare_nb_full_mvs(motion_field, mi_row, mi_col, nb_full_mvs);
+  vp9_full_pixel_diamond_new(cpi, x, bsize, &best_ref_mv1_full, step_param,
+                             lambda, 1, nb_full_mvs, nb_full_mv_num, mv);
+
+  /* restore UMV window */
+  x->mv_limits = tmp_mv_limits;
+
+  return bestsme;
+}
+
+static uint32_t sub_pixel_motion_search(VP9_COMP *cpi, ThreadData *td,
+                                        uint8_t *cur_frame_buf,
+                                        uint8_t *ref_frame_buf, int stride,
+                                        BLOCK_SIZE bsize, MV *mv) {
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
+  uint32_t bestsme = UINT_MAX;
+  uint32_t distortion;
+  uint32_t sse;
+  int cost_list[5];
+
+  MV best_ref_mv1 = { 0, 0 };
+
+  // Setup frame pointers
+  x->plane[0].src.buf = cur_frame_buf;
+  x->plane[0].src.stride = stride;
+  xd->plane[0].pre[0].buf = ref_frame_buf;
+  xd->plane[0].pre[0].stride = stride;
+
+  // TODO(yunqing): may use higher tap interp filter than 2 taps.
+  // Ignore mv costing by sending NULL pointer instead of cost array
+  bestsme = cpi->find_fractional_mv_step(
+      x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit,
+      &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level,
+      cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0,
+      USE_2_TAPS);
+
+  return bestsme;
+}
+
+#else  // CONFIG_NON_GREEDY_MV
+static uint32_t motion_compensated_prediction(VP9_COMP *cpi, ThreadData *td,
+                                              uint8_t *cur_frame_buf,
+                                              uint8_t *ref_frame_buf,
+                                              int stride, BLOCK_SIZE bsize,
+                                              MV *mv) {
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
+  const SEARCH_METHODS search_method = NSTEP;
+  int step_param;
+  int sadpb = x->sadperbit16;
+  uint32_t bestsme = UINT_MAX;
+  uint32_t distortion;
+  uint32_t sse;
+  int cost_list[5];
+  const MvLimits tmp_mv_limits = x->mv_limits;
+
+  MV best_ref_mv1 = { 0, 0 };
+  MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
+
+  best_ref_mv1_full.col = best_ref_mv1.col >> 3;
+  best_ref_mv1_full.row = best_ref_mv1.row >> 3;
+
+  // Setup frame pointers
+  x->plane[0].src.buf = cur_frame_buf;
+  x->plane[0].src.stride = stride;
+  xd->plane[0].pre[0].buf = ref_frame_buf;
+  xd->plane[0].pre[0].stride = stride;
+
+  step_param = mv_sf->reduce_first_step_size;
+  step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2);
+
+  vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1);
+
+  vp9_full_pixel_search(cpi, x, bsize, &best_ref_mv1_full, step_param,
+                        search_method, sadpb, cond_cost_list(cpi, cost_list),
+                        &best_ref_mv1, mv, 0, 0);
+
+  /* restore UMV window */
+  x->mv_limits = tmp_mv_limits;
+
+  // TODO(yunqing): may use higher tap interp filter than 2 taps.
+  // Ignore mv costing by sending NULL pointer instead of cost array
+  bestsme = cpi->find_fractional_mv_step(
+      x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit,
+      &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level,
+      cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0,
+      USE_2_TAPS);
+
+  return bestsme;
+}
+#endif
+
+static int get_overlap_area(int grid_pos_row, int grid_pos_col, int ref_pos_row,
+                            int ref_pos_col, int block, BLOCK_SIZE bsize) {
+  int width = 0, height = 0;
+  int bw = 4 << b_width_log2_lookup[bsize];
+  int bh = 4 << b_height_log2_lookup[bsize];
+
+  switch (block) {
+    case 0:
+      width = grid_pos_col + bw - ref_pos_col;
+      height = grid_pos_row + bh - ref_pos_row;
+      break;
+    case 1:
+      width = ref_pos_col + bw - grid_pos_col;
+      height = grid_pos_row + bh - ref_pos_row;
+      break;
+    case 2:
+      width = grid_pos_col + bw - ref_pos_col;
+      height = ref_pos_row + bh - grid_pos_row;
+      break;
+    case 3:
+      width = ref_pos_col + bw - grid_pos_col;
+      height = ref_pos_row + bh - grid_pos_row;
+      break;
+    default: assert(0);
+  }
+
+  return width * height;
+}
+
+static int round_floor(int ref_pos, int bsize_pix) {
+  int round;
+  if (ref_pos < 0)
+    round = -(1 + (-ref_pos - 1) / bsize_pix);
+  else
+    round = ref_pos / bsize_pix;
+
+  return round;
+}
+
+static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col,
+                            BLOCK_SIZE bsize, int stride) {
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col];
+  int idx, idy;
+
+  for (idy = 0; idy < mi_height; ++idy) {
+    for (idx = 0; idx < mi_width; ++idx) {
+      TplDepStats *tpl_ptr = &tpl_stats[(mi_row + idy) * stride + mi_col + idx];
+      const int64_t mc_flow = tpl_ptr->mc_flow;
+      const int64_t mc_ref_cost = tpl_ptr->mc_ref_cost;
+      *tpl_ptr = *src_stats;
+      tpl_ptr->mc_flow = mc_flow;
+      tpl_ptr->mc_ref_cost = mc_ref_cost;
+      tpl_ptr->mc_dep_cost = tpl_ptr->intra_cost + tpl_ptr->mc_flow;
+    }
+  }
+}
+
+static void tpl_store_before_propagation(VpxTplBlockStats *tpl_block_stats,
+                                         TplDepStats *tpl_stats, int mi_row,
+                                         int mi_col, BLOCK_SIZE bsize,
+                                         int stride, int64_t recon_error,
+                                         int64_t rate_cost, int ref_frame_idx) {
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col];
+  int idx, idy;
+
+  for (idy = 0; idy < mi_height; ++idy) {
+    for (idx = 0; idx < mi_width; ++idx) {
+      VpxTplBlockStats *tpl_block_stats_ptr =
+          &tpl_block_stats[(mi_row + idy) * stride + mi_col + idx];
+      tpl_block_stats_ptr->row = mi_row * 8;
+      tpl_block_stats_ptr->col = mi_col * 8;
+      tpl_block_stats_ptr->inter_cost = src_stats->inter_cost;
+      tpl_block_stats_ptr->intra_cost = src_stats->intra_cost;
+      tpl_block_stats_ptr->recrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2;
+      tpl_block_stats_ptr->recrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2;
+      tpl_block_stats_ptr->mv_r = src_stats->mv.as_mv.row;
+      tpl_block_stats_ptr->mv_c = src_stats->mv.as_mv.col;
+      tpl_block_stats_ptr->ref_frame_index = ref_frame_idx;
+    }
+  }
+}
+
+static void tpl_model_update_b(TplDepFrame *tpl_frame, TplDepStats *tpl_stats,
+                               int mi_row, int mi_col, const BLOCK_SIZE bsize) {
+  TplDepFrame *ref_tpl_frame = &tpl_frame[tpl_stats->ref_frame_index];
+  TplDepStats *ref_stats = ref_tpl_frame->tpl_stats_ptr;
+  MV mv = tpl_stats->mv.as_mv;
+  int mv_row = mv.row >> 3;
+  int mv_col = mv.col >> 3;
+
+  int ref_pos_row = mi_row * MI_SIZE + mv_row;
+  int ref_pos_col = mi_col * MI_SIZE + mv_col;
+
+  const int bw = 4 << b_width_log2_lookup[bsize];
+  const int bh = 4 << b_height_log2_lookup[bsize];
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  const int pix_num = bw * bh;
+
+  // top-left on grid block location in pixel
+  int grid_pos_row_base = round_floor(ref_pos_row, bh) * bh;
+  int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw;
+  int block;
+
+  for (block = 0; block < 4; ++block) {
+    int grid_pos_row = grid_pos_row_base + bh * (block >> 1);
+    int grid_pos_col = grid_pos_col_base + bw * (block & 0x01);
+
+    if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE &&
+        grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) {
+      int overlap_area = get_overlap_area(
+          grid_pos_row, grid_pos_col, ref_pos_row, ref_pos_col, block, bsize);
+      int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height;
+      int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width;
+
+      int64_t mc_flow = tpl_stats->mc_dep_cost -
+                        (tpl_stats->mc_dep_cost * tpl_stats->inter_cost) /
+                            tpl_stats->intra_cost;
+
+      int idx, idy;
+
+      for (idy = 0; idy < mi_height; ++idy) {
+        for (idx = 0; idx < mi_width; ++idx) {
+          TplDepStats *des_stats =
+              &ref_stats[(ref_mi_row + idy) * ref_tpl_frame->stride +
+                         (ref_mi_col + idx)];
+
+          des_stats->mc_flow += (mc_flow * overlap_area) / pix_num;
+          des_stats->mc_ref_cost +=
+              ((tpl_stats->intra_cost - tpl_stats->inter_cost) * overlap_area) /
+              pix_num;
+          assert(overlap_area >= 0);
+        }
+      }
+    }
+  }
+}
+
+static void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats,
+                             int mi_row, int mi_col, const BLOCK_SIZE bsize) {
+  int idx, idy;
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+
+  for (idy = 0; idy < mi_height; ++idy) {
+    for (idx = 0; idx < mi_width; ++idx) {
+      TplDepStats *tpl_ptr =
+          &tpl_stats[(mi_row + idy) * tpl_frame->stride + (mi_col + idx)];
+      tpl_model_update_b(tpl_frame, tpl_ptr, mi_row + idy, mi_col + idx,
+                         BLOCK_8X8);
+    }
+  }
+}
+
+static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff,
+                               tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                               TX_SIZE tx_size, int64_t *recon_error,
+                               int64_t *sse, uint16_t *eob) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size];
+  int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]];
+  const int shift = tx_size == TX_32X32 ? 0 : 2;
+
+  // skip block condition should be handled before this is called.
+  assert(!x->skip_block);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vp9_highbd_quantize_fp_32x32(coeff, pix_num, p, qcoeff, dqcoeff,
+                                 pd->dequant, eob, scan_order);
+  } else {
+    vp9_quantize_fp_32x32(coeff, pix_num, p, qcoeff, dqcoeff, pd->dequant, eob,
+                          scan_order);
+  }
+#else
+  vp9_quantize_fp_32x32(coeff, pix_num, p, qcoeff, dqcoeff, pd->dequant, eob,
+                        scan_order);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  *recon_error = vp9_block_error(coeff, dqcoeff, pix_num, sse) >> shift;
+  *recon_error = VPXMAX(*recon_error, 1);
+
+  *sse = (*sse) >> shift;
+  *sse = VPXMAX(*sse, 1);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
+                             TX_SIZE tx_size) {
+  // TODO(sdeng): Implement SIMD based high bit-depth Hadamard transforms.
+  switch (tx_size) {
+    case TX_8X8: vpx_highbd_hadamard_8x8(src_diff, bw, coeff); break;
+    case TX_16X16: vpx_highbd_hadamard_16x16(src_diff, bw, coeff); break;
+    case TX_32X32: vpx_highbd_hadamard_32x32(src_diff, bw, coeff); break;
+    default: assert(0);
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+void vp9_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
+                      TX_SIZE tx_size) {
+  switch (tx_size) {
+    case TX_8X8: vpx_hadamard_8x8(src_diff, bw, coeff); break;
+    case TX_16X16: vpx_hadamard_16x16(src_diff, bw, coeff); break;
+    case TX_32X32: vpx_hadamard_32x32(src_diff, bw, coeff); break;
+    default: assert(0);
+  }
+}
+
+static void set_mv_limits(const VP9_COMMON *cm, MACROBLOCK *x, int mi_row,
+                          int mi_col) {
+  x->mv_limits.row_min = -((mi_row * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND));
+  x->mv_limits.row_max =
+      (cm->mi_rows - 1 - mi_row) * MI_SIZE + (17 - 2 * VP9_INTERP_EXTEND);
+  x->mv_limits.col_min = -((mi_col * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND));
+  x->mv_limits.col_max =
+      ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND);
+}
+
+static int rate_estimator(const tran_low_t *qcoeff, int eob, TX_SIZE tx_size) {
+  const ScanOrder *const scan_order = &vp9_scan_orders[tx_size][DCT_DCT];
+  int rate_cost = 1;
+  int idx;
+  assert((1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]) >= eob);
+  for (idx = 0; idx < eob; ++idx) {
+    unsigned int abs_level = abs(qcoeff[scan_order->scan[idx]]);
+    rate_cost += get_msb(abs_level + 1) + 1 + (abs_level > 0);
+  }
+
+  return (rate_cost << VP9_PROB_COST_SHIFT);
+}
+
+static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
+                            struct scale_factors *sf, GF_PICTURE *gf_picture,
+                            int frame_idx, TplDepFrame *tpl_frame,
+                            int16_t *src_diff, tran_low_t *coeff,
+                            tran_low_t *qcoeff, tran_low_t *dqcoeff, int mi_row,
+                            int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size,
+                            YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor,
+                            int64_t *recon_error, int64_t *rate_cost,
+                            int64_t *sse, int *ref_frame_idx) {
+  VP9_COMMON *cm = &cpi->common;
+  ThreadData *td = &cpi->td;
+
+  const int bw = 4 << b_width_log2_lookup[bsize];
+  const int bh = 4 << b_height_log2_lookup[bsize];
+  const int pix_num = bw * bh;
+  int best_rf_idx = -1;
+  int_mv best_mv;
+  int64_t best_inter_cost = INT64_MAX;
+  int64_t inter_cost;
+  int rf_idx;
+  const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP];
+
+  int64_t best_intra_cost = INT64_MAX;
+  int64_t intra_cost;
+  PREDICTION_MODE mode;
+  int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
+  MODE_INFO mi_above, mi_left;
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  TplDepStats *tpl_stats =
+      &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col];
+
+  xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
+  xd->mb_to_bottom_edge = ((cm->mi_rows - 1 - mi_row) * MI_SIZE) * 8;
+  xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
+  xd->mb_to_right_edge = ((cm->mi_cols - 1 - mi_col) * MI_SIZE) * 8;
+  xd->above_mi = (mi_row > 0) ? &mi_above : NULL;
+  xd->left_mi = (mi_col > 0) ? &mi_left : NULL;
+
+  // Intra prediction search
+  for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
+    uint8_t *src, *dst;
+    int src_stride, dst_stride;
+
+    src = xd->cur_buf->y_buffer + mb_y_offset;
+    src_stride = xd->cur_buf->y_stride;
+
+    dst = &predictor[0];
+    dst_stride = bw;
+
+    xd->mi[0]->sb_type = bsize;
+    xd->mi[0]->ref_frame[0] = INTRA_FRAME;
+
+    vp9_predict_intra_block(xd, b_width_log2_lookup[bsize], tx_size, mode, src,
+                            src_stride, dst, dst_stride, 0, 0, 0);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      vpx_highbd_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst,
+                                dst_stride, xd->bd);
+      vp9_highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size);
+      intra_cost = vpx_highbd_satd(coeff, pix_num);
+    } else {
+      vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst,
+                         dst_stride);
+      vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size);
+      intra_cost = vpx_satd(coeff, pix_num);
+    }
+#else
+    vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, dst_stride);
+    vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size);
+    intra_cost = vpx_satd(coeff, pix_num);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    if (intra_cost < best_intra_cost) best_intra_cost = intra_cost;
+  }
+
+  // Motion compensated prediction
+  best_mv.as_int = 0;
+
+  set_mv_limits(cm, x, mi_row, mi_col);
+
+  for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
+    int_mv mv;
+#if CONFIG_NON_GREEDY_MV
+    MotionField *motion_field;
+#endif
+    if (ref_frame[rf_idx] == NULL) continue;
+
+#if CONFIG_NON_GREEDY_MV
+    (void)td;
+    motion_field = vp9_motion_field_info_get_motion_field(
+        &cpi->motion_field_info, frame_idx, rf_idx, bsize);
+    mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col);
+#else
+    motion_compensated_prediction(cpi, td, xd->cur_buf->y_buffer + mb_y_offset,
+                                  ref_frame[rf_idx]->y_buffer + mb_y_offset,
+                                  xd->cur_buf->y_stride, bsize, &mv.as_mv);
+#endif
+
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      vp9_highbd_build_inter_predictor(
+          CONVERT_TO_SHORTPTR(ref_frame[rf_idx]->y_buffer + mb_y_offset),
+          ref_frame[rf_idx]->y_stride, CONVERT_TO_SHORTPTR(&predictor[0]), bw,
+          &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE,
+          mi_row * MI_SIZE, xd->bd);
+      vpx_highbd_subtract_block(
+          bh, bw, src_diff, bw, xd->cur_buf->y_buffer + mb_y_offset,
+          xd->cur_buf->y_stride, &predictor[0], bw, xd->bd);
+      vp9_highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size);
+      inter_cost = vpx_highbd_satd(coeff, pix_num);
+    } else {
+      vp9_build_inter_predictor(
+          ref_frame[rf_idx]->y_buffer + mb_y_offset,
+          ref_frame[rf_idx]->y_stride, &predictor[0], bw, &mv.as_mv, sf, bw, bh,
+          0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE);
+      vpx_subtract_block(bh, bw, src_diff, bw,
+                         xd->cur_buf->y_buffer + mb_y_offset,
+                         xd->cur_buf->y_stride, &predictor[0], bw);
+      vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size);
+      inter_cost = vpx_satd(coeff, pix_num);
+    }
+#else
+    vp9_build_inter_predictor(ref_frame[rf_idx]->y_buffer + mb_y_offset,
+                              ref_frame[rf_idx]->y_stride, &predictor[0], bw,
+                              &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3,
+                              mi_col * MI_SIZE, mi_row * MI_SIZE);
+    vpx_subtract_block(bh, bw, src_diff, bw,
+                       xd->cur_buf->y_buffer + mb_y_offset,
+                       xd->cur_buf->y_stride, &predictor[0], bw);
+    vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size);
+    inter_cost = vpx_satd(coeff, pix_num);
+#endif
+
+    if (inter_cost < best_inter_cost) {
+      uint16_t eob = 0;
+      best_rf_idx = rf_idx;
+      best_inter_cost = inter_cost;
+      best_mv.as_int = mv.as_int;
+      // Since best_inter_cost is initialized as INT64_MAX, recon_error and
+      // rate_cost will be calculated with the best reference frame.
+      get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, recon_error,
+                         sse, &eob);
+      *rate_cost = rate_estimator(qcoeff, eob, tx_size);
+    }
+  }
+  best_intra_cost = VPXMAX(best_intra_cost, 1);
+  best_inter_cost = VPXMIN(best_intra_cost, best_inter_cost);
+  tpl_stats->inter_cost = VPXMAX(
+      1, (best_inter_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width));
+  tpl_stats->intra_cost = VPXMAX(
+      1, (best_intra_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width));
+  tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx];
+  tpl_stats->mv.as_int = best_mv.as_int;
+  *ref_frame_idx = best_rf_idx;
+}
+
+#if CONFIG_NON_GREEDY_MV
+static int get_block_src_pred_buf(MACROBLOCKD *xd, GF_PICTURE *gf_picture,
+                                  int frame_idx, int rf_idx, int mi_row,
+                                  int mi_col, struct buf_2d *src,
+                                  struct buf_2d *pre) {
+  const int mb_y_offset =
+      mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
+  YV12_BUFFER_CONFIG *ref_frame = NULL;
+  int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx];
+  if (ref_frame_idx != -1) {
+    ref_frame = gf_picture[ref_frame_idx].frame;
+    src->buf = xd->cur_buf->y_buffer + mb_y_offset;
+    src->stride = xd->cur_buf->y_stride;
+    pre->buf = ref_frame->y_buffer + mb_y_offset;
+    pre->stride = ref_frame->y_stride;
+    assert(src->stride == pre->stride);
+    return 1;
+  } else {
+    printf("invalid ref_frame_idx");
+    assert(ref_frame_idx != -1);
+    return 0;
+  }
+}
+
+#define kMvPreCheckLines 5
+#define kMvPreCheckSize 15
+
+#define MV_REF_POS_NUM 3
+POSITION mv_ref_pos[MV_REF_POS_NUM] = {
+  { -1, 0 },
+  { 0, -1 },
+  { -1, -1 },
+};
+
+static int_mv *get_select_mv(VP9_COMP *cpi, TplDepFrame *tpl_frame, int mi_row,
+                             int mi_col) {
+  return &cpi->select_mv_arr[mi_row * tpl_frame->stride + mi_col];
+}
+
+static int_mv find_ref_mv(int mv_mode, VP9_COMP *cpi, TplDepFrame *tpl_frame,
+                          BLOCK_SIZE bsize, int mi_row, int mi_col) {
+  int i;
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  int_mv nearest_mv, near_mv, invalid_mv;
+  nearest_mv.as_int = INVALID_MV;
+  near_mv.as_int = INVALID_MV;
+  invalid_mv.as_int = INVALID_MV;
+  for (i = 0; i < MV_REF_POS_NUM; ++i) {
+    int nb_row = mi_row + mv_ref_pos[i].row * mi_height;
+    int nb_col = mi_col + mv_ref_pos[i].col * mi_width;
+    assert(mv_ref_pos[i].row <= 0);
+    assert(mv_ref_pos[i].col <= 0);
+    if (nb_row >= 0 && nb_col >= 0) {
+      if (nearest_mv.as_int == INVALID_MV) {
+        nearest_mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col);
+      } else {
+        int_mv mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col);
+        if (mv.as_int == nearest_mv.as_int) {
+          continue;
+        } else {
+          near_mv = mv;
+          break;
+        }
+      }
+    }
+  }
+  if (nearest_mv.as_int == INVALID_MV) {
+    nearest_mv.as_mv.row = 0;
+    nearest_mv.as_mv.col = 0;
+  }
+  if (near_mv.as_int == INVALID_MV) {
+    near_mv.as_mv.row = 0;
+    near_mv.as_mv.col = 0;
+  }
+  if (mv_mode == NEAREST_MV_MODE) {
+    return nearest_mv;
+  }
+  if (mv_mode == NEAR_MV_MODE) {
+    return near_mv;
+  }
+  assert(0);
+  return invalid_mv;
+}
+
+static int_mv get_mv_from_mv_mode(int mv_mode, VP9_COMP *cpi,
+                                  MotionField *motion_field,
+                                  TplDepFrame *tpl_frame, BLOCK_SIZE bsize,
+                                  int mi_row, int mi_col) {
+  int_mv mv;
+  switch (mv_mode) {
+    case ZERO_MV_MODE:
+      mv.as_mv.row = 0;
+      mv.as_mv.col = 0;
+      break;
+    case NEW_MV_MODE:
+      mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col);
+      break;
+    case NEAREST_MV_MODE:
+      mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col);
+      break;
+    case NEAR_MV_MODE:
+      mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col);
+      break;
+    default:
+      mv.as_int = INVALID_MV;
+      assert(0);
+      break;
+  }
+  return mv;
+}
+
+static double get_mv_dist(int mv_mode, VP9_COMP *cpi, MACROBLOCKD *xd,
+                          GF_PICTURE *gf_picture, MotionField *motion_field,
+                          int frame_idx, TplDepFrame *tpl_frame, int rf_idx,
+                          BLOCK_SIZE bsize, int mi_row, int mi_col,
+                          int_mv *mv) {
+  uint32_t sse;
+  struct buf_2d src;
+  struct buf_2d pre;
+  MV full_mv;
+  *mv = get_mv_from_mv_mode(mv_mode, cpi, motion_field, tpl_frame, bsize,
+                            mi_row, mi_col);
+  full_mv = get_full_mv(&mv->as_mv);
+  if (get_block_src_pred_buf(xd, gf_picture, frame_idx, rf_idx, mi_row, mi_col,
+                             &src, &pre)) {
+    // TODO(angiebird): Consider subpixel when computing the sse.
+    cpi->fn_ptr[bsize].vf(src.buf, src.stride, get_buf_from_mv(&pre, &full_mv),
+                          pre.stride, &sse);
+    return (double)(sse << VP9_DIST_SCALE_LOG2);
+  } else {
+    assert(0);
+    return 0;
+  }
+}
+
+static int get_mv_mode_cost(int mv_mode) {
+  // TODO(angiebird): The probabilities are roughly inferred from
+  // default_inter_mode_probs. Check if there is a better way to set the
+  // probabilities.
+  const int zero_mv_prob = 16;
+  const int new_mv_prob = 24 * 1;
+  const int ref_mv_prob = 256 - zero_mv_prob - new_mv_prob;
+  assert(zero_mv_prob + new_mv_prob + ref_mv_prob == 256);
+  switch (mv_mode) {
+    case ZERO_MV_MODE: return vp9_prob_cost[zero_mv_prob]; break;
+    case NEW_MV_MODE: return vp9_prob_cost[new_mv_prob]; break;
+    case NEAREST_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break;
+    case NEAR_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break;
+    default: assert(0); return -1;
+  }
+}
+
+static INLINE double get_mv_diff_cost(MV *new_mv, MV *ref_mv) {
+  double mv_diff_cost = log2(1 + abs(new_mv->row - ref_mv->row)) +
+                        log2(1 + abs(new_mv->col - ref_mv->col));
+  mv_diff_cost *= (1 << VP9_PROB_COST_SHIFT);
+  return mv_diff_cost;
+}
+static double get_mv_cost(int mv_mode, VP9_COMP *cpi, MotionField *motion_field,
+                          TplDepFrame *tpl_frame, BLOCK_SIZE bsize, int mi_row,
+                          int mi_col) {
+  double mv_cost = get_mv_mode_cost(mv_mode);
+  if (mv_mode == NEW_MV_MODE) {
+    MV new_mv = get_mv_from_mv_mode(mv_mode, cpi, motion_field, tpl_frame,
+                                    bsize, mi_row, mi_col)
+                    .as_mv;
+    MV nearest_mv = get_mv_from_mv_mode(NEAREST_MV_MODE, cpi, motion_field,
+                                        tpl_frame, bsize, mi_row, mi_col)
+                        .as_mv;
+    MV near_mv = get_mv_from_mv_mode(NEAR_MV_MODE, cpi, motion_field, tpl_frame,
+                                     bsize, mi_row, mi_col)
+                     .as_mv;
+    double nearest_cost = get_mv_diff_cost(&new_mv, &nearest_mv);
+    double near_cost = get_mv_diff_cost(&new_mv, &near_mv);
+    mv_cost += nearest_cost < near_cost ? nearest_cost : near_cost;
+  }
+  return mv_cost;
+}
+
+static double eval_mv_mode(int mv_mode, VP9_COMP *cpi, MACROBLOCK *x,
+                           GF_PICTURE *gf_picture, MotionField *motion_field,
+                           int frame_idx, TplDepFrame *tpl_frame, int rf_idx,
+                           BLOCK_SIZE bsize, int mi_row, int mi_col,
+                           int_mv *mv) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  double mv_dist =
+      get_mv_dist(mv_mode, cpi, xd, gf_picture, motion_field, frame_idx,
+                  tpl_frame, rf_idx, bsize, mi_row, mi_col, mv);
+  double mv_cost =
+      get_mv_cost(mv_mode, cpi, motion_field, tpl_frame, bsize, mi_row, mi_col);
+  double mult = 180;
+
+  return mv_cost + mult * log2f(1 + mv_dist);
+}
+
+static int find_best_ref_mv_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                                 GF_PICTURE *gf_picture,
+                                 MotionField *motion_field, int frame_idx,
+                                 TplDepFrame *tpl_frame, int rf_idx,
+                                 BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                 double *rd, int_mv *mv) {
+  int best_mv_mode = ZERO_MV_MODE;
+  int update = 0;
+  int mv_mode;
+  *rd = 0;
+  for (mv_mode = 0; mv_mode < MAX_MV_MODE; ++mv_mode) {
+    double this_rd;
+    int_mv this_mv;
+    if (mv_mode == NEW_MV_MODE) {
+      continue;
+    }
+    this_rd = eval_mv_mode(mv_mode, cpi, x, gf_picture, motion_field, frame_idx,
+                           tpl_frame, rf_idx, bsize, mi_row, mi_col, &this_mv);
+    if (update == 0) {
+      *rd = this_rd;
+      *mv = this_mv;
+      best_mv_mode = mv_mode;
+      update = 1;
+    } else {
+      if (this_rd < *rd) {
+        *rd = this_rd;
+        *mv = this_mv;
+        best_mv_mode = mv_mode;
+      }
+    }
+  }
+  return best_mv_mode;
+}
+
+static void predict_mv_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                            GF_PICTURE *gf_picture, MotionField *motion_field,
+                            int frame_idx, TplDepFrame *tpl_frame, int rf_idx,
+                            BLOCK_SIZE bsize, int mi_row, int mi_col) {
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  int tmp_mv_mode_arr[kMvPreCheckSize];
+  int *mv_mode_arr = tpl_frame->mv_mode_arr[rf_idx];
+  double *rd_diff_arr = tpl_frame->rd_diff_arr[rf_idx];
+  int_mv *select_mv_arr = cpi->select_mv_arr;
+  int_mv tmp_select_mv_arr[kMvPreCheckSize];
+  int stride = tpl_frame->stride;
+  double new_mv_rd = 0;
+  double no_new_mv_rd = 0;
+  double this_new_mv_rd = 0;
+  double this_no_new_mv_rd = 0;
+  int idx;
+  int tmp_idx;
+  assert(kMvPreCheckSize == (kMvPreCheckLines * (kMvPreCheckLines + 1)) >> 1);
+
+  // no new mv
+  // diagonal scan order
+  tmp_idx = 0;
+  for (idx = 0; idx < kMvPreCheckLines; ++idx) {
+    int r;
+    for (r = 0; r <= idx; ++r) {
+      int c = idx - r;
+      int nb_row = mi_row + r * mi_height;
+      int nb_col = mi_col + c * mi_width;
+      if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) {
+        double this_rd;
+        int_mv *mv = &select_mv_arr[nb_row * stride + nb_col];
+        mv_mode_arr[nb_row * stride + nb_col] = find_best_ref_mv_mode(
+            cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, rf_idx,
+            bsize, nb_row, nb_col, &this_rd, mv);
+        if (r == 0 && c == 0) {
+          this_no_new_mv_rd = this_rd;
+        }
+        no_new_mv_rd += this_rd;
+        tmp_mv_mode_arr[tmp_idx] = mv_mode_arr[nb_row * stride + nb_col];
+        tmp_select_mv_arr[tmp_idx] = select_mv_arr[nb_row * stride + nb_col];
+        ++tmp_idx;
+      }
+    }
+  }
+
+  // new mv
+  mv_mode_arr[mi_row * stride + mi_col] = NEW_MV_MODE;
+  this_new_mv_rd = eval_mv_mode(
+      NEW_MV_MODE, cpi, x, gf_picture, motion_field, frame_idx, tpl_frame,
+      rf_idx, bsize, mi_row, mi_col, &select_mv_arr[mi_row * stride + mi_col]);
+  new_mv_rd = this_new_mv_rd;
+  // We start from idx = 1 because idx = 0 is evaluated as NEW_MV_MODE
+  // beforehand.
+  for (idx = 1; idx < kMvPreCheckLines; ++idx) {
+    int r;
+    for (r = 0; r <= idx; ++r) {
+      int c = idx - r;
+      int nb_row = mi_row + r * mi_height;
+      int nb_col = mi_col + c * mi_width;
+      if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) {
+        double this_rd;
+        int_mv *mv = &select_mv_arr[nb_row * stride + nb_col];
+        mv_mode_arr[nb_row * stride + nb_col] = find_best_ref_mv_mode(
+            cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, rf_idx,
+            bsize, nb_row, nb_col, &this_rd, mv);
+        new_mv_rd += this_rd;
+      }
+    }
+  }
+
+  // update best_mv_mode
+  tmp_idx = 0;
+  if (no_new_mv_rd < new_mv_rd) {
+    for (idx = 0; idx < kMvPreCheckLines; ++idx) {
+      int r;
+      for (r = 0; r <= idx; ++r) {
+        int c = idx - r;
+        int nb_row = mi_row + r * mi_height;
+        int nb_col = mi_col + c * mi_width;
+        if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) {
+          mv_mode_arr[nb_row * stride + nb_col] = tmp_mv_mode_arr[tmp_idx];
+          select_mv_arr[nb_row * stride + nb_col] = tmp_select_mv_arr[tmp_idx];
+          ++tmp_idx;
+        }
+      }
+    }
+    rd_diff_arr[mi_row * stride + mi_col] = 0;
+  } else {
+    rd_diff_arr[mi_row * stride + mi_col] =
+        (no_new_mv_rd - this_no_new_mv_rd) - (new_mv_rd - this_new_mv_rd);
+  }
+}
+
+static void predict_mv_mode_arr(VP9_COMP *cpi, MACROBLOCK *x,
+                                GF_PICTURE *gf_picture,
+                                MotionField *motion_field, int frame_idx,
+                                TplDepFrame *tpl_frame, int rf_idx,
+                                BLOCK_SIZE bsize) {
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  const int unit_rows = tpl_frame->mi_rows / mi_height;
+  const int unit_cols = tpl_frame->mi_cols / mi_width;
+  const int max_diagonal_lines = unit_rows + unit_cols - 1;
+  int idx;
+  for (idx = 0; idx < max_diagonal_lines; ++idx) {
+    int r;
+    for (r = VPXMAX(idx - unit_cols + 1, 0); r <= VPXMIN(idx, unit_rows - 1);
+         ++r) {
+      int c = idx - r;
+      int mi_row = r * mi_height;
+      int mi_col = c * mi_width;
+      assert(c >= 0 && c < unit_cols);
+      assert(mi_row >= 0 && mi_row < tpl_frame->mi_rows);
+      assert(mi_col >= 0 && mi_col < tpl_frame->mi_cols);
+      predict_mv_mode(cpi, x, gf_picture, motion_field, frame_idx, tpl_frame,
+                      rf_idx, bsize, mi_row, mi_col);
+    }
+  }
+}
+
+static void do_motion_search(VP9_COMP *cpi, ThreadData *td,
+                             MotionField *motion_field, int frame_idx,
+                             YV12_BUFFER_CONFIG *ref_frame, BLOCK_SIZE bsize,
+                             int mi_row, int mi_col) {
+  VP9_COMMON *cm = &cpi->common;
+  MACROBLOCK *x = &td->mb;
+  MACROBLOCKD *xd = &x->e_mbd;
+  const int mb_y_offset =
+      mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
+  assert(ref_frame != NULL);
+  set_mv_limits(cm, x, mi_row, mi_col);
+  {
+    int_mv mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col);
+    uint8_t *cur_frame_buf = xd->cur_buf->y_buffer + mb_y_offset;
+    uint8_t *ref_frame_buf = ref_frame->y_buffer + mb_y_offset;
+    const int stride = xd->cur_buf->y_stride;
+    full_pixel_motion_search(cpi, td, motion_field, frame_idx, cur_frame_buf,
+                             ref_frame_buf, stride, bsize, mi_row, mi_col,
+                             &mv.as_mv);
+    sub_pixel_motion_search(cpi, td, cur_frame_buf, ref_frame_buf, stride,
+                            bsize, &mv.as_mv);
+    vp9_motion_field_mi_set_mv(motion_field, mi_row, mi_col, mv);
+  }
+}
+
+static void build_motion_field(
+    VP9_COMP *cpi, int frame_idx,
+    YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES], BLOCK_SIZE bsize) {
+  VP9_COMMON *cm = &cpi->common;
+  ThreadData *td = &cpi->td;
+  TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  const int pw = num_4x4_blocks_wide_lookup[bsize] << 2;
+  const int ph = num_4x4_blocks_high_lookup[bsize] << 2;
+  int mi_row, mi_col;
+  int rf_idx;
+
+  tpl_frame->lambda = (pw * ph) >> 2;
+  assert(pw * ph == tpl_frame->lambda << 2);
+
+  for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
+    MotionField *motion_field = vp9_motion_field_info_get_motion_field(
+        &cpi->motion_field_info, frame_idx, rf_idx, bsize);
+    if (ref_frame[rf_idx] == NULL) {
+      continue;
+    }
+    vp9_motion_field_reset_mvs(motion_field);
+    for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {
+      for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {
+        do_motion_search(cpi, td, motion_field, frame_idx, ref_frame[rf_idx],
+                         bsize, mi_row, mi_col);
+      }
+    }
+  }
+}
+#endif  // CONFIG_NON_GREEDY_MV
+
+static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture,
+                              int frame_idx, BLOCK_SIZE bsize) {
+  TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
+  VpxTplFrameStats *tpl_frame_stats_before_propagation =
+      &cpi->tpl_gop_stats.frame_stats_list[frame_idx];
+  YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame;
+  YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES] = { NULL, NULL, NULL };
+
+  VP9_COMMON *cm = &cpi->common;
+  struct scale_factors sf;
+  int rdmult, idx;
+  ThreadData *td = &cpi->td;
+  MACROBLOCK *x = &td->mb;
+  MACROBLOCKD *xd = &x->e_mbd;
+  int mi_row, mi_col;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED(16, uint16_t, predictor16[32 * 32 * 3]);
+  DECLARE_ALIGNED(16, uint8_t, predictor8[32 * 32 * 3]);
+  uint8_t *predictor;
+#else
+  DECLARE_ALIGNED(16, uint8_t, predictor[32 * 32 * 3]);
+#endif
+  DECLARE_ALIGNED(16, int16_t, src_diff[32 * 32]);
+  DECLARE_ALIGNED(16, tran_low_t, coeff[32 * 32]);
+  DECLARE_ALIGNED(16, tran_low_t, qcoeff[32 * 32]);
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
+
+  const TX_SIZE tx_size = max_txsize_lookup[bsize];
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+
+  tpl_frame_stats_before_propagation->frame_width = cm->width;
+  tpl_frame_stats_before_propagation->frame_height = cm->height;
+  // Setup scaling factor
+#if CONFIG_VP9_HIGHBITDEPTH
+  vp9_setup_scale_factors_for_frame(
+      &sf, this_frame->y_crop_width, this_frame->y_crop_height,
+      this_frame->y_crop_width, this_frame->y_crop_height,
+      cpi->common.use_highbitdepth);
+
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+    predictor = CONVERT_TO_BYTEPTR(predictor16);
+  else
+    predictor = predictor8;
+#else
+  vp9_setup_scale_factors_for_frame(
+      &sf, this_frame->y_crop_width, this_frame->y_crop_height,
+      this_frame->y_crop_width, this_frame->y_crop_height);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  // Prepare reference frame pointers. If any reference frame slot is
+  // unavailable, the pointer will be set to Null.
+  for (idx = 0; idx < MAX_INTER_REF_FRAMES; ++idx) {
+    int rf_idx = gf_picture[frame_idx].ref_frame[idx];
+    if (rf_idx != -1) ref_frame[idx] = gf_picture[rf_idx].frame;
+  }
+
+  xd->mi = cm->mi_grid_visible;
+  xd->mi[0] = cm->mi;
+  xd->cur_buf = this_frame;
+
+  // Get rd multiplier set up.
+  rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, tpl_frame->base_qindex);
+  set_error_per_bit(&cpi->td.mb, rdmult);
+  vp9_initialize_me_consts(cpi, &cpi->td.mb, tpl_frame->base_qindex);
+
+  tpl_frame->is_valid = 1;
+
+  cm->base_qindex = tpl_frame->base_qindex;
+  vp9_frame_init_quantizer(cpi);
+
+#if CONFIG_NON_GREEDY_MV
+  {
+    int square_block_idx;
+    int rf_idx;
+    for (square_block_idx = 0; square_block_idx < SQUARE_BLOCK_SIZES;
+         ++square_block_idx) {
+      BLOCK_SIZE square_bsize = square_block_idx_to_bsize(square_block_idx);
+      build_motion_field(cpi, frame_idx, ref_frame, square_bsize);
+    }
+    for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
+      int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx];
+      if (ref_frame_idx != -1) {
+        MotionField *motion_field = vp9_motion_field_info_get_motion_field(
+            &cpi->motion_field_info, frame_idx, rf_idx, bsize);
+        predict_mv_mode_arr(cpi, x, gf_picture, motion_field, frame_idx,
+                            tpl_frame, rf_idx, bsize);
+      }
+    }
+  }
+#endif  // CONFIG_NON_GREEDY_MV
+
+  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {
+      int64_t recon_error = 0;
+      int64_t rate_cost = 0;
+      int64_t sse = 0;
+      // Ref frame index in the ref frame buffer.
+      int ref_frame_idx = -1;
+      mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, tpl_frame,
+                      src_diff, coeff, qcoeff, dqcoeff, mi_row, mi_col, bsize,
+                      tx_size, ref_frame, predictor, &recon_error, &rate_cost,
+                      &sse, &ref_frame_idx);
+      // Motion flow dependency dispenser.
+      tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize,
+                      tpl_frame->stride);
+
+      tpl_store_before_propagation(
+          tpl_frame_stats_before_propagation->block_stats_list,
+          tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, tpl_frame->stride,
+          recon_error, rate_cost, ref_frame_idx);
+
+      tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col,
+                       bsize);
+    }
+  }
+}
+
+static void trim_tpl_stats(struct vpx_internal_error_info *error_info,
+                           VpxTplGopStats *tpl_gop_stats, int extra_frames) {
+  int i;
+  VpxTplFrameStats *new_frame_stats;
+  const int new_size = tpl_gop_stats->size - extra_frames;
+  if (tpl_gop_stats->size <= extra_frames)
+    vpx_internal_error(
+        error_info, VPX_CODEC_ERROR,
+        "The number of frames in VpxTplGopStats is fewer than expected.");
+  CHECK_MEM_ERROR(error_info, new_frame_stats,
+                  vpx_calloc(new_size, sizeof(*new_frame_stats)));
+  for (i = 0; i < new_size; i++) {
+    VpxTplFrameStats *frame_stats = &tpl_gop_stats->frame_stats_list[i];
+    const int num_blocks = frame_stats->num_blocks;
+    new_frame_stats[i].num_blocks = frame_stats->num_blocks;
+    new_frame_stats[i].frame_width = frame_stats->frame_width;
+    new_frame_stats[i].frame_height = frame_stats->frame_height;
+    new_frame_stats[i].num_blocks = num_blocks;
+    CHECK_MEM_ERROR(
+        error_info, new_frame_stats[i].block_stats_list,
+        vpx_calloc(num_blocks, sizeof(*new_frame_stats[i].block_stats_list)));
+    memcpy(new_frame_stats[i].block_stats_list, frame_stats->block_stats_list,
+           num_blocks * sizeof(*new_frame_stats[i].block_stats_list));
+  }
+  free_tpl_frame_stats_list(tpl_gop_stats);
+  tpl_gop_stats->size = new_size;
+  tpl_gop_stats->frame_stats_list = new_frame_stats;
+}
+
+#if CONFIG_NON_GREEDY_MV
+#define DUMP_TPL_STATS 0
+#if DUMP_TPL_STATS
+static void dump_buf(uint8_t *buf, int stride, int row, int col, int h, int w) {
+  int i, j;
+  printf("%d %d\n", h, w);
+  for (i = 0; i < h; ++i) {
+    for (j = 0; j < w; ++j) {
+      printf("%d ", buf[(row + i) * stride + col + j]);
+    }
+  }
+  printf("\n");
+}
+
+static void dump_frame_buf(const YV12_BUFFER_CONFIG *frame_buf) {
+  dump_buf(frame_buf->y_buffer, frame_buf->y_stride, 0, 0, frame_buf->y_height,
+           frame_buf->y_width);
+  dump_buf(frame_buf->u_buffer, frame_buf->uv_stride, 0, 0,
+           frame_buf->uv_height, frame_buf->uv_width);
+  dump_buf(frame_buf->v_buffer, frame_buf->uv_stride, 0, 0,
+           frame_buf->uv_height, frame_buf->uv_width);
+}
+
+static void dump_tpl_stats(const VP9_COMP *cpi, int tpl_group_frames,
+                           const GF_GROUP *gf_group,
+                           const GF_PICTURE *gf_picture, BLOCK_SIZE bsize) {
+  int frame_idx;
+  const VP9_COMMON *cm = &cpi->common;
+  int rf_idx;
+  for (frame_idx = 1; frame_idx < tpl_group_frames; ++frame_idx) {
+    for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
+      const TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
+      int mi_row, mi_col;
+      int ref_frame_idx;
+      const int mi_height = num_8x8_blocks_high_lookup[bsize];
+      const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+      ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx];
+      if (ref_frame_idx != -1) {
+        YV12_BUFFER_CONFIG *ref_frame_buf = gf_picture[ref_frame_idx].frame;
+        const int gf_frame_offset = gf_group->frame_gop_index[frame_idx];
+        const int ref_gf_frame_offset =
+            gf_group->frame_gop_index[ref_frame_idx];
+        printf("=\n");
+        printf(
+            "frame_idx %d mi_rows %d mi_cols %d bsize %d ref_frame_idx %d "
+            "rf_idx %d gf_frame_offset %d ref_gf_frame_offset %d\n",
+            frame_idx, cm->mi_rows, cm->mi_cols, mi_width * MI_SIZE,
+            ref_frame_idx, rf_idx, gf_frame_offset, ref_gf_frame_offset);
+        for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) {
+          for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) {
+            if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) {
+              int_mv mv = vp9_motion_field_info_get_mv(&cpi->motion_field_info,
+                                                       frame_idx, rf_idx, bsize,
+                                                       mi_row, mi_col);
+              printf("%d %d %d %d\n", mi_row, mi_col, mv.as_mv.row,
+                     mv.as_mv.col);
+            }
+          }
+        }
+        for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) {
+          for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) {
+            if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) {
+              const TplDepStats *tpl_ptr =
+                  &tpl_frame
+                       ->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col];
+              printf("%f ", tpl_ptr->feature_score);
+            }
+          }
+        }
+        printf("\n");
+
+        for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {
+          for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {
+            const int mv_mode =
+                tpl_frame
+                    ->mv_mode_arr[rf_idx][mi_row * tpl_frame->stride + mi_col];
+            printf("%d ", mv_mode);
+          }
+        }
+        printf("\n");
+
+        dump_frame_buf(gf_picture[frame_idx].frame);
+        dump_frame_buf(ref_frame_buf);
+      }
+    }
+  }
+}
+#endif  // DUMP_TPL_STATS
+#endif  // CONFIG_NON_GREEDY_MV
+
+void vp9_init_tpl_buffer(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+  int frame;
+
+  const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+  const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows);
+#if CONFIG_NON_GREEDY_MV
+  int rf_idx;
+
+  vpx_free(cpi->select_mv_arr);
+  CHECK_MEM_ERROR(
+      &cm->error, cpi->select_mv_arr,
+      vpx_calloc(mi_rows * mi_cols * 4, sizeof(*cpi->select_mv_arr)));
+#endif
+
+  // TODO(jingning): Reduce the actual memory use for tpl model build up.
+  for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) {
+    if (cpi->tpl_stats[frame].width >= mi_cols &&
+        cpi->tpl_stats[frame].height >= mi_rows &&
+        cpi->tpl_stats[frame].tpl_stats_ptr)
+      continue;
+
+#if CONFIG_NON_GREEDY_MV
+    for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
+      vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]);
+      CHECK_MEM_ERROR(
+          &cm->error, cpi->tpl_stats[frame].mv_mode_arr[rf_idx],
+          vpx_calloc(mi_rows * mi_cols * 4,
+                     sizeof(*cpi->tpl_stats[frame].mv_mode_arr[rf_idx])));
+      vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]);
+      CHECK_MEM_ERROR(
+          &cm->error, cpi->tpl_stats[frame].rd_diff_arr[rf_idx],
+          vpx_calloc(mi_rows * mi_cols * 4,
+                     sizeof(*cpi->tpl_stats[frame].rd_diff_arr[rf_idx])));
+    }
+#endif
+    vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr);
+    CHECK_MEM_ERROR(&cm->error, cpi->tpl_stats[frame].tpl_stats_ptr,
+                    vpx_calloc(mi_rows * mi_cols,
+                               sizeof(*cpi->tpl_stats[frame].tpl_stats_ptr)));
+    cpi->tpl_stats[frame].is_valid = 0;
+    cpi->tpl_stats[frame].width = mi_cols;
+    cpi->tpl_stats[frame].height = mi_rows;
+    cpi->tpl_stats[frame].stride = mi_cols;
+    cpi->tpl_stats[frame].mi_rows = cm->mi_rows;
+    cpi->tpl_stats[frame].mi_cols = cm->mi_cols;
+  }
+
+  for (frame = 0; frame < REF_FRAMES; ++frame) {
+    cpi->enc_frame_buf[frame].mem_valid = 0;
+    cpi->enc_frame_buf[frame].released = 1;
+  }
+}
+
+void vp9_free_tpl_buffer(VP9_COMP *cpi) {
+  int frame;
+#if CONFIG_NON_GREEDY_MV
+  vp9_free_motion_field_info(&cpi->motion_field_info);
+  vpx_free(cpi->select_mv_arr);
+#endif
+  for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) {
+#if CONFIG_NON_GREEDY_MV
+    int rf_idx;
+    for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
+      vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]);
+      vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]);
+    }
+#endif
+    vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr);
+    cpi->tpl_stats[frame].is_valid = 0;
+  }
+  free_tpl_frame_stats_list(&cpi->tpl_gop_stats);
+}
+
+#if CONFIG_RATE_CTRL
+static void accumulate_frame_tpl_stats(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+  int show_frame_count = 0;
+  int frame_idx;
+  // Accumulate tpl stats for each frame in the current group of picture.
+  for (frame_idx = 1; frame_idx < gf_group->gf_group_size; ++frame_idx) {
+    TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
+    TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+    const int tpl_stride = tpl_frame->stride;
+    int64_t intra_cost_base = 0;
+    int64_t inter_cost_base = 0;
+    int64_t mc_dep_cost_base = 0;
+    int64_t mc_ref_cost_base = 0;
+    int64_t mc_flow_base = 0;
+    int row, col;
+
+    if (!tpl_frame->is_valid) continue;
+
+    for (row = 0; row < cm->mi_rows && tpl_frame->is_valid; ++row) {
+      for (col = 0; col < cm->mi_cols; ++col) {
+        TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col];
+        intra_cost_base += this_stats->intra_cost;
+        inter_cost_base += this_stats->inter_cost;
+        mc_dep_cost_base += this_stats->mc_dep_cost;
+        mc_ref_cost_base += this_stats->mc_ref_cost;
+        mc_flow_base += this_stats->mc_flow;
+      }
+    }
+
+    cpi->tpl_stats_info[show_frame_count].intra_cost = intra_cost_base;
+    cpi->tpl_stats_info[show_frame_count].inter_cost = inter_cost_base;
+    cpi->tpl_stats_info[show_frame_count].mc_dep_cost = mc_dep_cost_base;
+    cpi->tpl_stats_info[show_frame_count].mc_ref_cost = mc_ref_cost_base;
+    cpi->tpl_stats_info[show_frame_count].mc_flow = mc_flow_base;
+
+    ++show_frame_count;
+  }
+}
+#endif  // CONFIG_RATE_CTRL
+
+void vp9_setup_tpl_stats(VP9_COMP *cpi) {
+  GF_PICTURE gf_picture[MAX_ARF_GOP_SIZE];
+  const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+  int tpl_group_frames = 0;
+  int frame_idx;
+  int extended_frame_count;
+  cpi->tpl_bsize = BLOCK_32X32;
+
+  extended_frame_count =
+      init_gop_frames(cpi, gf_picture, gf_group, &tpl_group_frames);
+
+  init_tpl_stats(cpi);
+
+  init_tpl_stats_before_propagation(&cpi->common.error, &cpi->tpl_gop_stats,
+                                    cpi->tpl_stats, tpl_group_frames,
+                                    cpi->common.width, cpi->common.height);
+
+  // Backward propagation from tpl_group_frames to 1.
+  for (frame_idx = tpl_group_frames - 1; frame_idx > 0; --frame_idx) {
+    if (gf_picture[frame_idx].update_type == USE_BUF_FRAME) continue;
+    mc_flow_dispenser(cpi, gf_picture, frame_idx, cpi->tpl_bsize);
+  }
+
+  // TPL stats has extra frames from next GOP. Trim those extra frames for
+  // Qmode.
+  trim_tpl_stats(&cpi->common.error, &cpi->tpl_gop_stats, extended_frame_count);
+
+  if (cpi->ext_ratectrl.ready &&
+      cpi->ext_ratectrl.funcs.send_tpl_gop_stats != NULL) {
+    const vpx_codec_err_t codec_status =
+        vp9_extrc_send_tpl_stats(&cpi->ext_ratectrl, &cpi->tpl_gop_stats);
+    if (codec_status != VPX_CODEC_OK) {
+      vpx_internal_error(&cpi->common.error, codec_status,
+                         "vp9_extrc_send_tpl_stats() failed");
+    }
+  }
+
+#if CONFIG_NON_GREEDY_MV
+  cpi->tpl_ready = 1;
+#if DUMP_TPL_STATS
+  dump_tpl_stats(cpi, tpl_group_frames, gf_group, gf_picture, cpi->tpl_bsize);
+#endif  // DUMP_TPL_STATS
+#endif  // CONFIG_NON_GREEDY_MV
+
+#if CONFIG_RATE_CTRL
+  if (cpi->oxcf.use_simple_encode_api) {
+    accumulate_frame_tpl_stats(cpi);
+  }
+#endif  // CONFIG_RATE_CTRL
+}
diff --git a/vp9/encoder/vp9_tpl_model.h b/vp9/encoder/vp9_tpl_model.h
new file mode 100644
index 0000000000..04beb22610
--- /dev/null
+++ b/vp9/encoder/vp9_tpl_model.h
@@ -0,0 +1,46 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_TPL_MODEL_H_
+#define VPX_VP9_ENCODER_VP9_TPL_MODEL_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef M_LOG2_E
+#define M_LOG2_E 0.693147180559945309417
+#endif
+#define log2f(x) (log(x) / (float)M_LOG2_E)
+
+#define TPL_DEP_COST_SCALE_LOG2 4
+
+typedef struct GF_PICTURE {
+  YV12_BUFFER_CONFIG *frame;
+  int ref_frame[3];
+  FRAME_UPDATE_TYPE update_type;
+} GF_PICTURE;
+
+void vp9_init_tpl_buffer(VP9_COMP *cpi);
+void vp9_setup_tpl_stats(VP9_COMP *cpi);
+void vp9_free_tpl_buffer(VP9_COMP *cpi);
+
+void vp9_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
+                      TX_SIZE tx_size);
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
+                             TX_SIZE tx_size);
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_VP9_ENCODER_VP9_TPL_MODEL_H_
diff --git a/vp9/encoder/x86/highbd_temporal_filter_sse4.c b/vp9/encoder/x86/highbd_temporal_filter_sse4.c
index 4fa24512c5..97f182c660 100644
--- a/vp9/encoder/x86/highbd_temporal_filter_sse4.c
+++ b/vp9/encoder/x86/highbd_temporal_filter_sse4.c
@@ -16,7 +16,7 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_temporal_filter.h"
-#include "vp9/encoder/x86/temporal_filter_constants.h"
+#include "vp9/encoder/vp9_temporal_filter_constants.h"
 
 // Compute (a-b)**2 for 8 pixels with size 16-bit
 static INLINE void highbd_store_dist_8(const uint16_t *a, const uint16_t *b,
@@ -141,11 +141,12 @@ static INLINE void highbd_accumulate_and_store_8(const __m128i sum_first_u32,
   count_u16 = _mm_adds_epu16(count_u16, sum_u16);
   _mm_storeu_si128((__m128i *)count, count_u16);
 
-  pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16);
-
   pred_0_u32 = _mm_cvtepu16_epi32(pred_u16);
   pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero);
 
+  pred_0_u32 = _mm_mullo_epi32(sum_first_u32, pred_0_u32);
+  pred_1_u32 = _mm_mullo_epi32(sum_second_u32, pred_1_u32);
+
   accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
   accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
 
@@ -191,13 +192,11 @@ static INLINE void highbd_read_chroma_dist_row_8(
 }
 
 static void vp9_highbd_apply_temporal_filter_luma_8(
-    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
-    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
-    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, int use_whole_blk, uint32_t *y_accum,
-    uint16_t *y_count, const uint32_t *y_dist, const uint32_t *u_dist,
-    const uint32_t *v_dist, const uint32_t *const *neighbors_first,
+    const uint16_t *y_pre, int y_pre_stride, unsigned int block_width,
+    unsigned int block_height, int ss_x, int ss_y, int strength,
+    int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+    const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist,
+    const uint32_t *const *neighbors_first,
     const uint32_t *const *neighbors_second, int top_weight,
     int bottom_weight) {
   const int rounding = (1 << strength) >> 1;
@@ -256,17 +255,12 @@ static void vp9_highbd_apply_temporal_filter_luma_8(
   highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
                                 y_accum);
 
-  y_src += y_src_stride;
   y_pre += y_pre_stride;
   y_count += y_pre_stride;
   y_accum += y_pre_stride;
   y_dist += DIST_STRIDE;
 
-  u_src += uv_src_stride;
-  u_pre += uv_pre_stride;
   u_dist += DIST_STRIDE;
-  v_src += uv_src_stride;
-  v_pre += uv_pre_stride;
   v_dist += DIST_STRIDE;
 
   // Then all the rows except the last one
@@ -300,11 +294,7 @@ static void vp9_highbd_apply_temporal_filter_luma_8(
       highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
                                     &v_first, &v_second);
 
-      u_src += uv_src_stride;
-      u_pre += uv_pre_stride;
       u_dist += DIST_STRIDE;
-      v_src += uv_src_stride;
-      v_pre += uv_pre_stride;
       v_dist += DIST_STRIDE;
     }
 
@@ -320,7 +310,6 @@ static void vp9_highbd_apply_temporal_filter_luma_8(
     highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
                                   y_accum);
 
-    y_src += y_src_stride;
     y_pre += y_pre_stride;
     y_count += y_pre_stride;
     y_accum += y_pre_stride;
@@ -364,13 +353,10 @@ static void vp9_highbd_apply_temporal_filter_luma_8(
 
 // Perform temporal filter for the luma component.
 static void vp9_highbd_apply_temporal_filter_luma(
-    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
-    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
-    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
-    uint32_t *y_accum, uint16_t *y_count, const uint32_t *y_dist,
-    const uint32_t *u_dist, const uint32_t *v_dist) {
+    const uint16_t *y_pre, int y_pre_stride, unsigned int block_width,
+    unsigned int block_height, int ss_x, int ss_y, int strength,
+    const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+    const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) {
   unsigned int blk_col = 0, uv_blk_col = 0;
   const unsigned int blk_col_step = 8, uv_blk_col_step = 8 >> ss_x;
   const unsigned int mid_width = block_width >> 1,
@@ -384,9 +370,7 @@ static void vp9_highbd_apply_temporal_filter_luma(
   neighbors_first = HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS;
   neighbors_second = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS;
   vp9_highbd_apply_temporal_filter_luma_8(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+      y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
       strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
       y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
       neighbors_first, neighbors_second, top_weight, bottom_weight);
@@ -399,13 +383,10 @@ static void vp9_highbd_apply_temporal_filter_luma(
   for (; blk_col < mid_width;
        blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
     vp9_highbd_apply_temporal_filter_luma_8(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, blk_col_step,
-        block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
-        y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
-        v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
-        bottom_weight);
+        y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+        strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+        neighbors_first, neighbors_second, top_weight, bottom_weight);
   }
 
   if (!use_whole_blk) {
@@ -417,21 +398,16 @@ static void vp9_highbd_apply_temporal_filter_luma(
   for (; blk_col < last_width;
        blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
     vp9_highbd_apply_temporal_filter_luma_8(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, blk_col_step,
-        block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
-        y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
-        v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
-        bottom_weight);
+        y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+        strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+        neighbors_first, neighbors_second, top_weight, bottom_weight);
   }
 
   // Right
   neighbors_second = HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS;
   vp9_highbd_apply_temporal_filter_luma_8(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+      y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
       strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
       y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
       neighbors_first, neighbors_second, top_weight, bottom_weight);
@@ -491,13 +467,11 @@ static INLINE void highbd_add_luma_dist_to_8_chroma_mod(
 // blk_fw as an array of size 4 for the weights for each of the 4 subblocks,
 // else use top_weight for top half, and bottom weight for bottom half.
 static void vp9_highbd_apply_temporal_filter_chroma_8(
-    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
-    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
-    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
-    int uv_pre_stride, unsigned int uv_block_width,
-    unsigned int uv_block_height, int ss_x, int ss_y, int strength,
-    uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
-    const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist,
+    const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride,
+    unsigned int uv_block_width, unsigned int uv_block_height, int ss_x,
+    int ss_y, int strength, uint32_t *u_accum, uint16_t *u_count,
+    uint32_t *v_accum, uint16_t *v_count, const uint32_t *y_dist,
+    const uint32_t *u_dist, const uint32_t *v_dist,
     const uint32_t *const *neighbors_fst, const uint32_t *const *neighbors_snd,
     int top_weight, int bottom_weight, const int *blk_fw) {
   const int rounding = (1 << strength) >> 1;
@@ -565,10 +539,8 @@ static void vp9_highbd_apply_temporal_filter_chroma_8(
   highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
                                 v_accum);
 
-  u_src += uv_src_stride;
   u_pre += uv_pre_stride;
   u_dist += DIST_STRIDE;
-  v_src += uv_src_stride;
   v_pre += uv_pre_stride;
   v_dist += DIST_STRIDE;
   u_count += uv_pre_stride;
@@ -576,8 +548,6 @@ static void vp9_highbd_apply_temporal_filter_chroma_8(
   v_count += uv_pre_stride;
   v_accum += uv_pre_stride;
 
-  y_src += y_src_stride * (1 + ss_y);
-  y_pre += y_pre_stride * (1 + ss_y);
   y_dist += DIST_STRIDE * (1 + ss_y);
 
   // Then all the rows except the last one
@@ -649,10 +619,8 @@ static void vp9_highbd_apply_temporal_filter_chroma_8(
     highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
                                   v_accum);
 
-    u_src += uv_src_stride;
     u_pre += uv_pre_stride;
     u_dist += DIST_STRIDE;
-    v_src += uv_src_stride;
     v_pre += uv_pre_stride;
     v_dist += DIST_STRIDE;
     u_count += uv_pre_stride;
@@ -660,8 +628,6 @@ static void vp9_highbd_apply_temporal_filter_chroma_8(
     v_count += uv_pre_stride;
     v_accum += uv_pre_stride;
 
-    y_src += y_src_stride * (1 + ss_y);
-    y_pre += y_pre_stride * (1 + ss_y);
     y_dist += DIST_STRIDE * (1 + ss_y);
   }
 
@@ -720,12 +686,10 @@ static void vp9_highbd_apply_temporal_filter_chroma_8(
 
 // Perform temporal filter for the chroma components.
 static void vp9_highbd_apply_temporal_filter_chroma(
-    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
-    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
-    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
-    uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+    const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride,
+    unsigned int block_width, unsigned int block_height, int ss_x, int ss_y,
+    int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum,
+    uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
     const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) {
   const unsigned int uv_width = block_width >> ss_x,
                      uv_height = block_height >> ss_y;
@@ -755,8 +719,6 @@ static void vp9_highbd_apply_temporal_filter_chroma(
 
     if (use_whole_blk) {
       vp9_highbd_apply_temporal_filter_chroma_8(
-          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
           u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
           uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
           u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
@@ -764,8 +726,6 @@ static void vp9_highbd_apply_temporal_filter_chroma(
           neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
     } else {
       vp9_highbd_apply_temporal_filter_chroma_8(
-          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
           u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
           uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
           u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
@@ -789,13 +749,11 @@ static void vp9_highbd_apply_temporal_filter_chroma(
   }
 
   vp9_highbd_apply_temporal_filter_chroma_8(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
-      strength, u_accum + uv_blk_col, u_count + uv_blk_col,
-      v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
-      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, neighbors_snd,
-      top_weight, bottom_weight, NULL);
+      u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+      uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+      u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+      y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst,
+      neighbors_snd, top_weight, bottom_weight, NULL);
 
   blk_col += blk_col_step;
   uv_blk_col += uv_blk_col_step;
@@ -812,8 +770,6 @@ static void vp9_highbd_apply_temporal_filter_chroma(
   for (; uv_blk_col < uv_mid_width;
        blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
     vp9_highbd_apply_temporal_filter_chroma_8(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
         u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
         uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
         u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
@@ -830,8 +786,6 @@ static void vp9_highbd_apply_temporal_filter_chroma(
   for (; uv_blk_col < uv_last_width;
        blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
     vp9_highbd_apply_temporal_filter_chroma_8(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
         u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
         uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
         u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
@@ -849,13 +803,11 @@ static void vp9_highbd_apply_temporal_filter_chroma(
   }
 
   vp9_highbd_apply_temporal_filter_chroma_8(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
-      strength, u_accum + uv_blk_col, u_count + uv_blk_col,
-      v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
-      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, neighbors_snd,
-      top_weight, bottom_weight, NULL);
+      u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+      uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+      u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+      y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst,
+      neighbors_snd, top_weight, bottom_weight, NULL);
 }
 
 void vp9_highbd_apply_temporal_filter_sse4_1(
@@ -929,14 +881,12 @@ void vp9_highbd_apply_temporal_filter_sse4_1(
   u_dist_ptr = u_dist + 1;
   v_dist_ptr = v_dist + 1;
 
-  vp9_highbd_apply_temporal_filter_luma(
-      y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride,
-      u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y,
-      strength, blk_fw, use_whole_blk, y_accum, y_count, y_dist_ptr, u_dist_ptr,
-      v_dist_ptr);
+  vp9_highbd_apply_temporal_filter_luma(y_pre, y_pre_stride, block_width,
+                                        block_height, ss_x, ss_y, strength,
+                                        blk_fw, use_whole_blk, y_accum, y_count,
+                                        y_dist_ptr, u_dist_ptr, v_dist_ptr);
 
   vp9_highbd_apply_temporal_filter_chroma(
-      y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride,
       u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y,
       strength, blk_fw, use_whole_blk, u_accum, u_count, v_accum, v_count,
       y_dist_ptr, u_dist_ptr, v_dist_ptr);
diff --git a/vp9/encoder/x86/temporal_filter_sse4.c b/vp9/encoder/x86/temporal_filter_sse4.c
index 437f49f5a0..7571bfccac 100644
--- a/vp9/encoder/x86/temporal_filter_sse4.c
+++ b/vp9/encoder/x86/temporal_filter_sse4.c
@@ -16,7 +16,7 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_temporal_filter.h"
-#include "vp9/encoder/x86/temporal_filter_constants.h"
+#include "vp9/encoder/vp9_temporal_filter_constants.h"
 
 // Read in 8 pixels from a and b as 8-bit unsigned integers, compute the
 // difference squared, and store as unsigned 16-bit integer to dst.
@@ -270,13 +270,11 @@ static INLINE void add_luma_dist_to_8_chroma_mod(const uint16_t *y_dist,
 // size 4 for the weights for each of the 4 subblocks if blk_fw is not NULL,
 // else use top_weight for top half, and bottom weight for bottom half.
 static void vp9_apply_temporal_filter_luma_16(
-    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
-    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
-    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, int use_whole_blk, uint32_t *y_accum,
-    uint16_t *y_count, const uint16_t *y_dist, const uint16_t *u_dist,
-    const uint16_t *v_dist, const int16_t *const *neighbors_first,
+    const uint8_t *y_pre, int y_pre_stride, unsigned int block_width,
+    unsigned int block_height, int ss_x, int ss_y, int strength,
+    int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+    const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist,
+    const int16_t *const *neighbors_first,
     const int16_t *const *neighbors_second, int top_weight, int bottom_weight,
     const int *blk_fw) {
   const int rounding = (1 << strength) >> 1;
@@ -301,7 +299,6 @@ static void vp9_apply_temporal_filter_luma_16(
   assert(strength <= 6);
 
   assert(block_width == 16);
-
   (void)block_width;
 
   // Initialize the weights
@@ -342,17 +339,12 @@ static void vp9_apply_temporal_filter_luma_16(
   accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
                           y_accum);
 
-  y_src += y_src_stride;
   y_pre += y_pre_stride;
   y_count += y_pre_stride;
   y_accum += y_pre_stride;
   y_dist += DIST_STRIDE;
 
-  u_src += uv_src_stride;
-  u_pre += uv_pre_stride;
   u_dist += DIST_STRIDE;
-  v_src += uv_src_stride;
-  v_pre += uv_pre_stride;
   v_dist += DIST_STRIDE;
 
   // Then all the rows except the last one
@@ -392,11 +384,7 @@ static void vp9_apply_temporal_filter_luma_16(
       read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second,
                               &v_first, &v_second);
 
-      u_src += uv_src_stride;
-      u_pre += uv_pre_stride;
       u_dist += DIST_STRIDE;
-      v_src += uv_src_stride;
-      v_pre += uv_pre_stride;
       v_dist += DIST_STRIDE;
     }
 
@@ -413,7 +401,6 @@ static void vp9_apply_temporal_filter_luma_16(
     accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
                             y_accum);
 
-    y_src += y_src_stride;
     y_pre += y_pre_stride;
     y_count += y_pre_stride;
     y_accum += y_pre_stride;
@@ -458,13 +445,10 @@ static void vp9_apply_temporal_filter_luma_16(
 
 // Perform temporal filter for the luma component.
 static void vp9_apply_temporal_filter_luma(
-    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
-    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
-    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
-    uint32_t *y_accum, uint16_t *y_count, const uint16_t *y_dist,
-    const uint16_t *u_dist, const uint16_t *v_dist) {
+    const uint8_t *y_pre, int y_pre_stride, unsigned int block_width,
+    unsigned int block_height, int ss_x, int ss_y, int strength,
+    const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+    const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) {
   unsigned int blk_col = 0, uv_blk_col = 0;
   const unsigned int blk_col_step = 16, uv_blk_col_step = 16 >> ss_x;
   const unsigned int mid_width = block_width >> 1,
@@ -476,27 +460,22 @@ static void vp9_apply_temporal_filter_luma(
 
   if (block_width == 16) {
     // Special Case: The blockwidth is 16 and we are operating on a row of 16
-    // chroma pixels. In this case, we can't use the usualy left-midle-right
+    // chroma pixels. In this case, we can't use the usual left-middle-right
     // pattern. We also don't support splitting now.
     neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS;
     neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS;
     if (use_whole_blk) {
       vp9_apply_temporal_filter_luma_16(
-          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16,
-          block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
-          y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
-          v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
-          bottom_weight, NULL);
+          y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+          use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+          u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+          neighbors_second, top_weight, bottom_weight, NULL);
     } else {
       vp9_apply_temporal_filter_luma_16(
-          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16,
-          block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
-          y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
-          v_dist + uv_blk_col, neighbors_first, neighbors_second, 0, 0, blk_fw);
+          y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+          use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+          u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+          neighbors_second, 0, 0, blk_fw);
     }
 
     return;
@@ -506,9 +485,7 @@ static void vp9_apply_temporal_filter_luma(
   neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS;
   neighbors_second = LUMA_MIDDLE_COLUMN_NEIGHBORS;
   vp9_apply_temporal_filter_luma_16(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, 16, block_height, ss_x, ss_y, strength,
+      y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
       use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
       u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
       neighbors_second, top_weight, bottom_weight, NULL);
@@ -521,13 +498,10 @@ static void vp9_apply_temporal_filter_luma(
   for (; blk_col < mid_width;
        blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
     vp9_apply_temporal_filter_luma_16(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, block_height,
-        ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
-        y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
-        v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
-        bottom_weight, NULL);
+        y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+        use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+        u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+        neighbors_second, top_weight, bottom_weight, NULL);
   }
 
   if (!use_whole_blk) {
@@ -539,21 +513,16 @@ static void vp9_apply_temporal_filter_luma(
   for (; blk_col < last_width;
        blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
     vp9_apply_temporal_filter_luma_16(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, block_height,
-        ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
-        y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
-        v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
-        bottom_weight, NULL);
+        y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+        use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+        u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+        neighbors_second, top_weight, bottom_weight, NULL);
   }
 
   // Right
   neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS;
   vp9_apply_temporal_filter_luma_16(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, 16, block_height, ss_x, ss_y, strength,
+      y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
       use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
       u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
       neighbors_second, top_weight, bottom_weight, NULL);
@@ -564,10 +533,7 @@ static void vp9_apply_temporal_filter_luma(
 // blk_fw as an array of size 4 for the weights for each of the 4 subblocks,
 // else use top_weight for top half, and bottom weight for bottom half.
 static void vp9_apply_temporal_filter_chroma_8(
-    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
-    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
-    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
-    int uv_pre_stride, unsigned int uv_block_width,
+    const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride,
     unsigned int uv_block_height, int ss_x, int ss_y, int strength,
     uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
     const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist,
@@ -587,9 +553,7 @@ static void vp9_apply_temporal_filter_chroma_8(
   // Loop variable
   unsigned int h;
 
-  (void)uv_block_width;
-
-  // Initilize weight
+  // Initialize weight
   if (blk_fw) {
     weight = _mm_setr_epi16(blk_fw[0], blk_fw[0], blk_fw[0], blk_fw[0],
                             blk_fw[1], blk_fw[1], blk_fw[1], blk_fw[1]);
@@ -621,10 +585,8 @@ static void vp9_apply_temporal_filter_chroma_8(
   accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
   accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
 
-  u_src += uv_src_stride;
   u_pre += uv_pre_stride;
   u_dist += DIST_STRIDE;
-  v_src += uv_src_stride;
   v_pre += uv_pre_stride;
   v_dist += DIST_STRIDE;
   u_count += uv_pre_stride;
@@ -632,8 +594,6 @@ static void vp9_apply_temporal_filter_chroma_8(
   v_count += uv_pre_stride;
   v_accum += uv_pre_stride;
 
-  y_src += y_src_stride * (1 + ss_y);
-  y_pre += y_pre_stride * (1 + ss_y);
   y_dist += DIST_STRIDE * (1 + ss_y);
 
   // Then all the rows except the last one
@@ -676,10 +636,8 @@ static void vp9_apply_temporal_filter_chroma_8(
     accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
     accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
 
-    u_src += uv_src_stride;
     u_pre += uv_pre_stride;
     u_dist += DIST_STRIDE;
-    v_src += uv_src_stride;
     v_pre += uv_pre_stride;
     v_dist += DIST_STRIDE;
     u_count += uv_pre_stride;
@@ -687,8 +645,6 @@ static void vp9_apply_temporal_filter_chroma_8(
     v_count += uv_pre_stride;
     v_accum += uv_pre_stride;
 
-    y_src += y_src_stride * (1 + ss_y);
-    y_pre += y_pre_stride * (1 + ss_y);
     y_dist += DIST_STRIDE * (1 + ss_y);
   }
 
@@ -719,12 +675,10 @@ static void vp9_apply_temporal_filter_chroma_8(
 
 // Perform temporal filter for the chroma components.
 static void vp9_apply_temporal_filter_chroma(
-    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
-    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
-    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
-    uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+    const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride,
+    unsigned int block_width, unsigned int block_height, int ss_x, int ss_y,
+    int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum,
+    uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
     const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) {
   const unsigned int uv_width = block_width >> ss_x,
                      uv_height = block_height >> ss_y;
@@ -751,22 +705,17 @@ static void vp9_apply_temporal_filter_chroma(
 
     if (use_whole_blk) {
       vp9_apply_temporal_filter_chroma_8(
-          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
-          uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
-          u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
-          y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors,
-          top_weight, bottom_weight, NULL);
+          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height,
+          ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+          v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+          u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+          bottom_weight, NULL);
     } else {
       vp9_apply_temporal_filter_chroma_8(
-          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
-          uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
-          u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
-          y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors,
-          0, 0, blk_fw);
+          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height,
+          ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+          v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+          u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, 0, 0, blk_fw);
     }
 
     return;
@@ -782,10 +731,8 @@ static void vp9_apply_temporal_filter_chroma(
   }
 
   vp9_apply_temporal_filter_chroma_8(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
-      strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+      u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x,
+      ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
       v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
       u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
       bottom_weight, NULL);
@@ -805,13 +752,11 @@ static void vp9_apply_temporal_filter_chroma(
   for (; uv_blk_col < uv_mid_width;
        blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
     vp9_apply_temporal_filter_chroma_8(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
-        uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
-        u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
-        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors,
-        top_weight, bottom_weight, NULL);
+        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x,
+        ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+        v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+        u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+        bottom_weight, NULL);
   }
 
   if (!use_whole_blk) {
@@ -823,13 +768,11 @@ static void vp9_apply_temporal_filter_chroma(
   for (; uv_blk_col < uv_last_width;
        blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
     vp9_apply_temporal_filter_chroma_8(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
-        uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
-        u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
-        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors,
-        top_weight, bottom_weight, NULL);
+        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x,
+        ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+        v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+        u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+        bottom_weight, NULL);
   }
 
   // Right
@@ -842,10 +785,8 @@ static void vp9_apply_temporal_filter_chroma(
   }
 
   vp9_apply_temporal_filter_chroma_8(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
-      strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+      u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x,
+      ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
       v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
       u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
       bottom_weight, NULL);
@@ -886,12 +827,12 @@ void vp9_apply_temporal_filter_sse4_1(
   assert(
       (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) &&
       "subblock filter weight must be positive");
-  assert(blk_fw[0] <= 2 && "sublock filter weight must be less than 2");
+  assert(blk_fw[0] <= 2 && "subblock filter weight must be less than 2");
   assert(
       (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) &&
       "subblock filter weight must be less than 2");
 
-  // Precompute the difference sqaured
+  // Precompute the difference squared
   for (row = 0; row < block_height; row++) {
     for (blk_col = 0; blk_col < block_width; blk_col += 16) {
       store_dist_16(y_src_ptr + blk_col, y_pre_ptr + blk_col,
@@ -922,14 +863,12 @@ void vp9_apply_temporal_filter_sse4_1(
   u_dist_ptr = u_dist + 1;
   v_dist_ptr = v_dist + 1;
 
-  vp9_apply_temporal_filter_luma(
-      y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride,
-      u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y,
-      strength, blk_fw_ptr, use_whole_blk, y_accum, y_count, y_dist_ptr,
-      u_dist_ptr, v_dist_ptr);
+  vp9_apply_temporal_filter_luma(y_pre, y_pre_stride, block_width, block_height,
+                                 ss_x, ss_y, strength, blk_fw_ptr,
+                                 use_whole_blk, y_accum, y_count, y_dist_ptr,
+                                 u_dist_ptr, v_dist_ptr);
 
   vp9_apply_temporal_filter_chroma(
-      y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride,
       u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y,
       strength, blk_fw_ptr, use_whole_blk, u_accum, u_count, v_accum, v_count,
       y_dist_ptr, u_dist_ptr, v_dist_ptr);
diff --git a/vp9/encoder/x86/vp9_dct_intrin_sse2.c b/vp9/encoder/x86/vp9_dct_intrin_sse2.c
index 2188903b17..e9943447fd 100644
--- a/vp9/encoder/x86/vp9_dct_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_intrin_sse2.c
@@ -111,7 +111,7 @@ static void fadst4_sse2(__m128i *in) {
   const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
   const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
   const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
-  const __m128i kZero = _mm_set1_epi16(0);
+  const __m128i kZero = _mm_setzero_si128();
   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
   __m128i u[8], v[8];
   __m128i in7 = _mm_add_epi16(in[0], in[1]);
@@ -424,7 +424,7 @@ static void fadst8_sse2(__m128i *in) {
   const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
-  const __m128i k__const_0 = _mm_set1_epi16(0);
+  const __m128i k__const_0 = _mm_setzero_si128();
   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
 
   __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
@@ -1056,7 +1056,7 @@ static void fadst16_8col(__m128i *in) {
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i kZero = _mm_set1_epi16(0);
+  const __m128i kZero = _mm_setzero_si128();
 
   u[0] = _mm_unpacklo_epi16(in[15], in[0]);
   u[1] = _mm_unpackhi_epi16(in[15], in[0]);
diff --git a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
deleted file mode 100644
index 4be6a5ea02..0000000000
--- a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
+++ /dev/null
@@ -1,310 +0,0 @@
-/*
- *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#if defined(_MSC_VER)
-#include <intrin.h>
-#endif
-#include <emmintrin.h>
-#include <smmintrin.h>
-
-#include "vpx_dsp/vpx_dsp_common.h"
-#include "vp9/encoder/vp9_encoder.h"
-#include "vpx_ports/mem.h"
-
-#ifdef __GNUC__
-#define LIKELY(v) __builtin_expect(v, 1)
-#define UNLIKELY(v) __builtin_expect(v, 0)
-#else
-#define LIKELY(v) (v)
-#define UNLIKELY(v) (v)
-#endif
-
-static INLINE int_mv pack_int_mv(int16_t row, int16_t col) {
-  int_mv result;
-  result.as_mv.row = row;
-  result.as_mv.col = col;
-  return result;
-}
-
-static INLINE MV_JOINT_TYPE get_mv_joint(const int_mv mv) {
-  // This is simplified from the C implementation to utilise that
-  //  x->nmvjointsadcost[1] == x->nmvjointsadcost[2]  and
-  //  x->nmvjointsadcost[1] == x->nmvjointsadcost[3]
-  return mv.as_int == 0 ? 0 : 1;
-}
-
-static INLINE int mv_cost(const int_mv mv, const int *joint_cost,
-                          int *const comp_cost[2]) {
-  return joint_cost[get_mv_joint(mv)] + comp_cost[0][mv.as_mv.row] +
-         comp_cost[1][mv.as_mv.col];
-}
-
-static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref,
-                          int sad_per_bit) {
-  const int_mv diff =
-      pack_int_mv(mv.as_mv.row - ref->row, mv.as_mv.col - ref->col);
-  return ROUND_POWER_OF_TWO(
-      (unsigned)mv_cost(diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit,
-      VP9_PROB_COST_SHIFT);
-}
-
-/*****************************************************************************
- * This function utilizes 3 properties of the cost function lookup tables,   *
- * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in       *
- * vp9_encoder.c.                                                            *
- * For the joint cost:                                                       *
- *   - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3]           *
- * For the component costs:                                                  *
- *   - For all i: mvsadcost[0][i] == mvsadcost[1][i]                         *
- *         (Equal costs for both components)                                 *
- *   - For all i: mvsadcost[0][i] == mvsadcost[0][-i]                        *
- *         (Cost function is even)                                           *
- * If these do not hold, then this function cannot be used without           *
- * modification, in which case you can revert to using the C implementation, *
- * which does not rely on these properties.                                  *
- *****************************************************************************/
-int vp9_diamond_search_sad_avx(const MACROBLOCK *x,
-                               const search_site_config *cfg, MV *ref_mv,
-                               MV *best_mv, int search_param, int sad_per_bit,
-                               int *num00, const vp9_variance_fn_ptr_t *fn_ptr,
-                               const MV *center_mv) {
-  const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max);
-  const __m128i v_max_mv_w = _mm_set1_epi32(maxmv.as_int);
-  const int_mv minmv = pack_int_mv(x->mv_limits.row_min, x->mv_limits.col_min);
-  const __m128i v_min_mv_w = _mm_set1_epi32(minmv.as_int);
-
-  const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit);
-
-  const __m128i v_joint_cost_0_d = _mm_set1_epi32(x->nmvjointsadcost[0]);
-  const __m128i v_joint_cost_1_d = _mm_set1_epi32(x->nmvjointsadcost[1]);
-
-  // search_param determines the length of the initial step and hence the number
-  // of iterations.
-  // 0 = initial step (MAX_FIRST_STEP) pel
-  // 1 = (MAX_FIRST_STEP/2) pel,
-  // 2 = (MAX_FIRST_STEP/4) pel...
-  const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param];
-  const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param];
-  const int tot_steps = cfg->total_steps - search_param;
-
-  const int_mv fcenter_mv =
-      pack_int_mv(center_mv->row >> 3, center_mv->col >> 3);
-  const __m128i vfcmv = _mm_set1_epi32(fcenter_mv.as_int);
-
-  const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row);
-  const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col);
-
-  int_mv bmv = pack_int_mv(ref_row, ref_col);
-  int_mv new_bmv = bmv;
-  __m128i v_bmv_w = _mm_set1_epi32(bmv.as_int);
-
-  const int what_stride = x->plane[0].src.stride;
-  const int in_what_stride = x->e_mbd.plane[0].pre[0].stride;
-  const uint8_t *const what = x->plane[0].src.buf;
-  const uint8_t *const in_what =
-      x->e_mbd.plane[0].pre[0].buf + ref_row * in_what_stride + ref_col;
-
-  // Work out the start point for the search
-  const uint8_t *best_address = in_what;
-  const uint8_t *new_best_address = best_address;
-#if VPX_ARCH_X86_64
-  __m128i v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
-#else
-  __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address);
-#endif
-
-  unsigned int best_sad;
-  int i, j, step;
-
-  // Check the prerequisite cost function properties that are easy to check
-  // in an assert. See the function-level documentation for details on all
-  // prerequisites.
-  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]);
-  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]);
-
-  // Check the starting position
-  best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride);
-  best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit);
-
-  *num00 = 0;
-
-  for (i = 0, step = 0; step < tot_steps; step++) {
-    for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) {
-      __m128i v_sad_d, v_cost_d, v_outside_d, v_inside_d, v_diff_mv_w;
-#if VPX_ARCH_X86_64
-      __m128i v_blocka[2];
-#else
-      __m128i v_blocka[1];
-#endif
-
-      // Compute the candidate motion vectors
-      const __m128i v_ss_mv_w = _mm_loadu_si128((const __m128i *)&ss_mv[i]);
-      const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w);
-      // Clamp them to the search bounds
-      __m128i v_these_mv_clamp_w = v_these_mv_w;
-      v_these_mv_clamp_w = _mm_min_epi16(v_these_mv_clamp_w, v_max_mv_w);
-      v_these_mv_clamp_w = _mm_max_epi16(v_these_mv_clamp_w, v_min_mv_w);
-      // The ones that did not change are inside the search area
-      v_inside_d = _mm_cmpeq_epi32(v_these_mv_clamp_w, v_these_mv_w);
-
-      // If none of them are inside, then move on
-      if (LIKELY(_mm_test_all_zeros(v_inside_d, v_inside_d))) {
-        continue;
-      }
-
-      // The inverse mask indicates which of the MVs are outside
-      v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8((int8_t)0xff));
-      // Shift right to keep the sign bit clear, we will use this later
-      // to set the cost to the maximum value.
-      v_outside_d = _mm_srli_epi32(v_outside_d, 1);
-
-      // Compute the difference MV
-      v_diff_mv_w = _mm_sub_epi16(v_these_mv_clamp_w, vfcmv);
-      // We utilise the fact that the cost function is even, and use the
-      // absolute difference. This allows us to use unsigned indexes later
-      // and reduces cache pressure somewhat as only a half of the table
-      // is ever referenced.
-      v_diff_mv_w = _mm_abs_epi16(v_diff_mv_w);
-
-      // Compute the SIMD pointer offsets.
-      {
-#if VPX_ARCH_X86_64  //  sizeof(intptr_t) == 8
-        // Load the offsets
-        __m128i v_bo10_q = _mm_loadu_si128((const __m128i *)&ss_os[i + 0]);
-        __m128i v_bo32_q = _mm_loadu_si128((const __m128i *)&ss_os[i + 2]);
-        // Set the ones falling outside to zero
-        v_bo10_q = _mm_and_si128(v_bo10_q, _mm_cvtepi32_epi64(v_inside_d));
-        v_bo32_q =
-            _mm_and_si128(v_bo32_q, _mm_unpackhi_epi32(v_inside_d, v_inside_d));
-        // Compute the candidate addresses
-        v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q);
-        v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q);
-#else  // VPX_ARCH_X86 //  sizeof(intptr_t) == 4
-        __m128i v_bo_d = _mm_loadu_si128((const __m128i *)&ss_os[i]);
-        v_bo_d = _mm_and_si128(v_bo_d, v_inside_d);
-        v_blocka[0] = _mm_add_epi32(v_ba_d, v_bo_d);
-#endif
-      }
-
-      fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0],
-                     in_what_stride, (uint32_t *)&v_sad_d);
-
-      // Look up the component cost of the residual motion vector
-      {
-        const int32_t row0 = _mm_extract_epi16(v_diff_mv_w, 0);
-        const int32_t col0 = _mm_extract_epi16(v_diff_mv_w, 1);
-        const int32_t row1 = _mm_extract_epi16(v_diff_mv_w, 2);
-        const int32_t col1 = _mm_extract_epi16(v_diff_mv_w, 3);
-        const int32_t row2 = _mm_extract_epi16(v_diff_mv_w, 4);
-        const int32_t col2 = _mm_extract_epi16(v_diff_mv_w, 5);
-        const int32_t row3 = _mm_extract_epi16(v_diff_mv_w, 6);
-        const int32_t col3 = _mm_extract_epi16(v_diff_mv_w, 7);
-
-        // Note: This is a use case for vpgather in AVX2
-        const uint32_t cost0 = x->nmvsadcost[0][row0] + x->nmvsadcost[0][col0];
-        const uint32_t cost1 = x->nmvsadcost[0][row1] + x->nmvsadcost[0][col1];
-        const uint32_t cost2 = x->nmvsadcost[0][row2] + x->nmvsadcost[0][col2];
-        const uint32_t cost3 = x->nmvsadcost[0][row3] + x->nmvsadcost[0][col3];
-
-        __m128i v_cost_10_d, v_cost_32_d;
-        v_cost_10_d = _mm_cvtsi32_si128(cost0);
-        v_cost_10_d = _mm_insert_epi32(v_cost_10_d, cost1, 1);
-        v_cost_32_d = _mm_cvtsi32_si128(cost2);
-        v_cost_32_d = _mm_insert_epi32(v_cost_32_d, cost3, 1);
-        v_cost_d = _mm_unpacklo_epi64(v_cost_10_d, v_cost_32_d);
-      }
-
-      // Now add in the joint cost
-      {
-        const __m128i v_sel_d =
-            _mm_cmpeq_epi32(v_diff_mv_w, _mm_setzero_si128());
-        const __m128i v_joint_cost_d =
-            _mm_blendv_epi8(v_joint_cost_1_d, v_joint_cost_0_d, v_sel_d);
-        v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d);
-      }
-
-      // Multiply by sad_per_bit
-      v_cost_d = _mm_mullo_epi32(v_cost_d, v_spb_d);
-      // ROUND_POWER_OF_TWO(v_cost_d, VP9_PROB_COST_SHIFT)
-      v_cost_d = _mm_add_epi32(v_cost_d,
-                               _mm_set1_epi32(1 << (VP9_PROB_COST_SHIFT - 1)));
-      v_cost_d = _mm_srai_epi32(v_cost_d, VP9_PROB_COST_SHIFT);
-      // Add the cost to the sad
-      v_sad_d = _mm_add_epi32(v_sad_d, v_cost_d);
-
-      // Make the motion vectors outside the search area have max cost
-      // by or'ing in the comparison mask, this way the minimum search won't
-      // pick them.
-      v_sad_d = _mm_or_si128(v_sad_d, v_outside_d);
-
-      // Find the minimum value and index horizontally in v_sad_d
-      {
-        // Try speculatively on 16 bits, so we can use the minpos intrinsic
-        const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d);
-        const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w);
-
-        uint32_t local_best_sad = _mm_extract_epi16(v_minp_w, 0);
-        uint32_t local_best_idx = _mm_extract_epi16(v_minp_w, 1);
-
-        // If the local best value is not saturated, just use it, otherwise
-        // find the horizontal minimum again the hard way on 32 bits.
-        // This is executed rarely.
-        if (UNLIKELY(local_best_sad == 0xffff)) {
-          __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d;
-
-          v_loval_d = v_sad_d;
-          v_loidx_d = _mm_set_epi32(3, 2, 1, 0);
-          v_hival_d = _mm_srli_si128(v_loval_d, 8);
-          v_hiidx_d = _mm_srli_si128(v_loidx_d, 8);
-
-          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);
-
-          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
-          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);
-          v_hival_d = _mm_srli_si128(v_loval_d, 4);
-          v_hiidx_d = _mm_srli_si128(v_loidx_d, 4);
-
-          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);
-
-          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
-          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);
-
-          local_best_sad = _mm_extract_epi32(v_loval_d, 0);
-          local_best_idx = _mm_extract_epi32(v_loidx_d, 0);
-        }
-
-        // Update the global minimum if the local minimum is smaller
-        if (LIKELY(local_best_sad < best_sad)) {
-          new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx];
-          new_best_address = ((const uint8_t **)v_blocka)[local_best_idx];
-
-          best_sad = local_best_sad;
-        }
-      }
-    }
-
-    bmv = new_bmv;
-    best_address = new_best_address;
-
-    v_bmv_w = _mm_set1_epi32(bmv.as_int);
-#if VPX_ARCH_X86_64
-    v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
-#else
-    v_ba_d = _mm_set1_epi32((intptr_t)best_address);
-#endif
-
-    if (UNLIKELY(best_address == in_what)) {
-      (*num00)++;
-    }
-  }
-
-  *best_mv = bmv.as_mv;
-  return best_sad;
-}
diff --git a/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/vp9/encoder/x86/vp9_frame_scale_ssse3.c
index 7685e7bc3e..94506aad0f 100644
--- a/vp9/encoder/x86/vp9_frame_scale_ssse3.c
+++ b/vp9/encoder/x86/vp9_frame_scale_ssse3.c
@@ -469,18 +469,18 @@ static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride,
   // It's used to choose the src offset and filter coefficient offset.
   const int offset_idx1 = (offset1_q4 >> 4) & 1;
   const int offset_idx2 = (offset2_q4 >> 4) & 1;
-  static const shuffle_filter_funcs shuffle_filter_funcs[2] = {
+  static const shuffle_filter_funcs kShuffleFilterFuncs[2] = {
     shuffle_filter_ssse3, shuffle_filter_odd_ssse3
   };
-  static const convolve8_funcs convolve8_funcs[2] = {
+  static const convolve8_funcs kConvolve8Funcs[2] = {
     convolve8_8_even_offset_ssse3, convolve8_8_odd_offset_ssse3
   };
 
   assert(w && h);
 
   shuffle_filter_ssse3(coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK], f0);
-  shuffle_filter_funcs[offset_idx1](coef[offset1_q4 & SUBPEL_MASK], f1);
-  shuffle_filter_funcs[offset_idx2](coef[offset2_q4 & SUBPEL_MASK], f2);
+  kShuffleFilterFuncs[offset_idx1](coef[offset1_q4 & SUBPEL_MASK], f1);
+  kShuffleFilterFuncs[offset_idx2](coef[offset2_q4 & SUBPEL_MASK], f2);
 
   // Sub 64 to avoid overflow.
   // Coef 128 would be treated as -128 in PMADDUBSW. Sub 64 here.
@@ -522,11 +522,11 @@ static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride,
       // 04 14 24 34 44 54 64 74
       // 05 15 25 35 45 55 65 75
       d[0] = convolve8_8_even_offset_ssse3(&s[0], f0);
-      d[1] = convolve8_funcs[offset_idx1](&s[offset1_q4 >> 5], f1);
-      d[2] = convolve8_funcs[offset_idx2](&s[offset2_q4 >> 5], f2);
+      d[1] = kConvolve8Funcs[offset_idx1](&s[offset1_q4 >> 5], f1);
+      d[2] = kConvolve8Funcs[offset_idx2](&s[offset2_q4 >> 5], f2);
       d[3] = convolve8_8_even_offset_ssse3(&s[2], f0);
-      d[4] = convolve8_funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1);
-      d[5] = convolve8_funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2);
+      d[4] = kConvolve8Funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1);
+      d[5] = kConvolve8Funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2);
 
       // 00 10 20 30 40 50 60 70  02 12 22 32 42 52 62 72
       // 01 11 21 31 41 51 61 71  03 13 23 33 43 53 63 73
@@ -598,11 +598,11 @@ static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride,
       loadu_8bit_16x4(t, stride_hor, &s[4]);
 
       d[0] = convolve8_8_even_offset_ssse3(&s[0], f0);
-      d[1] = convolve8_funcs[offset_idx1](&s[offset1_q4 >> 5], f1);
-      d[2] = convolve8_funcs[offset_idx2](&s[offset2_q4 >> 5], f2);
+      d[1] = kConvolve8Funcs[offset_idx1](&s[offset1_q4 >> 5], f1);
+      d[2] = kConvolve8Funcs[offset_idx2](&s[offset2_q4 >> 5], f2);
       d[3] = convolve8_8_even_offset_ssse3(&s[2], f0);
-      d[4] = convolve8_funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1);
-      d[5] = convolve8_funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2);
+      d[4] = kConvolve8Funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1);
+      d[5] = kConvolve8Funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2);
 
       // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
       // 20 21 22 23 24 25 26 27  30 31 32 33 34 35 36 37
@@ -754,8 +754,8 @@ void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src,
   const int src_h = src->y_crop_height;
   const int dst_w = dst->y_crop_width;
   const int dst_h = dst->y_crop_height;
-  const int dst_uv_w = dst_w / 2;
-  const int dst_uv_h = dst_h / 2;
+  const int dst_uv_w = dst->uv_crop_width;
+  const int dst_uv_h = dst->uv_crop_height;
   int scaled = 0;
 
   // phase_scaler is usually 0 or 8.
diff --git a/vp9/encoder/x86/vp9_quantize_avx2.c b/vp9/encoder/x86/vp9_quantize_avx2.c
index 8dfdbd50f6..bf44b08674 100644
--- a/vp9/encoder/x86/vp9_quantize_avx2.c
+++ b/vp9/encoder/x86/vp9_quantize_avx2.c
@@ -16,9 +16,11 @@
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_dsp/x86/bitdepth_conversion_avx2.h"
 #include "vpx_dsp/x86/quantize_sse2.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
 
 // Zero fill 8 positions in the output buffer.
-static INLINE void store_zero_tran_low(tran_low_t *a) {
+static VPX_FORCE_INLINE void store_zero_tran_low(tran_low_t *a) {
   const __m256i zero = _mm256_setzero_si256();
 #if CONFIG_VP9_HIGHBITDEPTH
   _mm256_storeu_si256((__m256i *)(a), zero);
@@ -28,37 +30,85 @@ static INLINE void store_zero_tran_low(tran_low_t *a) {
 #endif
 }
 
-static INLINE __m256i scan_eob_256(const __m256i *iscan_ptr,
-                                   __m256i *coeff256) {
-  const __m256i iscan = _mm256_loadu_si256(iscan_ptr);
-  const __m256i zero256 = _mm256_setzero_si256();
+static VPX_FORCE_INLINE void load_fp_values_avx2(
+    const struct macroblock_plane *mb_plane, __m256i *round, __m256i *quant,
+    const int16_t *dequant_ptr, __m256i *dequant) {
+  *round = _mm256_castsi128_si256(
+      _mm_load_si128((const __m128i *)mb_plane->round_fp));
+  *round = _mm256_permute4x64_epi64(*round, 0x54);
+  *quant = _mm256_castsi128_si256(
+      _mm_load_si128((const __m128i *)mb_plane->quant_fp));
+  *quant = _mm256_permute4x64_epi64(*quant, 0x54);
+  *dequant =
+      _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr));
+  *dequant = _mm256_permute4x64_epi64(*dequant, 0x54);
+}
+
+static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan,
+                                                 __m256i v_eobmax,
+                                                 __m256i v_mask) {
 #if CONFIG_VP9_HIGHBITDEPTH
-  // The _mm256_packs_epi32() in load_tran_low() packs the 64 bit coeff as
-  // B1 A1 B0 A0.  Shuffle to B1 B0 A1 A0 in order to scan eob correctly.
-  const __m256i _coeff256 = _mm256_permute4x64_epi64(*coeff256, 0xd8);
-  const __m256i zero_coeff0 = _mm256_cmpeq_epi16(_coeff256, zero256);
+  const __m256i v_iscan = _mm256_permute4x64_epi64(
+      _mm256_loadu_si256((const __m256i *)iscan), 0xD8);
 #else
-  const __m256i zero_coeff0 = _mm256_cmpeq_epi16(*coeff256, zero256);
+  const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan);
 #endif
-  const __m256i nzero_coeff0 = _mm256_cmpeq_epi16(zero_coeff0, zero256);
-  // Add one to convert from indices to counts
-  const __m256i iscan_plus_one = _mm256_sub_epi16(iscan, nzero_coeff0);
-  return _mm256_and_si256(iscan_plus_one, nzero_coeff0);
+  const __m256i v_nz_iscan = _mm256_and_si256(v_iscan, v_mask);
+  return _mm256_max_epi16(v_eobmax, v_nz_iscan);
+}
+
+static VPX_FORCE_INLINE uint16_t get_max_eob(__m256i eob256) {
+  const __m256i eob_lo = eob256;
+  // Copy upper 128 to lower 128
+  const __m256i eob_hi = _mm256_permute2x128_si256(eob256, eob256, 0X81);
+  __m256i eob = _mm256_max_epi16(eob_lo, eob_hi);
+  __m256i eob_s = _mm256_shuffle_epi32(eob, 0xe);
+  eob = _mm256_max_epi16(eob, eob_s);
+  eob_s = _mm256_shufflelo_epi16(eob, 0xe);
+  eob = _mm256_max_epi16(eob, eob_s);
+  eob_s = _mm256_shufflelo_epi16(eob, 1);
+  eob = _mm256_max_epi16(eob, eob_s);
+#if defined(_MSC_VER) && (_MSC_VER < 1910)
+  return _mm_cvtsi128_si32(_mm256_extracti128_si256(eob, 0)) & 0xffff;
+#else
+  return (uint16_t)_mm256_extract_epi16(eob, 0);
+#endif
+}
+
+static VPX_FORCE_INLINE void quantize_fp_16(
+    const __m256i *round, const __m256i *quant, const __m256i *dequant,
+    const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) {
+  const __m256i coeff = load_tran_low(coeff_ptr);
+  const __m256i abs_coeff = _mm256_abs_epi16(coeff);
+  const int32_t nzflag =
+      _mm256_movemask_epi8(_mm256_cmpgt_epi16(abs_coeff, *thr));
+
+  if (nzflag) {
+    const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round);
+    const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant);
+    const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff);
+    const __m256i dqcoeff = _mm256_mullo_epi16(qcoeff, *dequant);
+    const __m256i nz_mask =
+        _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256());
+    store_tran_low(qcoeff, qcoeff_ptr);
+    store_tran_low(dqcoeff, dqcoeff_ptr);
+
+    *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask);
+  } else {
+    store_zero_tran_low(qcoeff_ptr);
+    store_zero_tran_low(dqcoeff_ptr);
+  }
 }
 
 void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                          int skip_block, const int16_t *round_ptr,
-                          const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
-                          tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                          uint16_t *eob_ptr, const int16_t *scan,
-                          const int16_t *iscan) {
-  __m128i eob;
-  __m256i round256, quant256, dequant256;
-  __m256i eob256, thr256;
-
-  (void)scan;
-  (void)skip_block;
-  assert(!skip_block);
+                          const struct macroblock_plane *const mb_plane,
+                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                          const struct ScanOrder *const scan_order) {
+  __m256i round, quant, dequant, thr;
+  __m256i eob_max = _mm256_setzero_si256();
+  const int16_t *iscan = scan_order->iscan;
 
   coeff_ptr += n_coeffs;
   iscan += n_coeffs;
@@ -66,74 +116,324 @@ void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   dqcoeff_ptr += n_coeffs;
   n_coeffs = -n_coeffs;
 
-  {
-    __m256i coeff256;
-
-    // Setup global values
-    {
-      const __m128i round = _mm_load_si128((const __m128i *)round_ptr);
-      const __m128i quant = _mm_load_si128((const __m128i *)quant_ptr);
-      const __m128i dequant = _mm_load_si128((const __m128i *)dequant_ptr);
-      round256 = _mm256_castsi128_si256(round);
-      round256 = _mm256_permute4x64_epi64(round256, 0x54);
-
-      quant256 = _mm256_castsi128_si256(quant);
-      quant256 = _mm256_permute4x64_epi64(quant256, 0x54);
-
-      dequant256 = _mm256_castsi128_si256(dequant);
-      dequant256 = _mm256_permute4x64_epi64(dequant256, 0x54);
-    }
-
-    {
-      __m256i qcoeff256;
-      __m256i qtmp256;
-      coeff256 = load_tran_low(coeff_ptr + n_coeffs);
-      qcoeff256 = _mm256_abs_epi16(coeff256);
-      qcoeff256 = _mm256_adds_epi16(qcoeff256, round256);
-      qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256);
-      qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256);
-      store_tran_low(qcoeff256, qcoeff_ptr + n_coeffs);
-      coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256);
-      store_tran_low(coeff256, dqcoeff_ptr + n_coeffs);
-    }
-
-    eob256 = scan_eob_256((const __m256i *)(iscan + n_coeffs), &coeff256);
+  // Setup global values
+  load_fp_values_avx2(mb_plane, &round, &quant, dequant_ptr, &dequant);
+  thr = _mm256_setzero_si256();
+
+  quantize_fp_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs,
+                 iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+                 dqcoeff_ptr + n_coeffs, &eob_max);
+
+  n_coeffs += 8 * 2;
+
+  // remove dc constants
+  dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31);
+  quant = _mm256_permute2x128_si256(quant, quant, 0x31);
+  round = _mm256_permute2x128_si256(round, round, 0x31);
+  thr = _mm256_srai_epi16(dequant, 1);
+
+  // AC only loop
+  while (n_coeffs < 0) {
+    quantize_fp_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs,
+                   iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+                   dqcoeff_ptr + n_coeffs, &eob_max);
     n_coeffs += 8 * 2;
   }
 
-  // remove dc constants
-  dequant256 = _mm256_permute2x128_si256(dequant256, dequant256, 0x31);
-  quant256 = _mm256_permute2x128_si256(quant256, quant256, 0x31);
-  round256 = _mm256_permute2x128_si256(round256, round256, 0x31);
+  *eob_ptr = get_max_eob(eob_max);
+}
+
+// Enable this flag when matching the optimized code to
+// vp9_quantize_fp_32x32_c(). Disabled, the optimized code will match the
+// existing ssse3 code and quantize_fp_32x32_nz_c().
+//
+// #define MATCH_VP9_QUANTIZE_FP_32X32_C
+
+#ifndef MATCH_VP9_QUANTIZE_FP_32X32_C
+static VPX_FORCE_INLINE void quantize_fp_32x32_16_no_nzflag(
+    const __m256i *round, const __m256i *quant, const __m256i *dequant,
+    const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) {
+  const __m256i coeff = load_tran_low(coeff_ptr);
+  const __m256i abs_coeff = _mm256_abs_epi16(coeff);
+  const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round);
+  const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant);
+  const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff);
+  const __m256i abs_dqcoeff =
+      _mm256_srli_epi16(_mm256_mullo_epi16(abs_qcoeff, *dequant), 1);
+  const __m256i dqcoeff = _mm256_sign_epi16(abs_dqcoeff, coeff);
+  const __m256i nz_mask =
+      _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256());
+  store_tran_low(qcoeff, qcoeff_ptr);
+  store_tran_low(dqcoeff, dqcoeff_ptr);
+
+  *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask);
+  (void)thr;
+}
+#endif
+
+static VPX_FORCE_INLINE void quantize_fp_32x32_16(
+    const __m256i *round, const __m256i *quant, const __m256i *dequant,
+    const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) {
+  const __m256i coeff = load_tran_low(coeff_ptr);
+  const __m256i abs_coeff = _mm256_abs_epi16(coeff);
+  const __m256i thr_mask = _mm256_cmpgt_epi16(abs_coeff, *thr);
+  const int32_t nzflag = _mm256_movemask_epi8(thr_mask);
 
-  thr256 = _mm256_srai_epi16(dequant256, 1);
+  if (nzflag) {
+#ifdef MATCH_VP9_QUANTIZE_FP_32X32_C
+    const __m256i tmp_rnd =
+        _mm256_and_si256(_mm256_adds_epi16(abs_coeff, *round), thr_mask);
+#else
+    const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round);
+#endif
+    const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant);
+    const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff);
+    const __m256i abs_dqcoeff =
+        _mm256_srli_epi16(_mm256_mullo_epi16(abs_qcoeff, *dequant), 1);
+    const __m256i dqcoeff = _mm256_sign_epi16(abs_dqcoeff, coeff);
+    const __m256i nz_mask =
+        _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256());
+    store_tran_low(qcoeff, qcoeff_ptr);
+    store_tran_low(dqcoeff, dqcoeff_ptr);
+
+    *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask);
+  } else {
+    store_zero_tran_low(qcoeff_ptr);
+    store_zero_tran_low(dqcoeff_ptr);
+  }
+}
+
+void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                                const struct macroblock_plane *const mb_plane,
+                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                                const struct ScanOrder *const scan_order) {
+  __m256i round, quant, dequant, thr;
+  __m256i eob_max = _mm256_setzero_si256();
+  const int16_t *iscan = scan_order->iscan;
+
+  coeff_ptr += n_coeffs;
+  iscan += n_coeffs;
+  qcoeff_ptr += n_coeffs;
+  dqcoeff_ptr += n_coeffs;
+  n_coeffs = -n_coeffs;
+
+  // Setup global values
+  load_fp_values_avx2(mb_plane, &round, &quant, dequant_ptr, &dequant);
+  thr = _mm256_srli_epi16(dequant, 2);
+  quant = _mm256_slli_epi16(quant, 1);
+  {
+    const __m256i rnd = _mm256_set1_epi16((int16_t)1);
+    round = _mm256_add_epi16(round, rnd);
+    round = _mm256_srai_epi16(round, 1);
+  }
+
+#ifdef MATCH_VP9_QUANTIZE_FP_32X32_C
+  // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when
+  // calculating the zbin mask.
+  thr = _mm256_sub_epi16(thr, _mm256_set1_epi16(1));
+  quantize_fp_32x32_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs,
+                       iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+                       dqcoeff_ptr + n_coeffs, &eob_max);
+#else
+  quantize_fp_32x32_16_no_nzflag(
+      &round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, iscan + n_coeffs,
+      qcoeff_ptr + n_coeffs, dqcoeff_ptr + n_coeffs, &eob_max);
+#endif
+
+  n_coeffs += 8 * 2;
+
+  // remove dc constants
+  dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31);
+  quant = _mm256_permute2x128_si256(quant, quant, 0x31);
+  round = _mm256_permute2x128_si256(round, round, 0x31);
+  thr = _mm256_permute2x128_si256(thr, thr, 0x31);
 
   // AC only loop
   while (n_coeffs < 0) {
-    __m256i coeff256 = load_tran_low(coeff_ptr + n_coeffs);
-    __m256i qcoeff256 = _mm256_abs_epi16(coeff256);
-    int32_t nzflag =
-        _mm256_movemask_epi8(_mm256_cmpgt_epi16(qcoeff256, thr256));
-
-    if (nzflag) {
-      __m256i qtmp256;
-      qcoeff256 = _mm256_adds_epi16(qcoeff256, round256);
-      qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256);
-      qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256);
-      store_tran_low(qcoeff256, qcoeff_ptr + n_coeffs);
-      coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256);
-      store_tran_low(coeff256, dqcoeff_ptr + n_coeffs);
-      eob256 = _mm256_max_epi16(
-          eob256, scan_eob_256((const __m256i *)(iscan + n_coeffs), &coeff256));
-    } else {
-      store_zero_tran_low(qcoeff_ptr + n_coeffs);
-      store_zero_tran_low(dqcoeff_ptr + n_coeffs);
-    }
+    quantize_fp_32x32_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs,
+                         iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+                         dqcoeff_ptr + n_coeffs, &eob_max);
     n_coeffs += 8 * 2;
   }
 
-  eob = _mm_max_epi16(_mm256_castsi256_si128(eob256),
-                      _mm256_extracti128_si256(eob256, 1));
+  *eob_ptr = get_max_eob(eob_max);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32_logscale(const __m256i *x,
+                                                               const __m256i *y,
+                                                               int log_scale) {
+  __m256i prod_lo = _mm256_mul_epi32(*x, *y);
+  __m256i prod_hi = _mm256_srli_epi64(*x, 32);
+  const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
+  const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
+  prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);
+  prod_lo = _mm256_srli_epi64(prod_lo, 16 - log_scale);
+  prod_lo = _mm256_and_si256(prod_lo, mask);
+  prod_hi = _mm256_srli_epi64(prod_hi, 16 - log_scale);
+  prod_hi = _mm256_slli_epi64(prod_hi, 32);
+  return _mm256_or_si256(prod_lo, prod_hi);
+}
+
+static VPX_FORCE_INLINE __m256i highbd_init_256(const int16_t *val_ptr) {
+  const __m128i v = _mm_load_si128((const __m128i *)val_ptr);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i dc = _mm_unpacklo_epi16(v, zero);
+  const __m128i ac = _mm_unpackhi_epi16(v, zero);
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1);
+}
+
+static VPX_FORCE_INLINE void highbd_load_fp_values(
+    const struct macroblock_plane *mb_plane, __m256i *round, __m256i *quant,
+    const int16_t *dequant_ptr, __m256i *dequant) {
+  *round = highbd_init_256(mb_plane->round_fp);
+  *quant = highbd_init_256(mb_plane->quant_fp);
+  *dequant = highbd_init_256(dequant_ptr);
+}
+
+static VPX_FORCE_INLINE __m256i highbd_get_max_lane_eob(
+    const int16_t *iscan_ptr, __m256i eobmax, __m256i nz_mask) {
+  const __m256i packed_nz_mask =
+      _mm256_packs_epi32(nz_mask, _mm256_setzero_si256());
+  const __m256i packed_nz_mask_perm =
+      _mm256_permute4x64_epi64(packed_nz_mask, 0xD8);
+  const __m256i iscan =
+      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr));
+  const __m256i nz_iscan = _mm256_and_si256(iscan, packed_nz_mask_perm);
+  return _mm256_max_epi16(eobmax, nz_iscan);
+}
+
+static VPX_FORCE_INLINE void highbd_quantize_fp(
+    const __m256i *round, const __m256i *quant, const __m256i *dequant,
+    const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob) {
+  const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+  const __m256i abs_coeff = _mm256_abs_epi32(coeff);
+  const __m256i tmp_rnd = _mm256_add_epi32(abs_coeff, *round);
+  const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp_rnd, quant, 0);
+  const __m256i abs_dq = _mm256_mullo_epi32(abs_q, *dequant);
+  const __m256i q = _mm256_sign_epi32(abs_q, coeff);
+  const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);
+  const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
+
+  _mm256_storeu_si256((__m256i *)qcoeff_ptr, q);
+  _mm256_storeu_si256((__m256i *)dqcoeff_ptr, dq);
+
+  *eob = highbd_get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+}
+
+void vp9_highbd_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                                 const struct macroblock_plane *const mb_plane,
+                                 tran_low_t *qcoeff_ptr,
+                                 tran_low_t *dqcoeff_ptr,
+                                 const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                                 const struct ScanOrder *const scan_order) {
+  const int step = 8;
+  __m256i round, quant, dequant;
+  __m256i eob_max = _mm256_setzero_si256();
+  const int16_t *iscan = scan_order->iscan;
+
+  coeff_ptr += n_coeffs;
+  iscan += n_coeffs;
+  qcoeff_ptr += n_coeffs;
+  dqcoeff_ptr += n_coeffs;
+  n_coeffs = -n_coeffs;
+
+  // Setup global values
+  highbd_load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant);
+
+  highbd_quantize_fp(&round, &quant, &dequant, coeff_ptr + n_coeffs,
+                     iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+                     dqcoeff_ptr + n_coeffs, &eob_max);
+
+  n_coeffs += step;
+
+  // remove dc constants
+  dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31);
+  quant = _mm256_permute2x128_si256(quant, quant, 0x31);
+  round = _mm256_permute2x128_si256(round, round, 0x31);
+
+  // AC only loop
+  while (n_coeffs < 0) {
+    highbd_quantize_fp(&round, &quant, &dequant, coeff_ptr + n_coeffs,
+                       iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+                       dqcoeff_ptr + n_coeffs, &eob_max);
+    n_coeffs += step;
+  }
+
+  *eob_ptr = get_max_eob(eob_max);
+}
+
+static VPX_FORCE_INLINE void highbd_quantize_fp_32x32(
+    const __m256i *round, const __m256i *quant, const __m256i *dequant,
+    const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob) {
+  const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+  const __m256i abs_coeff = _mm256_abs_epi32(coeff);
+  const __m256i thr_mask = _mm256_cmpgt_epi32(abs_coeff, *thr);
+  const __m256i tmp_rnd =
+      _mm256_and_si256(_mm256_add_epi32(abs_coeff, *round), thr_mask);
+  const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp_rnd, quant, 0);
+  const __m256i abs_dq =
+      _mm256_srli_epi32(_mm256_mullo_epi32(abs_q, *dequant), 1);
+  const __m256i q = _mm256_sign_epi32(abs_q, coeff);
+  const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);
+  const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
+
+  _mm256_storeu_si256((__m256i *)qcoeff_ptr, q);
+  _mm256_storeu_si256((__m256i *)dqcoeff_ptr, dq);
+
+  *eob = highbd_get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+}
+
+void vp9_highbd_quantize_fp_32x32_avx2(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+    const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const struct ScanOrder *const scan_order) {
+  const int step = 8;
+  __m256i round, quant, dequant, thr;
+  __m256i eob_max = _mm256_setzero_si256();
+  const int16_t *iscan = scan_order->iscan;
+
+  coeff_ptr += n_coeffs;
+  iscan += n_coeffs;
+  qcoeff_ptr += n_coeffs;
+  dqcoeff_ptr += n_coeffs;
+  n_coeffs = -n_coeffs;
+
+  // Setup global values
+  highbd_load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant);
+  thr = _mm256_srli_epi32(dequant, 2);
+  // Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when
+  // calculating the zbin mask.
+  thr = _mm256_sub_epi32(thr, _mm256_set1_epi32(1));
+  quant = _mm256_slli_epi32(quant, 1);
+  round = _mm256_srai_epi32(_mm256_add_epi32(round, _mm256_set1_epi32(1)), 1);
+
+  highbd_quantize_fp_32x32(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs,
+                           iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+                           dqcoeff_ptr + n_coeffs, &eob_max);
+
+  n_coeffs += step;
+
+  // remove dc constants
+  dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31);
+  quant = _mm256_permute2x128_si256(quant, quant, 0x31);
+  round = _mm256_permute2x128_si256(round, round, 0x31);
+  thr = _mm256_permute2x128_si256(thr, thr, 0x31);
+
+  // AC only loop
+  while (n_coeffs < 0) {
+    highbd_quantize_fp_32x32(
+        &round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, iscan + n_coeffs,
+        qcoeff_ptr + n_coeffs, dqcoeff_ptr + n_coeffs, &eob_max);
+    n_coeffs += step;
+  }
 
-  *eob_ptr = accumulate_eob(eob);
+  *eob_ptr = get_max_eob(eob_max);
 }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/encoder/x86/vp9_quantize_sse2.c b/vp9/encoder/x86/vp9_quantize_sse2.c
index e3d803b8f0..2481eb366e 100644
--- a/vp9/encoder/x86/vp9_quantize_sse2.c
+++ b/vp9/encoder/x86/vp9_quantize_sse2.c
@@ -16,187 +16,111 @@
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
+#include "vpx_dsp/x86/quantize_sse2.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
 
 void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                          int skip_block, const int16_t *round_ptr,
-                          const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
-                          tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                          uint16_t *eob_ptr, const int16_t *scan,
-                          const int16_t *iscan) {
-  __m128i zero;
+                          const struct macroblock_plane *const mb_plane,
+                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                          const struct ScanOrder *const scan_order) {
+  const __m128i zero = _mm_setzero_si128();
   __m128i thr;
   int nzflag;
-  __m128i eob;
+  int index = 16;
   __m128i round, quant, dequant;
+  __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+  __m128i qcoeff0, qcoeff1;
+  __m128i eob;
+  const int16_t *iscan = scan_order->iscan;
 
-  (void)scan;
-  (void)skip_block;
-  assert(!skip_block);
-
-  coeff_ptr += n_coeffs;
-  iscan += n_coeffs;
-  qcoeff_ptr += n_coeffs;
-  dqcoeff_ptr += n_coeffs;
-  n_coeffs = -n_coeffs;
-  zero = _mm_setzero_si128();
-
-  {
-    __m128i coeff0, coeff1;
-
-    // Setup global values
-    {
-      round = _mm_load_si128((const __m128i *)round_ptr);
-      quant = _mm_load_si128((const __m128i *)quant_ptr);
-      dequant = _mm_load_si128((const __m128i *)dequant_ptr);
-    }
+  // Setup global values.
+  load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant);
 
-    {
-      __m128i coeff0_sign, coeff1_sign;
-      __m128i qcoeff0, qcoeff1;
-      __m128i qtmp0, qtmp1;
-      // Do DC and first 15 AC
-      coeff0 = load_tran_low(coeff_ptr + n_coeffs);
-      coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
-
-      // Poor man's sign extract
-      coeff0_sign = _mm_srai_epi16(coeff0, 15);
-      coeff1_sign = _mm_srai_epi16(coeff1, 15);
-      qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
-      qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
-      qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-      qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+  // Do DC and first 15 AC.
+  coeff0 = load_tran_low(coeff_ptr);
+  coeff1 = load_tran_low(coeff_ptr + 8);
 
-      qcoeff0 = _mm_adds_epi16(qcoeff0, round);
-      round = _mm_unpackhi_epi64(round, round);
-      qcoeff1 = _mm_adds_epi16(qcoeff1, round);
-      qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
-      quant = _mm_unpackhi_epi64(quant, quant);
-      qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+  // Poor man's abs().
+  coeff0_sign = _mm_srai_epi16(coeff0, 15);
+  coeff1_sign = _mm_srai_epi16(coeff1, 15);
+  qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+  qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
 
-      // Reinsert signs
-      qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
-      qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
-      qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-      qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+  qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+  qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
 
-      store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
-      store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
+  round = _mm_unpackhi_epi64(round, round);
+  quant = _mm_unpackhi_epi64(quant, quant);
 
-      coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
-      dequant = _mm_unpackhi_epi64(dequant, dequant);
-      coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+  qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+  qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
 
-      store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
-      store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
-    }
+  // Reinsert signs.
+  qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+  qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
 
-    {
-      // Scan for eob
-      __m128i zero_coeff0, zero_coeff1;
-      __m128i nzero_coeff0, nzero_coeff1;
-      __m128i iscan0, iscan1;
-      __m128i eob1;
-      zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
-      zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
-      nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
-      nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
-      iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs));
-      iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1);
-      // Add one to convert from indices to counts
-      iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
-      iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
-      eob = _mm_and_si128(iscan0, nzero_coeff0);
-      eob1 = _mm_and_si128(iscan1, nzero_coeff1);
-      eob = _mm_max_epi16(eob, eob1);
-    }
-    n_coeffs += 8 * 2;
-  }
+  store_tran_low(qcoeff0, qcoeff_ptr);
+  store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+  qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+  dequant = _mm_unpackhi_epi64(dequant, dequant);
+  qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+  store_tran_low(qcoeff0, dqcoeff_ptr);
+  store_tran_low(qcoeff1, dqcoeff_ptr + 8);
+
+  eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
 
   thr = _mm_srai_epi16(dequant, 1);
 
-  // AC only loop
-  while (n_coeffs < 0) {
-    __m128i coeff0, coeff1;
-    {
-      __m128i coeff0_sign, coeff1_sign;
-      __m128i qcoeff0, qcoeff1;
-      __m128i qtmp0, qtmp1;
-
-      coeff0 = load_tran_low(coeff_ptr + n_coeffs);
-      coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
-
-      // Poor man's sign extract
-      coeff0_sign = _mm_srai_epi16(coeff0, 15);
-      coeff1_sign = _mm_srai_epi16(coeff1, 15);
-      qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
-      qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
-      qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-      qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-      nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
-               _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
-
-      if (nzflag) {
-        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
-        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
-        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
-        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
-
-        // Reinsert signs
-        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
-        store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
-
-        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
-        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
-        store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
-        store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
-      } else {
-        store_zero_tran_low(qcoeff_ptr + n_coeffs);
-        store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);
-
-        store_zero_tran_low(dqcoeff_ptr + n_coeffs);
-        store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);
-      }
-    }
+  // AC only loop.
+  while (index < n_coeffs) {
+    coeff0 = load_tran_low(coeff_ptr + index);
+    coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+    // Poor man's abs().
+    coeff0_sign = _mm_srai_epi16(coeff0, 15);
+    coeff1_sign = _mm_srai_epi16(coeff1, 15);
+    qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+    qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+    nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+             _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
 
     if (nzflag) {
-      // Scan for eob
-      __m128i zero_coeff0, zero_coeff1;
-      __m128i nzero_coeff0, nzero_coeff1;
-      __m128i iscan0, iscan1;
-      __m128i eob0, eob1;
-      zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
-      zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
-      nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
-      nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
-      iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs));
-      iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1);
-      // Add one to convert from indices to counts
-      iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
-      iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
-      eob0 = _mm_and_si128(iscan0, nzero_coeff0);
-      eob1 = _mm_and_si128(iscan1, nzero_coeff1);
-      eob0 = _mm_max_epi16(eob0, eob1);
+      __m128i eob0;
+      qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+      qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+      qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
+      qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+      // Reinsert signs.
+      qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+      qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+      store_tran_low(qcoeff0, qcoeff_ptr + index);
+      store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+      qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+      qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+      store_tran_low(qcoeff0, dqcoeff_ptr + index);
+      store_tran_low(qcoeff1, dqcoeff_ptr + index + 8);
+
+      eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
       eob = _mm_max_epi16(eob, eob0);
+    } else {
+      store_zero_tran_low(qcoeff_ptr + index);
+      store_zero_tran_low(qcoeff_ptr + index + 8);
+
+      store_zero_tran_low(dqcoeff_ptr + index);
+      store_zero_tran_low(dqcoeff_ptr + index + 8);
     }
-    n_coeffs += 8 * 2;
-  }
 
-  // Accumulate EOB
-  {
-    __m128i eob_shuffled;
-    eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
-    eob = _mm_max_epi16(eob, eob_shuffled);
-    eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
-    eob = _mm_max_epi16(eob, eob_shuffled);
-    eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
-    eob = _mm_max_epi16(eob, eob_shuffled);
-    *eob_ptr = _mm_extract_epi16(eob, 1);
+    index += 16;
   }
+
+  *eob_ptr = accumulate_eob(eob);
 }
diff --git a/vp9/encoder/x86/vp9_quantize_ssse3.c b/vp9/encoder/x86/vp9_quantize_ssse3.c
new file mode 100644
index 0000000000..98decae749
--- /dev/null
+++ b/vp9/encoder/x86/vp9_quantize_ssse3.c
@@ -0,0 +1,252 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <tmmintrin.h>
+
+#include "./vp9_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
+#include "vpx_dsp/x86/quantize_sse2.h"
+#include "vpx_dsp/x86/quantize_ssse3.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
+
+void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                           const struct macroblock_plane *const mb_plane,
+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                           const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                           const struct ScanOrder *const scan_order) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i thr;
+  int nzflag;
+  int index = 16;
+  __m128i round, quant, dequant;
+  __m128i coeff0, coeff1;
+  __m128i qcoeff0, qcoeff1;
+  __m128i eob;
+  const int16_t *iscan = scan_order->iscan;
+
+  // Setup global values.
+  load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant);
+
+  // Do DC and first 15 AC.
+  coeff0 = load_tran_low(coeff_ptr);
+  coeff1 = load_tran_low(coeff_ptr + 8);
+
+  qcoeff0 = _mm_abs_epi16(coeff0);
+  qcoeff1 = _mm_abs_epi16(coeff1);
+
+  qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+  qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
+
+  round = _mm_unpackhi_epi64(round, round);
+  quant = _mm_unpackhi_epi64(quant, quant);
+
+  qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+  qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+  // Reinsert signs.
+  qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+  qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+  store_tran_low(qcoeff0, qcoeff_ptr);
+  store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+  qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+  dequant = _mm_unpackhi_epi64(dequant, dequant);
+  qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+  store_tran_low(qcoeff0, dqcoeff_ptr);
+  store_tran_low(qcoeff1, dqcoeff_ptr + 8);
+
+  eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
+
+  thr = _mm_srai_epi16(dequant, 1);
+
+  // AC only loop.
+  while (index < n_coeffs) {
+    coeff0 = load_tran_low(coeff_ptr + index);
+    coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+    qcoeff0 = _mm_abs_epi16(coeff0);
+    qcoeff1 = _mm_abs_epi16(coeff1);
+
+    nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+             _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
+
+    if (nzflag) {
+      __m128i eob0;
+      qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+      qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+      qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
+      qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+      // Reinsert signs.
+      qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+      qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+      store_tran_low(qcoeff0, qcoeff_ptr + index);
+      store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+      qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+      qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+      store_tran_low(qcoeff0, dqcoeff_ptr + index);
+      store_tran_low(qcoeff1, dqcoeff_ptr + index + 8);
+
+      eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
+      eob = _mm_max_epi16(eob, eob0);
+    } else {
+      store_zero_tran_low(qcoeff_ptr + index);
+      store_zero_tran_low(qcoeff_ptr + index + 8);
+
+      store_zero_tran_low(dqcoeff_ptr + index);
+      store_zero_tran_low(dqcoeff_ptr + index + 8);
+    }
+
+    index += 16;
+  }
+
+  *eob_ptr = accumulate_eob(eob);
+}
+
+void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                                 const struct macroblock_plane *const mb_plane,
+                                 tran_low_t *qcoeff_ptr,
+                                 tran_low_t *dqcoeff_ptr,
+                                 const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                                 const struct ScanOrder *const scan_order) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one_s16 = _mm_set1_epi16(1);
+  __m128i thr;
+  int nzflag;
+  int index = 16;
+  __m128i round, quant, dequant;
+  __m128i coeff0, coeff1;
+  __m128i qcoeff0, qcoeff1;
+  __m128i eob;
+  const int16_t *iscan = scan_order->iscan;
+
+  // Setup global values.
+  load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant);
+  // The 32x32 halves round.
+  round = _mm_add_epi16(round, one_s16);
+  round = _mm_srli_epi16(round, 1);
+
+  // The 16x16 shifts by 16, the 32x32 shifts by 15. We want to use pmulhw so
+  // upshift quant to account for this.
+  quant = _mm_slli_epi16(quant, 1);
+
+  // Do DC and first 15 AC.
+  coeff0 = load_tran_low(coeff_ptr);
+  coeff1 = load_tran_low(coeff_ptr + 8);
+
+  qcoeff0 = _mm_abs_epi16(coeff0);
+  qcoeff1 = _mm_abs_epi16(coeff1);
+
+  qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+  qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
+
+  round = _mm_unpackhi_epi64(round, round);
+  quant = _mm_unpackhi_epi64(quant, quant);
+
+  qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+  qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+  // Reinsert signs.
+  qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+  qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+  store_tran_low(qcoeff0, qcoeff_ptr);
+  store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+  // Get the abs value of qcoeff again so we can use shifts for division.
+  qcoeff0 = _mm_abs_epi16(qcoeff0);
+  qcoeff1 = _mm_abs_epi16(qcoeff1);
+
+  qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+  dequant = _mm_unpackhi_epi64(dequant, dequant);
+  qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+  // Divide by 2.
+  qcoeff0 = _mm_srli_epi16(qcoeff0, 1);
+  qcoeff1 = _mm_srli_epi16(qcoeff1, 1);
+
+  // Reinsert signs.
+  qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+  qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+  store_tran_low(qcoeff0, dqcoeff_ptr);
+  store_tran_low(qcoeff1, dqcoeff_ptr + 8);
+
+  eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
+
+  thr = _mm_srai_epi16(dequant, 2);
+
+  // AC only loop.
+  while (index < n_coeffs) {
+    coeff0 = load_tran_low(coeff_ptr + index);
+    coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+    qcoeff0 = _mm_abs_epi16(coeff0);
+    qcoeff1 = _mm_abs_epi16(coeff1);
+
+    nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+             _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
+
+    if (nzflag) {
+      qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+      qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+      qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
+      qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+      // Reinsert signs.
+      qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+      qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+      store_tran_low(qcoeff0, qcoeff_ptr + index);
+      store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+      // Get the abs value of qcoeff again so we can use shifts for division.
+      qcoeff0 = _mm_abs_epi16(qcoeff0);
+      qcoeff1 = _mm_abs_epi16(qcoeff1);
+
+      qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+      qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+      // Divide by 2.
+      qcoeff0 = _mm_srli_epi16(qcoeff0, 1);
+      qcoeff1 = _mm_srli_epi16(qcoeff1, 1);
+
+      // Reinsert signs.
+      qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+      qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+      store_tran_low(qcoeff0, dqcoeff_ptr + index);
+      store_tran_low(qcoeff1, dqcoeff_ptr + index + 8);
+    } else {
+      store_zero_tran_low(qcoeff_ptr + index);
+      store_zero_tran_low(qcoeff_ptr + index + 8);
+
+      store_zero_tran_low(dqcoeff_ptr + index);
+      store_zero_tran_low(dqcoeff_ptr + index + 8);
+    }
+
+    if (nzflag) {
+      const __m128i eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
+      eob = _mm_max_epi16(eob, eob0);
+    }
+    index += 16;
+  }
+
+  *eob_ptr = accumulate_eob(eob);
+}
diff --git a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
deleted file mode 100644
index 5703aa3bb6..0000000000
--- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
+++ /dev/null
@@ -1,178 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%define private_prefix vp9
-
-%include "third_party/x86inc/x86inc.asm"
-%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
-
-SECTION_RODATA
-pw_1: times 8 dw 1
-
-SECTION .text
-
-%macro QUANTIZE_FP 2
-cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, round, quant, \
-                                qcoeff, dqcoeff, dequant, \
-                                eob, scan, iscan
-
-  ; actual quantize loop - setup pointers, rounders, etc.
-  movifnidn                   coeffq, coeffmp
-  movifnidn                  ncoeffq, ncoeffmp
-  mov                             r2, dequantmp
-  movifnidn                   roundq, roundmp
-  movifnidn                   quantq, quantmp
-  mova                            m1, [roundq]             ; m1 = round
-  mova                            m2, [quantq]             ; m2 = quant
-%ifidn %1, fp_32x32
-  pcmpeqw                         m5, m5
-  psrlw                           m5, 15
-  paddw                           m1, m5
-  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
-%endif
-  mova                            m3, [r2q]                ; m3 = dequant
-  mov                             r3, qcoeffmp
-  mov                             r4, dqcoeffmp
-  mov                             r5, iscanmp
-%ifidn %1, fp_32x32
-  psllw                           m2, 1
-%endif
-  pxor                            m5, m5                   ; m5 = dedicated zero
-
-  INCREMENT_ELEMENTS_TRAN_LOW coeffq, ncoeffq
-  lea                            r5q, [r5q+ncoeffq*2]
-  INCREMENT_ELEMENTS_TRAN_LOW    r3q, ncoeffq
-  INCREMENT_ELEMENTS_TRAN_LOW    r4q, ncoeffq
-  neg                        ncoeffq
-
-  ; get DC and first 15 AC coeffs
-  LOAD_TRAN_LOW  9, coeffq, ncoeffq                        ; m9 = c[i]
-  LOAD_TRAN_LOW 10, coeffq, ncoeffq + 8                    ; m10 = c[i]
-  pabsw                           m6, m9                   ; m6 = abs(m9)
-  pabsw                          m11, m10                  ; m11 = abs(m10)
-  pcmpeqw                         m7, m7
-
-  paddsw                          m6, m1                   ; m6 += round
-  punpckhqdq                      m1, m1
-  paddsw                         m11, m1                   ; m11 += round
-  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
-  punpckhqdq                      m2, m2
-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
-  psignw                          m8, m9                   ; m8 = reinsert sign
-  psignw                         m13, m10                  ; m13 = reinsert sign
-  STORE_TRAN_LOW  8, r3q, ncoeffq,     6, 11, 12
-  STORE_TRAN_LOW 13, r3q, ncoeffq + 8, 6, 11, 12
-%ifidn %1, fp_32x32
-  pabsw                           m8, m8
-  pabsw                          m13, m13
-%endif
-  pmullw                          m8, m3                   ; r4[i] = r3[i] * q
-  punpckhqdq                      m3, m3
-  pmullw                         m13, m3                   ; r4[i] = r3[i] * q
-%ifidn %1, fp_32x32
-  psrlw                           m8, 1
-  psrlw                          m13, 1
-  psignw                          m8, m9
-  psignw                         m13, m10
-  psrlw                           m0, m3, 2
-%else
-  psrlw                           m0, m3, 1
-%endif
-  STORE_TRAN_LOW  8, r4q, ncoeffq,     6, 11, 12
-  STORE_TRAN_LOW 13, r4q, ncoeffq + 8, 6, 11, 12
-  pcmpeqw                         m8, m5                   ; m8 = c[i] == 0
-  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
-  mova                            m6, [  r5q+ncoeffq*2+ 0] ; m6 = scan[i]
-  mova                           m11, [  r5q+ncoeffq*2+16] ; m11 = scan[i]
-  psubw                           m6, m7                   ; m6 = scan[i] + 1
-  psubw                          m11, m7                   ; m11 = scan[i] + 1
-  pandn                           m8, m6                   ; m8 = max(eob)
-  pandn                          m13, m11                  ; m13 = max(eob)
-  pmaxsw                          m8, m13
-  add                        ncoeffq, mmsize
-  jz .accumulate_eob
-
-.ac_only_loop:
-  LOAD_TRAN_LOW  9, coeffq, ncoeffq                        ; m9 = c[i]
-  LOAD_TRAN_LOW 10, coeffq, ncoeffq + 8                    ; m10 = c[i]
-  pabsw                           m6, m9                   ; m6 = abs(m9)
-  pabsw                          m11, m10                  ; m11 = abs(m10)
-
-  pcmpgtw                         m7, m6,  m0
-  pcmpgtw                        m12, m11, m0
-  pmovmskb                       r6d, m7
-  pmovmskb                       r2d, m12
-
-  or                              r6, r2
-  jz .skip_iter
-
-  pcmpeqw                         m7, m7
-
-  paddsw                          m6, m1                   ; m6 += round
-  paddsw                         m11, m1                   ; m11 += round
-  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
-  psignw                         m14, m9                   ; m14 = reinsert sign
-  psignw                         m13, m10                  ; m13 = reinsert sign
-  STORE_TRAN_LOW 14, r3q, ncoeffq,     6, 11, 12
-  STORE_TRAN_LOW 13, r3q, ncoeffq + 8, 6, 11, 12
-%ifidn %1, fp_32x32
-  pabsw                          m14, m14
-  pabsw                          m13, m13
-%endif
-  pmullw                         m14, m3                   ; r4[i] = r3[i] * q
-  pmullw                         m13, m3                   ; r4[i] = r3[i] * q
-%ifidn %1, fp_32x32
-  psrlw                          m14, 1
-  psrlw                          m13, 1
-  psignw                         m14, m9
-  psignw                         m13, m10
-%endif
-  STORE_TRAN_LOW 14, r4q, ncoeffq,     6, 11, 12
-  STORE_TRAN_LOW 13, r4q, ncoeffq + 8, 6, 11, 12
-  pcmpeqw                        m14, m5                   ; m14 = c[i] == 0
-  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
-  mova                            m6, [  r5q+ncoeffq*2+ 0] ; m6 = scan[i]
-  mova                           m11, [  r5q+ncoeffq*2+16] ; m11 = scan[i]
-  psubw                           m6, m7                   ; m6 = scan[i] + 1
-  psubw                          m11, m7                   ; m11 = scan[i] + 1
-  pandn                          m14, m6                   ; m14 = max(eob)
-  pandn                          m13, m11                  ; m13 = max(eob)
-  pmaxsw                          m8, m14
-  pmaxsw                          m8, m13
-  add                        ncoeffq, mmsize
-  jl .ac_only_loop
-
-  jmp .accumulate_eob
-.skip_iter:
-  STORE_ZERO_TRAN_LOW 5, r3q, ncoeffq
-  STORE_ZERO_TRAN_LOW 5, r3q, ncoeffq + 8
-  STORE_ZERO_TRAN_LOW 5, r4q, ncoeffq
-  STORE_ZERO_TRAN_LOW 5, r4q, ncoeffq + 8
-  add                        ncoeffq, mmsize
-  jl .ac_only_loop
-
-.accumulate_eob:
-  ; horizontally accumulate/max eobs and write into [eob] memory pointer
-  mov                             r2, eobmp
-  pshufd                          m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0x1
-  pmaxsw                          m8, m7
-  pextrw                          r6, m8, 0
-  mov                           [r2], r6w
-  RET
-%endmacro
-
-INIT_XMM ssse3
-QUANTIZE_FP fp, 7
-QUANTIZE_FP fp_32x32, 7
diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc
index 6238b3a955..fd81bce7b5 100644
--- a/vp9/ratectrl_rtc.cc
+++ b/vp9/ratectrl_rtc.cc
@@ -11,6 +11,7 @@
 
 #include <new>
 
+#include "vp9/common/vp9_common.h"
 #include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_picklpf.h"
 #include "vpx/vp8cx.h"
@@ -24,27 +25,65 @@ std::unique_ptr<VP9RateControlRTC> VP9RateControlRTC::Create(
                                                 VP9RateControlRTC());
   if (!rc_api) return nullptr;
   rc_api->cpi_ = static_cast<VP9_COMP *>(vpx_memalign(32, sizeof(*cpi_)));
-  if (rc_api->cpi_ == nullptr) {
-    return nullptr;
+  if (!rc_api->cpi_) return nullptr;
+  vp9_zero(*rc_api->cpi_);
+
+  if (!rc_api->InitRateControl(cfg)) return nullptr;
+  if (cfg.aq_mode) {
+    VP9_COMP *const cpi = rc_api->cpi_;
+    cpi->segmentation_map = static_cast<uint8_t *>(
+        vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols,
+                   sizeof(*cpi->segmentation_map)));
+    if (!cpi->segmentation_map) return nullptr;
+    cpi->cyclic_refresh =
+        vp9_cyclic_refresh_alloc(cpi->common.mi_rows, cpi->common.mi_cols);
+    cpi->cyclic_refresh->content_mode = 0;
   }
-  rc_api->InitRateControl(cfg);
   return rc_api;
 }
 
-void VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) {
+VP9RateControlRTC::~VP9RateControlRTC() {
+  if (cpi_) {
+    if (cpi_->svc.number_spatial_layers > 1 ||
+        cpi_->svc.number_temporal_layers > 1) {
+      for (int sl = 0; sl < cpi_->svc.number_spatial_layers; sl++) {
+        for (int tl = 0; tl < cpi_->svc.number_temporal_layers; tl++) {
+          int layer = LAYER_IDS_TO_IDX(sl, tl, cpi_->oxcf.ts_number_layers);
+          LAYER_CONTEXT *const lc = &cpi_->svc.layer_context[layer];
+          vpx_free(lc->map);
+          vpx_free(lc->last_coded_q_map);
+          vpx_free(lc->consec_zero_mv);
+        }
+      }
+    }
+    if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+      vpx_free(cpi_->segmentation_map);
+      cpi_->segmentation_map = NULL;
+      vp9_cyclic_refresh_free(cpi_->cyclic_refresh);
+    }
+    vpx_free(cpi_);
+  }
+}
+
+bool VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) {
   VP9_COMMON *cm = &cpi_->common;
   VP9EncoderConfig *oxcf = &cpi_->oxcf;
   RATE_CONTROL *const rc = &cpi_->rc;
   cm->profile = PROFILE_0;
   cm->bit_depth = VPX_BITS_8;
   cm->show_frame = 1;
-  oxcf->rc_mode = VPX_CBR;
+  oxcf->profile = cm->profile;
+  oxcf->bit_depth = cm->bit_depth;
+  oxcf->rc_mode = rc_cfg.rc_mode;
   oxcf->pass = 0;
-  oxcf->aq_mode = NO_AQ;
+  oxcf->aq_mode = rc_cfg.aq_mode ? CYCLIC_REFRESH_AQ : NO_AQ;
   oxcf->content = VP9E_CONTENT_DEFAULT;
   oxcf->drop_frames_water_mark = 0;
+  cm->current_video_frame = 0;
+  rc->kf_boost = DEFAULT_KF_BOOST;
 
-  UpdateRateControl(rc_cfg);
+  if (!UpdateRateControl(rc_cfg)) return false;
+  vp9_set_mb_mi(cm, cm->width, cm->height);
 
   cpi_->use_svc = (cpi_->svc.number_spatial_layers > 1 ||
                    cpi_->svc.number_temporal_layers > 1)
@@ -55,12 +94,23 @@ void VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) {
   rc->rc_2_frame = 0;
   vp9_rc_init_minq_luts();
   vp9_rc_init(oxcf, 0, rc);
+  rc->constrain_gf_key_freq_onepass_vbr = 0;
   cpi_->sf.use_nonrd_pick_mode = 1;
-  cm->current_video_frame = 0;
+  return true;
 }
 
-void VP9RateControlRTC::UpdateRateControl(
+bool VP9RateControlRTC::UpdateRateControl(
     const VP9RateControlRtcConfig &rc_cfg) {
+  // Since VPX_MAX_LAYERS (12) is less than the product of VPX_SS_MAX_LAYERS (5)
+  // and VPX_TS_MAX_LAYERS (5), check all three.
+  if (rc_cfg.ss_number_layers < 1 ||
+      rc_cfg.ss_number_layers > VPX_SS_MAX_LAYERS ||
+      rc_cfg.ts_number_layers < 1 ||
+      rc_cfg.ts_number_layers > VPX_TS_MAX_LAYERS ||
+      rc_cfg.ss_number_layers * rc_cfg.ts_number_layers > VPX_MAX_LAYERS) {
+    return false;
+  }
+
   VP9_COMMON *cm = &cpi_->common;
   VP9EncoderConfig *oxcf = &cpi_->oxcf;
   RATE_CONTROL *const rc = &cpi_->rc;
@@ -73,22 +123,38 @@ void VP9RateControlRTC::UpdateRateControl(
   oxcf->best_allowed_q = vp9_quantizer_to_qindex(rc_cfg.min_quantizer);
   rc->worst_quality = oxcf->worst_allowed_q;
   rc->best_quality = oxcf->best_allowed_q;
+  oxcf->init_framerate = rc_cfg.framerate;
   oxcf->target_bandwidth = 1000 * rc_cfg.target_bandwidth;
   oxcf->starting_buffer_level_ms = rc_cfg.buf_initial_sz;
   oxcf->optimal_buffer_level_ms = rc_cfg.buf_optimal_sz;
   oxcf->maximum_buffer_size_ms = rc_cfg.buf_sz;
   oxcf->under_shoot_pct = rc_cfg.undershoot_pct;
   oxcf->over_shoot_pct = rc_cfg.overshoot_pct;
+  oxcf->drop_frames_water_mark = rc_cfg.frame_drop_thresh;
+  oxcf->content = rc_cfg.is_screen ? VP9E_CONTENT_SCREEN : VP9E_CONTENT_DEFAULT;
   oxcf->ss_number_layers = rc_cfg.ss_number_layers;
   oxcf->ts_number_layers = rc_cfg.ts_number_layers;
   oxcf->temporal_layering_mode = (VP9E_TEMPORAL_LAYERING_MODE)(
       (rc_cfg.ts_number_layers > 1) ? rc_cfg.ts_number_layers : 0);
 
   cpi_->oxcf.rc_max_intra_bitrate_pct = rc_cfg.max_intra_bitrate_pct;
+  cpi_->oxcf.rc_max_inter_bitrate_pct = rc_cfg.max_inter_bitrate_pct;
   cpi_->framerate = rc_cfg.framerate;
   cpi_->svc.number_spatial_layers = rc_cfg.ss_number_layers;
   cpi_->svc.number_temporal_layers = rc_cfg.ts_number_layers;
 
+  vp9_set_mb_mi(cm, cm->width, cm->height);
+
+  if (setjmp(cpi_->common.error.jmp)) {
+    cpi_->common.error.setjmp = 0;
+    vpx_clear_system_state();
+    return false;
+  }
+  cpi_->common.error.setjmp = 1;
+
+  for (int tl = 0; tl < cpi_->svc.number_temporal_layers; ++tl) {
+    oxcf->ts_rate_decimator[tl] = rc_cfg.ts_rate_decimator[tl];
+  }
   for (int sl = 0; sl < cpi_->svc.number_spatial_layers; ++sl) {
     for (int tl = 0; tl < cpi_->svc.number_temporal_layers; ++tl) {
       const int layer =
@@ -102,21 +168,33 @@ void VP9RateControlRTC::UpdateRateControl(
       lrc->best_quality = vp9_quantizer_to_qindex(rc_cfg.min_quantizers[layer]);
       lc->scaling_factor_num = rc_cfg.scaling_factor_num[sl];
       lc->scaling_factor_den = rc_cfg.scaling_factor_den[sl];
-      oxcf->ts_rate_decimator[tl] = rc_cfg.ts_rate_decimator[tl];
     }
   }
   vp9_set_rc_buffer_sizes(cpi_);
   vp9_new_framerate(cpi_, cpi_->framerate);
   if (cpi_->svc.number_temporal_layers > 1 ||
       cpi_->svc.number_spatial_layers > 1) {
-    if (cm->current_video_frame == 0) vp9_init_layer_context(cpi_);
+    if (cm->current_video_frame == 0) {
+      vp9_init_layer_context(cpi_);
+      // svc->framedrop_mode is not currently exposed, so only allow for
+      // full superframe drop for now.
+      cpi_->svc.framedrop_mode = FULL_SUPERFRAME_DROP;
+    }
     vp9_update_layer_context_change_config(cpi_,
                                            (int)cpi_->oxcf.target_bandwidth);
+    cpi_->svc.max_consec_drop = rc_cfg.max_consec_drop;
   }
   vp9_check_reset_rc_flag(cpi_);
+
+  cpi_->common.error.setjmp = 0;
+  return true;
 }
 
-void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) {
+// Compute the QP for the frame. If the frame is dropped this function
+// returns kDrop, and no QP is computed. If the frame is encoded (not dropped)
+// the QP is computed and kOk is returned.
+FrameDropDecision VP9RateControlRTC::ComputeQP(
+    const VP9FrameParamsQpRTC &frame_params) {
   VP9_COMMON *const cm = &cpi_->common;
   int width, height;
   cpi_->svc.spatial_layer_id = frame_params.spatial_layer_id;
@@ -133,16 +211,34 @@ void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) {
     cm->height = height;
   }
   vp9_set_mb_mi(cm, cm->width, cm->height);
-  cm->frame_type = frame_params.frame_type;
+  cm->frame_type = static_cast<FRAME_TYPE>(frame_params.frame_type);
+  // This is needed to ensure key frame does not get unset in rc_get_svc_params.
+  cpi_->frame_flags = (cm->frame_type == KEY_FRAME) ? FRAMEFLAGS_KEY : 0;
   cpi_->refresh_golden_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0;
   cpi_->sf.use_nonrd_pick_mode = 1;
   if (cpi_->svc.number_spatial_layers == 1 &&
       cpi_->svc.number_temporal_layers == 1) {
-    int target;
-    if (frame_is_intra_only(cm))
-      target = vp9_calc_iframe_target_size_one_pass_cbr(cpi_);
-    else
-      target = vp9_calc_pframe_target_size_one_pass_cbr(cpi_);
+    int target = 0;
+    if (cpi_->oxcf.rc_mode == VPX_CBR) {
+      if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+        vp9_cyclic_refresh_update_parameters(cpi_);
+      if (frame_is_intra_only(cm))
+        target = vp9_calc_iframe_target_size_one_pass_cbr(cpi_);
+      else
+        target = vp9_calc_pframe_target_size_one_pass_cbr(cpi_);
+    } else if (cpi_->oxcf.rc_mode == VPX_VBR) {
+      if (cm->frame_type == KEY_FRAME) {
+        cpi_->rc.this_key_frame_forced = cm->current_video_frame != 0;
+        cpi_->rc.frames_to_key = cpi_->oxcf.key_freq;
+      }
+      vp9_set_gf_update_one_pass_vbr(cpi_);
+      if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+        vp9_cyclic_refresh_update_parameters(cpi_);
+      if (frame_is_intra_only(cm))
+        target = vp9_calc_iframe_target_size_one_pass_vbr(cpi_);
+      else
+        target = vp9_calc_pframe_target_size_one_pass_vbr(cpi_);
+    }
     vp9_rc_set_frame_target(cpi_, target);
     vp9_update_buffer_level_preencode(cpi_);
   } else {
@@ -150,20 +246,98 @@ void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) {
     vp9_restore_layer_context(cpi_);
     vp9_rc_get_svc_params(cpi_);
   }
+  if (cpi_->svc.spatial_layer_id == 0) vp9_zero(cpi_->svc.drop_spatial_layer);
+  // SVC: check for skip encoding of enhancement layer if the
+  // layer target bandwidth = 0.
+  if (vp9_svc_check_skip_enhancement_layer(cpi_))
+    return FrameDropDecision::kDrop;
+  // Check for dropping this frame based on buffer level.
+  // Never drop on key frame, or if base layer is key for svc,
+  if (!frame_is_intra_only(cm) &&
+      (!cpi_->use_svc ||
+       !cpi_->svc.layer_context[cpi_->svc.temporal_layer_id].is_key_frame)) {
+    if (vp9_rc_drop_frame(cpi_)) {
+      // For FULL_SUPERFRAME_DROP mode (the only mode considered here):
+      // if the superframe drop is decided we need to save the layer context for
+      // all spatial layers, and call update_buffer_level and postencode_drop
+      // for all spatial layers.
+      if (cpi_->svc.number_spatial_layers > 1 ||
+          cpi_->svc.number_temporal_layers > 1) {
+        vp9_save_layer_context(cpi_);
+        for (int sl = 1; sl < cpi_->svc.number_spatial_layers; sl++) {
+          cpi_->svc.spatial_layer_id = sl;
+          vp9_restore_layer_context(cpi_);
+          vp9_update_buffer_level_svc_preencode(cpi_);
+          vp9_rc_postencode_update_drop_frame(cpi_);
+          vp9_save_layer_context(cpi_);
+        }
+      }
+      return FrameDropDecision::kDrop;
+    }
+  }
+  // Compute the QP for the frame.
   int bottom_index, top_index;
   cpi_->common.base_qindex =
       vp9_rc_pick_q_and_bounds(cpi_, &bottom_index, &top_index);
+
+  if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_setup(cpi_);
+  if (cpi_->svc.number_spatial_layers > 1 ||
+      cpi_->svc.number_temporal_layers > 1)
+    vp9_save_layer_context(cpi_);
+
+  cpi_->last_frame_dropped = 0;
+  cpi_->svc.last_layer_dropped[cpi_->svc.spatial_layer_id] = 0;
+  if (cpi_->svc.spatial_layer_id == cpi_->svc.number_spatial_layers - 1)
+    cpi_->svc.num_encoded_top_layer++;
+
+  return FrameDropDecision::kOk;
 }
 
 int VP9RateControlRTC::GetQP() const { return cpi_->common.base_qindex; }
 
 int VP9RateControlRTC::GetLoopfilterLevel() const {
   struct loopfilter *const lf = &cpi_->common.lf;
-  vp9_pick_filter_level(NULL, cpi_, LPF_PICK_FROM_Q);
+  vp9_pick_filter_level(nullptr, cpi_, LPF_PICK_FROM_Q);
   return lf->filter_level;
 }
 
-void VP9RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) {
+bool VP9RateControlRTC::GetSegmentationData(
+    VP9SegmentationData *segmentation_data) const {
+  if (!cpi_->cyclic_refresh->apply_cyclic_refresh) return false;
+
+  segmentation_data->segmentation_map = cpi_->segmentation_map;
+  segmentation_data->segmentation_map_size =
+      cpi_->common.mi_cols * cpi_->common.mi_rows;
+  segmentation_data->delta_q = cpi_->cyclic_refresh->qindex_delta;
+  segmentation_data->delta_q_size = 3u;
+  return true;
+}
+
+void VP9RateControlRTC::PostEncodeUpdate(
+    uint64_t encoded_frame_size, const VP9FrameParamsQpRTC &frame_params) {
+  cpi_->common.frame_type = static_cast<FRAME_TYPE>(frame_params.frame_type);
+  cpi_->svc.spatial_layer_id = frame_params.spatial_layer_id;
+  cpi_->svc.temporal_layer_id = frame_params.temporal_layer_id;
+  if (cpi_->svc.number_spatial_layers > 1 ||
+      cpi_->svc.number_temporal_layers > 1) {
+    vp9_restore_layer_context(cpi_);
+    const int layer = LAYER_IDS_TO_IDX(cpi_->svc.spatial_layer_id,
+                                       cpi_->svc.temporal_layer_id,
+                                       cpi_->svc.number_temporal_layers);
+    LAYER_CONTEXT *lc = &cpi_->svc.layer_context[layer];
+    cpi_->common.base_qindex = lc->frame_qp;
+    cpi_->common.MBs = lc->MBs;
+    // For spatial-svc, allow cyclic-refresh to be applied on the spatial
+    // layers, for the base temporal layer.
+    if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+        cpi_->svc.number_spatial_layers > 1 &&
+        cpi_->svc.temporal_layer_id == 0) {
+      CYCLIC_REFRESH *const cr = cpi_->cyclic_refresh;
+      cr->qindex_delta[0] = lc->qindex_delta[0];
+      cr->qindex_delta[1] = lc->qindex_delta[1];
+      cr->qindex_delta[2] = lc->qindex_delta[2];
+    }
+  }
   vp9_rc_postencode_update(cpi_, encoded_frame_size);
   if (cpi_->svc.number_spatial_layers > 1 ||
       cpi_->svc.number_temporal_layers > 1)
diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h
index 72ea40fd68..85005c5474 100644
--- a/vp9/ratectrl_rtc.h
+++ b/vp9/ratectrl_rtc.h
@@ -14,101 +14,100 @@
 #include <cstdint>
 #include <memory>
 
-#include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_enums.h"
-#include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/vp9_iface_common.h"
-#include "vp9/encoder/vp9_encoder.h"
-#include "vp9/encoder/vp9_firstpass.h"
+#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
 #include "vp9/vp9_cx_iface.h"
+#include "vpx/internal/vpx_ratectrl_rtc.h"
 #include "vpx_mem/vpx_mem.h"
 
+struct VP9_COMP;
+
 namespace libvpx {
+struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig {
+ public:
+  VP9RateControlRtcConfig() {
+    ss_number_layers = 1;
+    vp9_zero(max_quantizers);
+    vp9_zero(min_quantizers);
+    vp9_zero(scaling_factor_den);
+    vp9_zero(scaling_factor_num);
+    vp9_zero(layer_target_bitrate);
+    vp9_zero(ts_rate_decimator);
+    scaling_factor_num[0] = 1;
+    scaling_factor_den[0] = 1;
+    max_quantizers[0] = max_quantizer;
+    min_quantizers[0] = min_quantizer;
+    max_consec_drop = INT_MAX;
+  }
 
-struct VP9RateControlRtcConfig {
-  int width;
-  int height;
-  // 0-63
-  int max_quantizer;
-  int min_quantizer;
-  int64_t target_bandwidth;
-  int64_t buf_initial_sz;
-  int64_t buf_optimal_sz;
-  int64_t buf_sz;
-  int undershoot_pct;
-  int overshoot_pct;
-  int max_intra_bitrate_pct;
-  double framerate;
   // Number of spatial layers
   int ss_number_layers;
-  // Number of temporal layers
-  int ts_number_layers;
   int max_quantizers[VPX_MAX_LAYERS];
   int min_quantizers[VPX_MAX_LAYERS];
   int scaling_factor_num[VPX_SS_MAX_LAYERS];
   int scaling_factor_den[VPX_SS_MAX_LAYERS];
-  int layer_target_bitrate[VPX_MAX_LAYERS];
-  int ts_rate_decimator[VPX_TS_MAX_LAYERS];
+  // This is only for SVC for now.
+  int max_consec_drop;
 };
 
 struct VP9FrameParamsQpRTC {
-  FRAME_TYPE frame_type;
+  RcFrameType frame_type;
   int spatial_layer_id;
   int temporal_layer_id;
 };
 
+struct VP9SegmentationData {
+  const uint8_t *segmentation_map;
+  size_t segmentation_map_size;
+  const int *delta_q;
+  size_t delta_q_size;
+};
+
 // This interface allows using VP9 real-time rate control without initializing
-// the encoder. To use this interface, you need to link with libvp9rc.a.
+// the encoder. To use this interface, you need to link with libvpxrc.a.
 //
 // #include "vp9/ratectrl_rtc.h"
-// VP9RateControlRTC rc_api;
 // VP9RateControlRtcConfig cfg;
 // VP9FrameParamsQpRTC frame_params;
 //
 // YourFunctionToInitializeConfig(cfg);
-// rc_api.InitRateControl(cfg);
+// std::unique_ptr<VP9RateControlRTC> rc_api = VP9RateControlRTC::Create(cfg);
 // // start encoding
 // while (frame_to_encode) {
 //   if (config_changed)
-//     rc_api.UpdateRateControl(cfg);
+//     rc_api->UpdateRateControl(cfg);
 //   YourFunctionToFillFrameParams(frame_params);
-//   rc_api.ComputeQP(frame_params);
-//   YourFunctionToUseQP(rc_api.GetQP());
-//   YourFunctionToUseLoopfilter(rc_api.GetLoopfilterLevel());
+//   rc_api->ComputeQP(frame_params);
+//   YourFunctionToUseQP(rc_api->GetQP());
+//   YourFunctionToUseLoopfilter(rc_api->GetLoopfilterLevel());
 //   // After encoding
-//   rc_api.PostEncode(encoded_frame_size);
+//   rc_api->PostEncode(encoded_frame_size, frame_params);
 // }
 class VP9RateControlRTC {
  public:
   static std::unique_ptr<VP9RateControlRTC> Create(
       const VP9RateControlRtcConfig &cfg);
-  ~VP9RateControlRTC() {
-    if (cpi_) {
-      for (int sl = 0; sl < cpi_->svc.number_spatial_layers; sl++) {
-        for (int tl = 0; tl < cpi_->svc.number_temporal_layers; tl++) {
-          int layer = LAYER_IDS_TO_IDX(sl, tl, cpi_->oxcf.ts_number_layers);
-          LAYER_CONTEXT *const lc = &cpi_->svc.layer_context[layer];
-          vpx_free(lc->map);
-          vpx_free(lc->last_coded_q_map);
-          vpx_free(lc->consec_zero_mv);
-        }
-      }
-      vpx_free(cpi_);
-    }
-  }
+  ~VP9RateControlRTC();
 
-  void UpdateRateControl(const VP9RateControlRtcConfig &rc_cfg);
+  bool UpdateRateControl(const VP9RateControlRtcConfig &rc_cfg);
   // GetQP() needs to be called after ComputeQP() to get the latest QP
   int GetQP() const;
   int GetLoopfilterLevel() const;
-  void ComputeQP(const VP9FrameParamsQpRTC &frame_params);
+  bool GetSegmentationData(VP9SegmentationData *segmentation_data) const;
+  // ComputeQP computes the QP if the frame is not dropped (kOk return),
+  // otherwise it returns kDrop and subsequent GetQP and PostEncodeUpdate
+  // are not to be called (vp9_rc_postencode_update_drop_frame is already
+  // called via ComputeQP if drop is decided).
+  FrameDropDecision ComputeQP(const VP9FrameParamsQpRTC &frame_params);
   // Feedback to rate control with the size of current encoded frame
-  void PostEncodeUpdate(uint64_t encoded_frame_size);
+  void PostEncodeUpdate(uint64_t encoded_frame_size,
+                        const VP9FrameParamsQpRTC &frame_params);
 
  private:
   VP9RateControlRTC() {}
-  void InitRateControl(const VP9RateControlRtcConfig &cfg);
-  VP9_COMP *cpi_;
+  bool InitRateControl(const VP9RateControlRtcConfig &cfg);
+  struct VP9_COMP *cpi_;
 };
 
 }  // namespace libvpx
diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc
index 9d62d26e1d..2e6f9a4513 100644
--- a/vp9/simple_encode.cc
+++ b/vp9/simple_encode.cc
@@ -90,18 +90,27 @@ static int img_read(vpx_image_t *img, FILE *file) {
   return 1;
 }
 
+// Assume every config in VP9EncoderConfig is less than 100 characters.
+#define ENCODE_CONFIG_BUF_SIZE 100
+struct EncodeConfig {
+  char name[ENCODE_CONFIG_BUF_SIZE];
+  char value[ENCODE_CONFIG_BUF_SIZE];
+};
+
 class SimpleEncode::EncodeImpl {
  public:
   VP9_COMP *cpi;
   vpx_img_fmt_t img_fmt;
   vpx_image_t tmp_img;
   std::vector<FIRSTPASS_STATS> first_pass_stats;
+  std::vector<EncodeConfig> encode_config_list;
 };
 
 static VP9_COMP *init_encoder(const VP9EncoderConfig *oxcf,
                               vpx_img_fmt_t img_fmt) {
   VP9_COMP *cpi;
   BufferPool *buffer_pool = (BufferPool *)vpx_calloc(1, sizeof(*buffer_pool));
+  if (!buffer_pool) return NULL;
   vp9_initialize_enc();
   cpi = vp9_create_compressor(oxcf, buffer_pool);
   vp9_update_compressor_with_img_fmt(cpi, img_fmt);
@@ -134,7 +143,6 @@ get_frame_type_from_update_type(FRAME_UPDATE_TYPE update_type) {
     default:
       fprintf(stderr, "Unsupported update_type %d\n", update_type);
       abort();
-      return kFrameTypeInter;
   }
 }
 
@@ -167,16 +175,18 @@ static RefFrameType mv_ref_frame_to_ref_frame_type(
 
 static void update_motion_vector_info(
     const MOTION_VECTOR_INFO *input_motion_vector_info, const int num_rows_4x4,
-    const int num_cols_4x4, MotionVectorInfo *output_motion_vector_info) {
+    const int num_cols_4x4, MotionVectorInfo *output_motion_vector_info,
+    int motion_vector_scale) {
   const int num_units_4x4 = num_rows_4x4 * num_cols_4x4;
   for (int i = 0; i < num_units_4x4; ++i) {
     const MV_REFERENCE_FRAME *in_ref_frame =
         input_motion_vector_info[i].ref_frame;
     output_motion_vector_info[i].mv_count =
-        (in_ref_frame[0] == INTRA_FRAME) ? 0
-                                         : ((in_ref_frame[1] == NONE) ? 1 : 2);
-    if (in_ref_frame[0] == NONE) {
-      fprintf(stderr, "in_ref_frame[0] shouldn't be NONE\n");
+        (in_ref_frame[0] == INTRA_FRAME)
+            ? 0
+            : ((in_ref_frame[1] == NO_REF_FRAME) ? 1 : 2);
+    if (in_ref_frame[0] == NO_REF_FRAME) {
+      fprintf(stderr, "in_ref_frame[0] shouldn't be NO_REF_FRAME\n");
       abort();
     }
     output_motion_vector_info[i].ref_frame[0] =
@@ -185,16 +195,34 @@ static void update_motion_vector_info(
         mv_ref_frame_to_ref_frame_type(in_ref_frame[1]);
     output_motion_vector_info[i].mv_row[0] =
         (double)input_motion_vector_info[i].mv[0].as_mv.row /
-        kMotionVectorPrecision;
+        motion_vector_scale;
     output_motion_vector_info[i].mv_column[0] =
         (double)input_motion_vector_info[i].mv[0].as_mv.col /
-        kMotionVectorPrecision;
+        motion_vector_scale;
     output_motion_vector_info[i].mv_row[1] =
         (double)input_motion_vector_info[i].mv[1].as_mv.row /
-        kMotionVectorPrecision;
+        motion_vector_scale;
     output_motion_vector_info[i].mv_column[1] =
         (double)input_motion_vector_info[i].mv[1].as_mv.col /
-        kMotionVectorPrecision;
+        motion_vector_scale;
+  }
+}
+
+static void update_tpl_stats_info(const TplDepStats *input_tpl_stats_info,
+                                  const int show_frame_count,
+                                  TplStatsInfo *output_tpl_stats_info) {
+  int frame_idx;
+  for (frame_idx = 0; frame_idx < show_frame_count; ++frame_idx) {
+    output_tpl_stats_info[frame_idx].intra_cost =
+        input_tpl_stats_info[frame_idx].intra_cost;
+    output_tpl_stats_info[frame_idx].inter_cost =
+        input_tpl_stats_info[frame_idx].inter_cost;
+    output_tpl_stats_info[frame_idx].mc_flow =
+        input_tpl_stats_info[frame_idx].mc_flow;
+    output_tpl_stats_info[frame_idx].mc_dep_cost =
+        input_tpl_stats_info[frame_idx].mc_dep_cost;
+    output_tpl_stats_info[frame_idx].mc_ref_cost =
+        input_tpl_stats_info[frame_idx].mc_ref_cost;
   }
 }
 
@@ -471,12 +499,13 @@ static bool init_encode_frame_result(EncodeFrameResult *encode_frame_result,
   encode_frame_result->coding_data.reset(
       new (std::nothrow) uint8_t[max_coding_data_byte_size]);
 
-  encode_frame_result->num_rows_4x4 = get_num_unit_4x4(frame_width);
-  encode_frame_result->num_cols_4x4 = get_num_unit_4x4(frame_height);
+  encode_frame_result->num_rows_4x4 = get_num_unit_4x4(frame_height);
+  encode_frame_result->num_cols_4x4 = get_num_unit_4x4(frame_width);
   encode_frame_result->partition_info.resize(encode_frame_result->num_rows_4x4 *
                                              encode_frame_result->num_cols_4x4);
   encode_frame_result->motion_vector_info.resize(
       encode_frame_result->num_rows_4x4 * encode_frame_result->num_cols_4x4);
+  encode_frame_result->tpl_stats_info.resize(MAX_LAG_BUFFERS);
 
   if (encode_frame_result->coding_data.get() == nullptr) {
     return false;
@@ -485,8 +514,20 @@ static bool init_encode_frame_result(EncodeFrameResult *encode_frame_result,
                            frame_height, img_fmt);
 }
 
+static void encode_frame_result_update_rq_history(
+    const RATE_QINDEX_HISTORY *rq_history,
+    EncodeFrameResult *encode_frame_result) {
+  encode_frame_result->recode_count = rq_history->recode_count;
+  for (int i = 0; i < encode_frame_result->recode_count; ++i) {
+    const int q_index = rq_history->q_index_history[i];
+    const int rate = rq_history->rate_history[i];
+    encode_frame_result->q_index_history.push_back(q_index);
+    encode_frame_result->rate_history.push_back(rate);
+  }
+}
+
 static void update_encode_frame_result(
-    EncodeFrameResult *encode_frame_result,
+    EncodeFrameResult *encode_frame_result, const int show_frame_count,
     const ENCODE_FRAME_RESULT *encode_frame_info) {
   encode_frame_result->coding_data_bit_size =
       encode_frame_result->coding_data_byte_size * 8;
@@ -511,9 +552,16 @@ static void update_encode_frame_result(
   update_motion_vector_info(encode_frame_info->motion_vector_info,
                             encode_frame_result->num_rows_4x4,
                             encode_frame_result->num_cols_4x4,
-                            &encode_frame_result->motion_vector_info[0]);
+                            &encode_frame_result->motion_vector_info[0],
+                            kMotionVectorSubPixelPrecision);
   update_frame_counts(&encode_frame_info->frame_counts,
                       &encode_frame_result->frame_counts);
+  if (encode_frame_result->frame_type == kFrameTypeAltRef) {
+    update_tpl_stats_info(encode_frame_info->tpl_stats_info, show_frame_count,
+                          &encode_frame_result->tpl_stats_info[0]);
+  }
+  encode_frame_result_update_rq_history(&encode_frame_info->rq_history,
+                                        encode_frame_result);
 }
 
 static void IncreaseGroupOfPictureIndex(GroupOfPicture *group_of_picture) {
@@ -695,9 +743,67 @@ static void UpdateGroupOfPicture(const VP9_COMP *cpi, int start_coding_index,
                     start_ref_frame_info, group_of_picture);
 }
 
+#define SET_STRUCT_VALUE(config, structure, ret, field) \
+  do {                                                  \
+    if (strcmp(config.name, #field) == 0) {             \
+      structure->field = atoi(config.value);            \
+      ret = 1;                                          \
+    }                                                   \
+  } while (false)
+
+static void UpdateEncodeConfig(const EncodeConfig &config,
+                               VP9EncoderConfig *oxcf) {
+  int ret = 0;
+  SET_STRUCT_VALUE(config, oxcf, ret, key_freq);
+  SET_STRUCT_VALUE(config, oxcf, ret, two_pass_vbrmin_section);
+  SET_STRUCT_VALUE(config, oxcf, ret, two_pass_vbrmax_section);
+  SET_STRUCT_VALUE(config, oxcf, ret, under_shoot_pct);
+  SET_STRUCT_VALUE(config, oxcf, ret, over_shoot_pct);
+  SET_STRUCT_VALUE(config, oxcf, ret, max_threads);
+  SET_STRUCT_VALUE(config, oxcf, ret, frame_parallel_decoding_mode);
+  SET_STRUCT_VALUE(config, oxcf, ret, tile_columns);
+  SET_STRUCT_VALUE(config, oxcf, ret, arnr_max_frames);
+  SET_STRUCT_VALUE(config, oxcf, ret, arnr_strength);
+  SET_STRUCT_VALUE(config, oxcf, ret, lag_in_frames);
+  SET_STRUCT_VALUE(config, oxcf, ret, encode_breakout);
+  SET_STRUCT_VALUE(config, oxcf, ret, enable_tpl_model);
+  SET_STRUCT_VALUE(config, oxcf, ret, enable_auto_arf);
+  if (strcmp(config.name, "rc_mode") == 0) {
+    int rc_mode = atoi(config.value);
+    if (rc_mode >= VPX_VBR && rc_mode <= VPX_Q) {
+      oxcf->rc_mode = (enum vpx_rc_mode)rc_mode;
+      ret = 1;
+    } else {
+      fprintf(stderr, "Invalid rc_mode value: %d\n", rc_mode);
+    }
+  }
+  SET_STRUCT_VALUE(config, oxcf, ret, cq_level);
+  if (ret == 0) {
+    fprintf(stderr, "Ignored unsupported encode_config %s\n", config.name);
+  }
+}
+
+static VP9EncoderConfig GetEncodeConfig(
+    int frame_width, int frame_height, vpx_rational_t frame_rate,
+    int target_bitrate, int encode_speed, int target_level,
+    vpx_enc_pass enc_pass,
+    const std::vector<EncodeConfig> &encode_config_list) {
+  VP9EncoderConfig oxcf = vp9_get_encoder_config(
+      frame_width, frame_height, frame_rate, target_bitrate, encode_speed,
+      target_level, enc_pass);
+  for (const auto &config : encode_config_list) {
+    UpdateEncodeConfig(config, &oxcf);
+  }
+  if (enc_pass == VPX_RC_FIRST_PASS) {
+    oxcf.lag_in_frames = 0;
+  }
+  oxcf.use_simple_encode_api = 1;
+  return oxcf;
+}
+
 SimpleEncode::SimpleEncode(int frame_width, int frame_height,
                            int frame_rate_num, int frame_rate_den,
-                           int target_bitrate, int num_frames,
+                           int target_bitrate, int num_frames, int target_level,
                            const char *infile_path, const char *outfile_path) {
   impl_ptr_ = std::unique_ptr<EncodeImpl>(new EncodeImpl());
   frame_width_ = frame_width;
@@ -706,6 +812,8 @@ SimpleEncode::SimpleEncode(int frame_width, int frame_height,
   frame_rate_den_ = frame_rate_den;
   target_bitrate_ = target_bitrate;
   num_frames_ = num_frames;
+  encode_speed_ = 0;
+  target_level_ = target_level;
 
   frame_coding_index_ = 0;
   show_frame_count_ = 0;
@@ -727,18 +835,57 @@ SimpleEncode::SimpleEncode(int frame_width, int frame_height,
   InitRefFrameInfo(&ref_frame_info_);
 }
 
+void SimpleEncode::SetEncodeSpeed(int encode_speed) {
+  encode_speed_ = encode_speed;
+}
+
+StatusCode SimpleEncode::SetEncodeConfig(const char *name, const char *value) {
+  if (name == nullptr || value == nullptr) {
+    fprintf(stderr, "SetEncodeConfig: null pointer, name %p value %p\n", name,
+            value);
+    return StatusError;
+  }
+  EncodeConfig config;
+  snprintf(config.name, ENCODE_CONFIG_BUF_SIZE, "%s", name);
+  snprintf(config.value, ENCODE_CONFIG_BUF_SIZE, "%s", value);
+  impl_ptr_->encode_config_list.push_back(config);
+  return StatusOk;
+}
+
+StatusCode SimpleEncode::DumpEncodeConfigs(int pass, FILE *fp) {
+  if (fp == nullptr) {
+    fprintf(stderr, "DumpEncodeConfigs: null pointer, fp %p\n", fp);
+    return StatusError;
+  }
+  vpx_enc_pass enc_pass;
+  if (pass == 1) {
+    enc_pass = VPX_RC_FIRST_PASS;
+  } else {
+    enc_pass = VPX_RC_LAST_PASS;
+  }
+  const vpx_rational_t frame_rate =
+      make_vpx_rational(frame_rate_num_, frame_rate_den_);
+  const VP9EncoderConfig oxcf = GetEncodeConfig(
+      frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_,
+      target_level_, enc_pass, impl_ptr_->encode_config_list);
+  vp9_dump_encoder_config(&oxcf, fp);
+  return StatusOk;
+}
+
 void SimpleEncode::ComputeFirstPassStats() {
   vpx_rational_t frame_rate =
       make_vpx_rational(frame_rate_num_, frame_rate_den_);
-  const VP9EncoderConfig oxcf =
-      vp9_get_encoder_config(frame_width_, frame_height_, frame_rate,
-                             target_bitrate_, VPX_RC_FIRST_PASS);
-  VP9_COMP *cpi = init_encoder(&oxcf, impl_ptr_->img_fmt);
-  struct lookahead_ctx *lookahead = cpi->lookahead;
+  const VP9EncoderConfig oxcf = GetEncodeConfig(
+      frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_,
+      target_level_, VPX_RC_FIRST_PASS, impl_ptr_->encode_config_list);
+  impl_ptr_->cpi = init_encoder(&oxcf, impl_ptr_->img_fmt);
+  struct lookahead_ctx *lookahead = impl_ptr_->cpi->lookahead;
   int i;
   int use_highbitdepth = 0;
+  const int num_rows_16x16 = get_num_unit_16x16(frame_height_);
+  const int num_cols_16x16 = get_num_unit_16x16(frame_width_);
 #if CONFIG_VP9_HIGHBITDEPTH
-  use_highbitdepth = cpi->common.use_highbitdepth;
+  use_highbitdepth = impl_ptr_->cpi->common.use_highbitdepth;
 #endif
   vpx_image_t img;
   vpx_img_alloc(&img, impl_ptr_->img_fmt, frame_width_, frame_height_, 1);
@@ -764,24 +911,35 @@ void SimpleEncode::ComputeFirstPassStats() {
         ENCODE_FRAME_RESULT encode_frame_info;
         vp9_init_encode_frame_result(&encode_frame_info);
         // TODO(angiebird): Call vp9_first_pass directly
-        vp9_get_compressed_data(cpi, &frame_flags, &size, nullptr, &time_stamp,
-                                &time_end, flush, &encode_frame_info);
+        vp9_get_compressed_data(impl_ptr_->cpi, &frame_flags, &size, nullptr,
+                                &time_stamp, &time_end, flush,
+                                &encode_frame_info);
         // vp9_get_compressed_data only generates first pass stats not
         // compresses data
         assert(size == 0);
+        // Get vp9 first pass motion vector info.
+        std::vector<MotionVectorInfo> mv_info(num_rows_16x16 * num_cols_16x16);
+        update_motion_vector_info(
+            impl_ptr_->cpi->fp_motion_vector_info, num_rows_16x16,
+            num_cols_16x16, mv_info.data(), kMotionVectorFullPixelPrecision);
+        fp_motion_vector_info_.push_back(mv_info);
       }
-      impl_ptr_->first_pass_stats.push_back(vp9_get_frame_stats(&cpi->twopass));
+      impl_ptr_->first_pass_stats.push_back(
+          vp9_get_frame_stats(&impl_ptr_->cpi->twopass));
     }
   }
-  vp9_end_first_pass(cpi);
   // TODO(angiebird): Store the total_stats apart form first_pass_stats
-  impl_ptr_->first_pass_stats.push_back(vp9_get_total_stats(&cpi->twopass));
-  free_encoder(cpi);
-  rewind(in_file_);
-  vpx_img_free(&img);
+  impl_ptr_->first_pass_stats.push_back(
+      vp9_get_total_stats(&impl_ptr_->cpi->twopass));
+  vp9_end_first_pass(impl_ptr_->cpi);
 
   // Generate key_frame_map based on impl_ptr_->first_pass_stats.
   key_frame_map_ = ComputeKeyFrameMap();
+
+  free_encoder(impl_ptr_->cpi);
+  impl_ptr_->cpi = nullptr;
+  rewind(in_file_);
+  vpx_img_free(&img);
 }
 
 std::vector<std::vector<double>> SimpleEncode::ObserveFirstPassStats() {
@@ -806,8 +964,16 @@ std::vector<std::vector<double>> SimpleEncode::ObserveFirstPassStats() {
   return output_stats;
 }
 
-void SimpleEncode::SetExternalGroupOfPicturesMap(std::vector<int> gop_map) {
-  gop_map_ = gop_map;
+std::vector<std::vector<MotionVectorInfo>>
+SimpleEncode::ObserveFirstPassMotionVectors() {
+  return fp_motion_vector_info_;
+}
+
+void SimpleEncode::SetExternalGroupOfPicturesMap(int *gop_map,
+                                                 int gop_map_size) {
+  for (int i = 0; i < gop_map_size; ++i) {
+    gop_map_.push_back(gop_map[i]);
+  }
   // The following will check and modify gop_map_ to make sure the
   // gop_map_ satisfies the constraints.
   // 1) Each key frame position should be at the start of a gop.
@@ -848,10 +1014,9 @@ T *GetVectorData(const std::vector<T> &v) {
 
 static GOP_COMMAND GetGopCommand(const std::vector<int> &gop_map,
                                  int start_show_index) {
-  assert(static_cast<size_t>(start_show_index) < gop_map.size());
-  assert((gop_map[start_show_index] & kGopMapFlagStart) != 0);
   GOP_COMMAND gop_command;
-  if (gop_map.size() > 0) {
+  if (static_cast<size_t>(start_show_index) < gop_map.size()) {
+    assert((gop_map[start_show_index] & kGopMapFlagStart) != 0);
     int end_show_index = start_show_index + 1;
     // gop_map[end_show_index] & kGopMapFlagStart == 0 means this is
     // the start of a gop.
@@ -876,9 +1041,10 @@ void SimpleEncode::StartEncode() {
   assert(impl_ptr_->first_pass_stats.size() > 0);
   vpx_rational_t frame_rate =
       make_vpx_rational(frame_rate_num_, frame_rate_den_);
-  VP9EncoderConfig oxcf =
-      vp9_get_encoder_config(frame_width_, frame_height_, frame_rate,
-                             target_bitrate_, VPX_RC_LAST_PASS);
+  VP9EncoderConfig oxcf = GetEncodeConfig(
+      frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_,
+      target_level_, VPX_RC_LAST_PASS, impl_ptr_->encode_config_list);
+
   vpx_fixed_buf_t stats;
   stats.buf = GetVectorData(impl_ptr_->first_pass_stats);
   stats.sz = sizeof(impl_ptr_->first_pass_stats[0]) *
@@ -893,6 +1059,11 @@ void SimpleEncode::StartEncode() {
   frame_coding_index_ = 0;
   show_frame_count_ = 0;
 
+  assert(impl_ptr_->cpi != nullptr);
+  FRAME_INFO frame_info = vp9_get_frame_info(&oxcf);
+  unsigned int screen_area = frame_info.frame_width * frame_info.frame_height;
+  vp9_init_vizier_params(&impl_ptr_->cpi->twopass, screen_area);
+
   UpdateKeyFrameGroup(show_frame_count_);
 
   const GOP_COMMAND gop_command = GetGopCommand(gop_map_, show_frame_count_);
@@ -928,8 +1099,7 @@ void SimpleEncode::UpdateKeyFrameGroup(int key_frame_show_index) {
   const VP9_COMP *cpi = impl_ptr_->cpi;
   key_frame_group_index_ = 0;
   key_frame_group_size_ = vp9_get_frames_to_next_key(
-      &cpi->oxcf, &cpi->frame_info, &cpi->twopass.first_pass_info,
-      key_frame_show_index, cpi->rc.min_gf_interval);
+      &cpi->oxcf, &cpi->twopass, key_frame_show_index, cpi->rc.min_gf_interval);
   assert(key_frame_group_size_ > 0);
   // Init the reference frame info when a new key frame group appears.
   InitRefFrameInfo(&ref_frame_info_);
@@ -1046,7 +1216,10 @@ void SimpleEncode::EncodeFrame(EncodeFrameResult *encode_frame_result) {
       abort();
     }
 
-    update_encode_frame_result(encode_frame_result, &encode_frame_info);
+    const GroupOfPicture group_of_picture = this->ObserveGroupOfPicture();
+    const int show_frame_count = group_of_picture.show_frame_count;
+    update_encode_frame_result(encode_frame_result, show_frame_count,
+                               &encode_frame_info);
     PostUpdateState(*encode_frame_result);
   } else {
     // TODO(angiebird): Clean up encode_frame_result.
@@ -1063,6 +1236,15 @@ void SimpleEncode::EncodeFrameWithQuantizeIndex(
   encode_command_reset_external_quantize_index(&impl_ptr_->cpi->encode_command);
 }
 
+void SimpleEncode::EncodeFrameWithTargetFrameBits(
+    EncodeFrameResult *encode_frame_result, int target_frame_bits,
+    double percent_diff) {
+  encode_command_set_target_frame_bits(&impl_ptr_->cpi->encode_command,
+                                       target_frame_bits, percent_diff);
+  EncodeFrame(encode_frame_result);
+  encode_command_reset_target_frame_bits(&impl_ptr_->cpi->encode_command);
+}
+
 static int GetCodingFrameNumFromGopMap(const std::vector<int> &gop_map) {
   int start_show_index = 0;
   int coding_frame_count = 0;
@@ -1071,7 +1253,7 @@ static int GetCodingFrameNumFromGopMap(const std::vector<int> &gop_map) {
     start_show_index += gop_command.show_frame_count;
     coding_frame_count += gop_command_coding_frame_count(&gop_command);
   }
-  assert(start_show_index == gop_map.size());
+  assert(static_cast<size_t>(start_show_index) == gop_map.size());
   return coding_frame_count;
 }
 
@@ -1082,37 +1264,39 @@ int SimpleEncode::GetCodingFrameNum() const {
   }
 
   // These are the default settings for now.
+  TWO_PASS twopass;
   const int multi_layer_arf = 0;
   const int allow_alt_ref = 1;
   vpx_rational_t frame_rate =
       make_vpx_rational(frame_rate_num_, frame_rate_den_);
-  const VP9EncoderConfig oxcf =
-      vp9_get_encoder_config(frame_width_, frame_height_, frame_rate,
-                             target_bitrate_, VPX_RC_LAST_PASS);
+  const VP9EncoderConfig oxcf = GetEncodeConfig(
+      frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_,
+      target_level_, VPX_RC_LAST_PASS, impl_ptr_->encode_config_list);
   FRAME_INFO frame_info = vp9_get_frame_info(&oxcf);
-  FIRST_PASS_INFO first_pass_info;
-  fps_init_first_pass_info(&first_pass_info,
+  fps_init_first_pass_info(&twopass.first_pass_info,
                            GetVectorData(impl_ptr_->first_pass_stats),
                            num_frames_);
-  return vp9_get_coding_frame_num(&oxcf, &frame_info, &first_pass_info,
-                                  multi_layer_arf, allow_alt_ref);
+  unsigned int screen_area = frame_info.frame_width * frame_info.frame_height;
+  vp9_init_vizier_params(&twopass, screen_area);
+  return vp9_get_coding_frame_num(&oxcf, &twopass, &frame_info, multi_layer_arf,
+                                  allow_alt_ref);
 }
 
 std::vector<int> SimpleEncode::ComputeKeyFrameMap() const {
-  assert(impl_ptr_->first_pass_stats.size() == num_frames_);
+  // The last entry of first_pass_stats is the overall stats.
+  assert(impl_ptr_->first_pass_stats.size() ==
+         static_cast<size_t>(num_frames_) + 1);
   vpx_rational_t frame_rate =
       make_vpx_rational(frame_rate_num_, frame_rate_den_);
-  const VP9EncoderConfig oxcf =
-      vp9_get_encoder_config(frame_width_, frame_height_, frame_rate,
-                             target_bitrate_, VPX_RC_LAST_PASS);
-  FRAME_INFO frame_info = vp9_get_frame_info(&oxcf);
-  FIRST_PASS_INFO first_pass_info;
-  fps_init_first_pass_info(&first_pass_info,
+  const VP9EncoderConfig oxcf = GetEncodeConfig(
+      frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_,
+      target_level_, VPX_RC_LAST_PASS, impl_ptr_->encode_config_list);
+  TWO_PASS twopass;
+  fps_init_first_pass_info(&twopass.first_pass_info,
                            GetVectorData(impl_ptr_->first_pass_stats),
                            num_frames_);
   std::vector<int> key_frame_map(num_frames_, 0);
-  vp9_get_key_frame_map(&oxcf, &frame_info, &first_pass_info,
-                        GetVectorData(key_frame_map));
+  vp9_get_key_frame_map(&oxcf, &twopass, GetVectorData(key_frame_map));
   return key_frame_map;
 }
 
diff --git a/vp9/simple_encode.h b/vp9/simple_encode.h
index d7c9dfa140..d610a5e159 100644
--- a/vp9/simple_encode.h
+++ b/vp9/simple_encode.h
@@ -19,13 +19,18 @@
 
 namespace vp9 {
 
+enum StatusCode {
+  StatusOk = 0,
+  StatusError,
+};
+
 // TODO(angiebird): Add description for each frame type.
 enum FrameType {
   kFrameTypeKey = 0,
-  kFrameTypeInter,
-  kFrameTypeAltRef,
-  kFrameTypeOverlay,
-  kFrameTypeGolden,
+  kFrameTypeInter = 1,
+  kFrameTypeAltRef = 2,
+  kFrameTypeOverlay = 3,
+  kFrameTypeGolden = 4,
 };
 
 // TODO(angiebird): Add description for each reference frame type.
@@ -39,6 +44,26 @@ enum RefFrameType {
   kRefFrameTypeNone = -1,
 };
 
+enum VP9_LEVEL {
+  LEVEL_UNKNOWN = 0,
+  LEVEL_AUTO = 1,
+  LEVEL_1 = 10,
+  LEVEL_1_1 = 11,
+  LEVEL_2 = 20,
+  LEVEL_2_1 = 21,
+  LEVEL_3 = 30,
+  LEVEL_3_1 = 31,
+  LEVEL_4 = 40,
+  LEVEL_4_1 = 41,
+  LEVEL_5 = 50,
+  LEVEL_5_1 = 51,
+  LEVEL_5_2 = 52,
+  LEVEL_6 = 60,
+  LEVEL_6_1 = 61,
+  LEVEL_6_2 = 62,
+  LEVEL_MAX = 255
+};
+
 enum GopMapFlag {
   kGopMapFlagStart =
       1 << 0,  // Indicate this location is the start of a group of pictures.
@@ -58,9 +83,12 @@ struct PartitionInfo {
   int height;        // prediction block height
 };
 
-constexpr int kMotionVectorPrecision = 8;
+constexpr int kMotionVectorSubPixelPrecision = 8;
+constexpr int kMotionVectorFullPixelPrecision = 1;
 
-// The frame is split to 4x4 blocks.
+// In the first pass. The frame is split to 16x16 blocks.
+// This structure contains the information of each 16x16 block.
+// In the second pass. The frame is split to 4x4 blocks.
 // This structure contains the information of each 4x4 block.
 struct MotionVectorInfo {
   // Number of valid motion vectors, always 0 if this block is in the key frame.
@@ -68,8 +96,8 @@ struct MotionVectorInfo {
   int mv_count;
   // The reference frame for motion vectors. If the second motion vector does
   // not exist (mv_count = 1), the reference frame is kNoneRefFrame.
-  // Otherwise, the reference frame is either kLastFrame, or kGoldenFrame,
-  // or kAltRefFrame.
+  // Otherwise, the reference frame is either kRefFrameTypeLast, or
+  // kRefFrameTypePast, or kRefFrameTypeFuture.
   RefFrameType ref_frame[2];
   // The row offset of motion vectors in the unit of pixel.
   // If the second motion vector does not exist, the value is 0.
@@ -79,6 +107,24 @@ struct MotionVectorInfo {
   double mv_column[2];
 };
 
+// Accumulated tpl stats of all blocks in one frame.
+// For each frame, the tpl stats are computed per 32x32 block.
+struct TplStatsInfo {
+  // Intra complexity: the sum of absolute transform difference (SATD) of
+  // intra predicted residuals.
+  int64_t intra_cost;
+  // Inter complexity: the SATD of inter predicted residuals.
+  int64_t inter_cost;
+  // Motion compensated information flow. It measures how much information
+  // is propagated from the current frame to other frames.
+  int64_t mc_flow;
+  // Motion compensated dependency cost. It equals to its own intra_cost
+  // plus the mc_flow.
+  int64_t mc_dep_cost;
+  // Motion compensated reference cost.
+  int64_t mc_ref_cost;
+};
+
 struct RefFrameInfo {
   int coding_indexes[kRefFrameTypeMax];
 
@@ -245,7 +291,7 @@ struct EncodeFrameResult {
   std::vector<PartitionInfo> partition_info;
   // A vector of the motion vector information of the frame.
   // The number of elements is |num_rows_4x4| * |num_cols_4x4|.
-  // The frame is divided 4x4 blocks of |num_rows_4x4| rows and
+  // The frame is divided into 4x4 blocks of |num_rows_4x4| rows and
   // |num_cols_4x4| columns.
   // Each 4x4 block contains 0 motion vector if this is an intra predicted
   // frame (for example, the key frame). If the frame is inter predicted,
@@ -253,7 +299,25 @@ struct EncodeFrameResult {
   // Similar to partition info, all 4x4 blocks inside the same partition block
   // share the same motion vector information.
   std::vector<MotionVectorInfo> motion_vector_info;
+  // A vector of the tpl stats information.
+  // The tpl stats measure the complexity of a frame, as well as the
+  // information propagated along the motion trajectory between frames, in
+  // the reference frame structure.
+  // The tpl stats could be used as a more accurate spatial and temporal
+  // complexity measure in addition to the first pass stats.
+  // The vector contains tpl stats for all show frames in a GOP.
+  // The tpl stats stored in the vector is according to the encoding order.
+  // For example, suppose there are N show frames for the current GOP.
+  // Then tpl_stats_info[0] stores the information of the first frame to be
+  // encoded for this GOP, i.e., the AltRef frame.
+  std::vector<TplStatsInfo> tpl_stats_info;
   ImageBuffer coded_frame;
+
+  // recode_count, q_index_history and rate_history are only available when
+  // EncodeFrameWithTargetFrameBits() is used.
+  int recode_count;
+  std::vector<int> q_index_history;
+  std::vector<int> rate_history;
 };
 
 struct GroupOfPicture {
@@ -299,11 +363,58 @@ class SimpleEncode {
   // format.
   SimpleEncode(int frame_width, int frame_height, int frame_rate_num,
                int frame_rate_den, int target_bitrate, int num_frames,
-               const char *infile_path, const char *outfile_path = nullptr);
+               int target_level, const char *infile_path,
+               const char *outfile_path = nullptr);
   ~SimpleEncode();
   SimpleEncode(SimpleEncode &) = delete;
   SimpleEncode &operator=(const SimpleEncode &) = delete;
 
+  // Adjusts the encoder's coding speed.
+  // If this function is not called, the encoder will use default encode_speed
+  // 0. Call this function before ComputeFirstPassStats() if needed.
+  // The encode_speed is equivalent to --cpu-used of the vpxenc command.
+  // The encode_speed's range should be [0, 9].
+  // Setting the encode_speed to a higher level will yield faster coding
+  // at the cost of lower compression efficiency.
+  void SetEncodeSpeed(int encode_speed);
+
+  // Set encoder config
+  // The following configs in VP9EncoderConfig are allowed to change in this
+  // function. See https://ffmpeg.org/ffmpeg-codecs.html#libvpx for each
+  // config's meaning.
+  // Configs in VP9EncoderConfig:          Equivalent configs in ffmpeg:
+  // 1  key_freq                           -g
+  // 2  two_pass_vbrmin_section            -minrate * 100LL / bit_rate
+  // 3  two_pass_vbrmax_section            -maxrate * 100LL / bit_rate
+  // 4  under_shoot_pct                    -undershoot-pct
+  // 5  over_shoot_pct                     -overshoot-pct
+  // 6  max_threads                        -threads
+  // 7  frame_parallel_decoding_mode       -frame-parallel
+  // 8  tile_column                        -tile-columns
+  // 9  arnr_max_frames                    -arnr-maxframes
+  // 10 arnr_strength                      -arnr-strength
+  // 11 lag_in_frames                      -rc_lookahead
+  // 12 encode_breakout                    -static-thresh
+  // 13 enable_tpl_model                   -enable-tpl
+  // 14 enable_auto_arf                    -auto-alt-ref
+  // 15 rc_mode
+  //    Possible Settings:
+  //      0 - Variable Bit Rate (VPX_VBR)  -b:v <bit_rate>
+  //      1 - Constant Bit Rate (VPX_CBR)  -b:v <bit_rate> -minrate <bit_rate>
+  //                                        -maxrate <bit_rate>
+  //        two_pass_vbrmin_section == 100   i.e. bit_rate == minrate == maxrate
+  //        two_pass_vbrmax_section == 100
+  //      2 - Constrained Quality (VPX_CQ) -crf <cq_level> -b:v bit_rate
+  //      3 - Constant Quality (VPX_Q)     -crf <cq_level> -b:v 0
+  //    See https://trac.ffmpeg.org/wiki/Encode/VP9 for more details.
+  // 16 cq_level                          see rc_mode for details.
+  StatusCode SetEncodeConfig(const char *name, const char *value);
+
+  // A debug function that dumps configs from VP9EncoderConfig
+  // pass = 1: first pass, pass = 2: second pass
+  // fp: file pointer for dumping config
+  StatusCode DumpEncodeConfigs(int pass, FILE *fp);
+
   // Makes encoder compute the first pass stats and store it at
   // impl_ptr_->first_pass_stats. key_frame_map_ is also computed based on the
   // first pass stats.
@@ -315,6 +426,12 @@ class SimpleEncode {
   // values. For details, please check FIRSTPASS_STATS in vp9_firstpass.h
   std::vector<std::vector<double>> ObserveFirstPassStats();
 
+  // Outputs the first pass motion vectors represented by a 2-D vector.
+  // One can use the frame index at first dimension to retrieve the mvs for
+  // each video frame. The frame is divided into 16x16 blocks. The number of
+  // elements is round_up(|num_rows_4x4| / 4) * round_up(|num_cols_4x4| / 4).
+  std::vector<std::vector<MotionVectorInfo>> ObserveFirstPassMotionVectors();
+
   // Ouputs a copy of key_frame_map_, a binary vector with size equal to the
   // number of show frames in the video. For each entry in the vector, 1
   // indicates the position is a key frame and 0 indicates it's not a key frame.
@@ -335,7 +452,7 @@ class SimpleEncode {
   // constraints.
   // 1) Each key frame position should be at the start of a gop.
   // 2) The last gop should not use an alt ref.
-  void SetExternalGroupOfPicturesMap(std::vector<int> gop_map);
+  void SetExternalGroupOfPicturesMap(int *gop_map, int gop_map_size);
 
   // Observe the group of pictures map set through
   // SetExternalGroupOfPicturesMap(). This function should be called after
@@ -373,6 +490,17 @@ class SimpleEncode {
   void EncodeFrameWithQuantizeIndex(EncodeFrameResult *encode_frame_result,
                                     int quantize_index);
 
+  // Encode a frame with target frame bits usage.
+  // The encoder will find a quantize index to make the actual frame bits usage
+  // match the target. EncodeFrameWithTargetFrameBits() will recode the frame
+  // up to 7 times to find a q_index to make the actual_frame_bits satisfy the
+  // following inequality. |actual_frame_bits - target_frame_bits| * 100 /
+  // target_frame_bits
+  // <= percent_diff.
+  void EncodeFrameWithTargetFrameBits(EncodeFrameResult *encode_frame_result,
+                                      int target_frame_bits,
+                                      double percent_diff);
+
   // Gets the number of coding frames for the video. The coding frames include
   // show frame and no show frame.
   // This function should be called after ComputeFirstPassStats().
@@ -405,6 +533,8 @@ class SimpleEncode {
   int frame_rate_den_;
   int target_bitrate_;
   int num_frames_;
+  int encode_speed_;
+  int target_level_;
 
   std::FILE *in_file_;
   std::FILE *out_file_;
@@ -435,6 +565,17 @@ class SimpleEncode {
   // frame appears?
   // Reference frames info of the to-be-coded frame.
   RefFrameInfo ref_frame_info_;
+
+  // A 2-D vector of motion vector information of the frame collected
+  // from the first pass. The first dimension is the frame index.
+  // Each frame is divided into 16x16 blocks. The number of elements is
+  // round_up(|num_rows_4x4| / 4) * round_up(|num_cols_4x4| / 4).
+  // Each 16x16 block contains 0 motion vector if this is an intra predicted
+  // frame (for example, the key frame). If the frame is inter predicted,
+  // each 16x16 block contains either 1 or 2 motion vectors.
+  // The first motion vector is always from the LAST_FRAME.
+  // The second motion vector is always from the GOLDEN_FRAME.
+  std::vector<std::vector<MotionVectorInfo>> fp_motion_vector_info_;
 };
 
 }  // namespace vp9
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 9074e1b4ec..2e2c8176d4 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -13,8 +13,8 @@
 
 #include "./vpx_config.h"
 #include "vpx/vpx_encoder.h"
+#include "vpx/vpx_ext_ratectrl.h"
 #include "vpx_dsp/psnr.h"
-#include "vpx_ports/vpx_once.h"
 #include "vpx_ports/static_assert.h"
 #include "vpx_ports/system_state.h"
 #include "vpx_util/vpx_timestamp.h"
@@ -23,12 +23,15 @@
 #include "vp9/encoder/vp9_encoder.h"
 #include "vpx/vp8cx.h"
 #include "vp9/common/vp9_alloccommon.h"
+#include "vp9/common/vp9_scale.h"
 #include "vp9/vp9_cx_iface.h"
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vp9/encoder/vp9_lookahead.h"
 #include "vp9/vp9_cx_iface.h"
 #include "vp9/vp9_iface_common.h"
 
+#include "vpx/vpx_tpl.h"
+
 typedef struct vp9_extracfg {
   int cpu_used;  // available cpu percentage in 1/16
   unsigned int enable_auto_alt_ref;
@@ -65,7 +68,11 @@ typedef struct vp9_extracfg {
 } vp9_extracfg;
 
 static struct vp9_extracfg default_extra_cfg = {
-  0,                     // cpu_used
+#if CONFIG_REALTIME_ONLY
+  5,  // cpu_used
+#else
+  0,  // cpu_used
+#endif
   1,                     // enable_auto_alt_ref
   0,                     // noise_sensitivity
   0,                     // sharpness
@@ -125,6 +132,10 @@ struct vpx_codec_alg_priv {
   BufferPool *buffer_pool;
 };
 
+// Called by encoder_set_config() and encoder_encode() only. Must not be called
+// by encoder_init() because the `error` paramerer (cpi->common.error) will be
+// destroyed by vpx_codec_enc_init_ver() after encoder_init() returns an error.
+// See the "IMPORTANT" comment in vpx_codec_enc_init_ver().
 static vpx_codec_err_t update_error_state(
     vpx_codec_alg_priv_t *ctx, const struct vpx_internal_error_info *error) {
   const vpx_codec_err_t res = error->error_code;
@@ -166,8 +177,8 @@ static vpx_codec_err_t update_error_state(
 static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
                                        const vpx_codec_enc_cfg_t *cfg,
                                        const struct vp9_extracfg *extra_cfg) {
-  RANGE_CHECK(cfg, g_w, 1, 65535);  // 16 bits available
-  RANGE_CHECK(cfg, g_h, 1, 65535);  // 16 bits available
+  RANGE_CHECK(cfg, g_w, 1, 65536);  // 16 bits available
+  RANGE_CHECK(cfg, g_h, 1, 65536);  // 16 bits available
   RANGE_CHECK(cfg, g_timebase.den, 1, 1000000000);
   RANGE_CHECK(cfg, g_timebase.num, 1, 1000000000);
   RANGE_CHECK_HI(cfg, g_profile, 3);
@@ -347,6 +358,24 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
   }
   RANGE_CHECK(extra_cfg, color_space, VPX_CS_UNKNOWN, VPX_CS_SRGB);
   RANGE_CHECK(extra_cfg, color_range, VPX_CR_STUDIO_RANGE, VPX_CR_FULL_RANGE);
+
+  // The range below shall be further tuned.
+  RANGE_CHECK(cfg, use_vizier_rc_params, 0, 1);
+  RANGE_CHECK(cfg, active_wq_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, err_per_mb_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, sr_default_decay_limit.den, 1, 1000);
+  RANGE_CHECK(cfg, sr_diff_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, kf_err_per_mb_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, kf_frame_min_boost_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, kf_frame_max_boost_subs_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, kf_max_total_boost_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, gf_max_total_boost_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, gf_frame_max_boost_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, zm_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, rd_mult_inter_qp_fac.den, 1, 1000);
+  RANGE_CHECK(cfg, rd_mult_arf_qp_fac.den, 1, 1000);
+  RANGE_CHECK(cfg, rd_mult_key_qp_fac.den, 1, 1000);
+
   return VPX_CODEC_OK;
 }
 
@@ -362,8 +391,8 @@ static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx,
     case VPX_IMG_FMT_I440:
       if (ctx->cfg.g_profile != (unsigned int)PROFILE_1) {
         ERROR(
-            "Invalid image format. I422, I444, I440, NV12 images are "
-            "not supported in profile.");
+            "Invalid image format. I422, I444, I440 images are not supported "
+            "in profile.");
       }
       break;
     case VPX_IMG_FMT_I42216:
@@ -378,8 +407,8 @@ static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx,
       break;
     default:
       ERROR(
-          "Invalid image format. Only YV12, I420, I422, I444 images are "
-          "supported.");
+          "Invalid image format. Only YV12, I420, I422, I444, I440, NV12 "
+          "images are supported.");
       break;
   }
 
@@ -504,8 +533,9 @@ static vpx_codec_err_t set_encoder_config(
   raw_target_rate =
       (unsigned int)((int64_t)oxcf->width * oxcf->height * oxcf->bit_depth * 3 *
                      oxcf->init_framerate / 1000);
-  // Cap target bitrate to raw rate
-  cfg->rc_target_bitrate = VPXMIN(raw_target_rate, cfg->rc_target_bitrate);
+  // Cap target bitrate to raw rate or 1000Mbps, whichever is less
+  cfg->rc_target_bitrate =
+      VPXMIN(VPXMIN(raw_target_rate, cfg->rc_target_bitrate), 1000000);
 
   // Convert target bandwidth from Kbit/s to Bit/s
   oxcf->target_bandwidth = 1000 * (int64_t)cfg->rc_target_bitrate;
@@ -564,10 +594,6 @@ static vpx_codec_err_t set_encoder_config(
 
   vp9_set_first_pass_stats(oxcf, &cfg->rc_twopass_stats_in);
 
-#if CONFIG_FP_MB_STATS
-  oxcf->firstpass_mb_stats_in = cfg->rc_firstpass_mb_stats_in;
-#endif
-
   oxcf->color_space = extra_cfg->color_space;
   oxcf->color_range = extra_cfg->color_range;
   oxcf->render_width = extra_cfg->render_width;
@@ -616,8 +642,12 @@ static vpx_codec_err_t set_encoder_config(
 
   for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
     for (tl = 0; tl < oxcf->ts_number_layers; ++tl) {
-      oxcf->layer_target_bitrate[sl * oxcf->ts_number_layers + tl] =
-          1000 * cfg->layer_target_bitrate[sl * oxcf->ts_number_layers + tl];
+      const int layer = sl * oxcf->ts_number_layers + tl;
+      if (cfg->layer_target_bitrate[layer] > INT_MAX / 1000)
+        oxcf->layer_target_bitrate[layer] = INT_MAX;
+      else
+        oxcf->layer_target_bitrate[layer] =
+            1000 * cfg->layer_target_bitrate[layer];
     }
   }
   if (oxcf->ss_number_layers == 1 && oxcf->pass != 0) {
@@ -633,22 +663,159 @@ static vpx_codec_err_t set_encoder_config(
   }
 
   if (get_level_index(oxcf->target_level) >= 0) config_target_level(oxcf);
-  // vp9_dump_encoder_config(oxcf);
+  oxcf->use_simple_encode_api = 0;
+  // vp9_dump_encoder_config(oxcf, stderr);
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t set_twopass_params_from_config(
+    const vpx_codec_enc_cfg_t *const cfg, struct VP9_COMP *cpi) {
+  if (!cfg->use_vizier_rc_params) return VPX_CODEC_OK;
+  if (cpi == NULL) return VPX_CODEC_ERROR;
+
+  cpi->twopass.use_vizier_rc_params = cfg->use_vizier_rc_params;
+
+  // The values set here are factors that will be applied to default values
+  // to get the final value used in the two pass code. Hence 1.0 will
+  // match the default behaviour when not using passed in values.
+  // We also apply limits here to prevent the user from applying settings
+  // that make no sense.
+  cpi->twopass.active_wq_factor =
+      (double)cfg->active_wq_factor.num / (double)cfg->active_wq_factor.den;
+  if (cpi->twopass.active_wq_factor < 0.25)
+    cpi->twopass.active_wq_factor = 0.25;
+  else if (cpi->twopass.active_wq_factor > 16.0)
+    cpi->twopass.active_wq_factor = 16.0;
+
+  cpi->twopass.err_per_mb =
+      (double)cfg->err_per_mb_factor.num / (double)cfg->err_per_mb_factor.den;
+  if (cpi->twopass.err_per_mb < 0.25)
+    cpi->twopass.err_per_mb = 0.25;
+  else if (cpi->twopass.err_per_mb > 4.0)
+    cpi->twopass.err_per_mb = 4.0;
+
+  cpi->twopass.sr_default_decay_limit =
+      (double)cfg->sr_default_decay_limit.num /
+      (double)cfg->sr_default_decay_limit.den;
+  if (cpi->twopass.sr_default_decay_limit < 0.25)
+    cpi->twopass.sr_default_decay_limit = 0.25;
+  // If the default changes this will need to change.
+  else if (cpi->twopass.sr_default_decay_limit > 1.33)
+    cpi->twopass.sr_default_decay_limit = 1.33;
+
+  cpi->twopass.sr_diff_factor =
+      (double)cfg->sr_diff_factor.num / (double)cfg->sr_diff_factor.den;
+  if (cpi->twopass.sr_diff_factor < 0.25)
+    cpi->twopass.sr_diff_factor = 0.25;
+  else if (cpi->twopass.sr_diff_factor > 4.0)
+    cpi->twopass.sr_diff_factor = 4.0;
+
+  cpi->twopass.kf_err_per_mb = (double)cfg->kf_err_per_mb_factor.num /
+                               (double)cfg->kf_err_per_mb_factor.den;
+  if (cpi->twopass.kf_err_per_mb < 0.25)
+    cpi->twopass.kf_err_per_mb = 0.25;
+  else if (cpi->twopass.kf_err_per_mb > 4.0)
+    cpi->twopass.kf_err_per_mb = 4.0;
+
+  cpi->twopass.kf_frame_min_boost = (double)cfg->kf_frame_min_boost_factor.num /
+                                    (double)cfg->kf_frame_min_boost_factor.den;
+  if (cpi->twopass.kf_frame_min_boost < 0.25)
+    cpi->twopass.kf_frame_min_boost = 0.25;
+  else if (cpi->twopass.kf_frame_min_boost > 4.0)
+    cpi->twopass.kf_frame_min_boost = 4.0;
+
+  cpi->twopass.kf_frame_max_boost_first =
+      (double)cfg->kf_frame_max_boost_first_factor.num /
+      (double)cfg->kf_frame_max_boost_first_factor.den;
+  if (cpi->twopass.kf_frame_max_boost_first < 0.25)
+    cpi->twopass.kf_frame_max_boost_first = 0.25;
+  else if (cpi->twopass.kf_frame_max_boost_first > 4.0)
+    cpi->twopass.kf_frame_max_boost_first = 4.0;
+
+  cpi->twopass.kf_frame_max_boost_subs =
+      (double)cfg->kf_frame_max_boost_subs_factor.num /
+      (double)cfg->kf_frame_max_boost_subs_factor.den;
+  if (cpi->twopass.kf_frame_max_boost_subs < 0.25)
+    cpi->twopass.kf_frame_max_boost_subs = 0.25;
+  else if (cpi->twopass.kf_frame_max_boost_subs > 4.0)
+    cpi->twopass.kf_frame_max_boost_subs = 4.0;
+
+  cpi->twopass.kf_max_total_boost = (double)cfg->kf_max_total_boost_factor.num /
+                                    (double)cfg->kf_max_total_boost_factor.den;
+  if (cpi->twopass.kf_max_total_boost < 0.25)
+    cpi->twopass.kf_max_total_boost = 0.25;
+  else if (cpi->twopass.kf_max_total_boost > 4.0)
+    cpi->twopass.kf_max_total_boost = 4.0;
+
+  cpi->twopass.gf_max_total_boost = (double)cfg->gf_max_total_boost_factor.num /
+                                    (double)cfg->gf_max_total_boost_factor.den;
+  if (cpi->twopass.gf_max_total_boost < 0.25)
+    cpi->twopass.gf_max_total_boost = 0.25;
+  else if (cpi->twopass.gf_max_total_boost > 4.0)
+    cpi->twopass.gf_max_total_boost = 4.0;
+
+  cpi->twopass.gf_frame_max_boost = (double)cfg->gf_frame_max_boost_factor.num /
+                                    (double)cfg->gf_frame_max_boost_factor.den;
+  if (cpi->twopass.gf_frame_max_boost < 0.25)
+    cpi->twopass.gf_frame_max_boost = 0.25;
+  else if (cpi->twopass.gf_frame_max_boost > 4.0)
+    cpi->twopass.gf_frame_max_boost = 4.0;
+
+  cpi->twopass.zm_factor =
+      (double)cfg->zm_factor.num / (double)cfg->zm_factor.den;
+  if (cpi->twopass.zm_factor < 0.25)
+    cpi->twopass.zm_factor = 0.25;
+  else if (cpi->twopass.zm_factor > 2.0)
+    cpi->twopass.zm_factor = 2.0;
+
+  cpi->rd_ctrl.rd_mult_inter_qp_fac = (double)cfg->rd_mult_inter_qp_fac.num /
+                                      (double)cfg->rd_mult_inter_qp_fac.den;
+  if (cpi->rd_ctrl.rd_mult_inter_qp_fac < 0.25)
+    cpi->rd_ctrl.rd_mult_inter_qp_fac = 0.25;
+  else if (cpi->rd_ctrl.rd_mult_inter_qp_fac > 4.0)
+    cpi->rd_ctrl.rd_mult_inter_qp_fac = 4.0;
+
+  cpi->rd_ctrl.rd_mult_arf_qp_fac =
+      (double)cfg->rd_mult_arf_qp_fac.num / (double)cfg->rd_mult_arf_qp_fac.den;
+  if (cpi->rd_ctrl.rd_mult_arf_qp_fac < 0.25)
+    cpi->rd_ctrl.rd_mult_arf_qp_fac = 0.25;
+  else if (cpi->rd_ctrl.rd_mult_arf_qp_fac > 4.0)
+    cpi->rd_ctrl.rd_mult_arf_qp_fac = 4.0;
+
+  cpi->rd_ctrl.rd_mult_key_qp_fac =
+      (double)cfg->rd_mult_key_qp_fac.num / (double)cfg->rd_mult_key_qp_fac.den;
+  if (cpi->rd_ctrl.rd_mult_key_qp_fac < 0.25)
+    cpi->rd_ctrl.rd_mult_key_qp_fac = 0.25;
+  else if (cpi->rd_ctrl.rd_mult_key_qp_fac > 4.0)
+    cpi->rd_ctrl.rd_mult_key_qp_fac = 4.0;
+
   return VPX_CODEC_OK;
 }
 
 static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx,
                                           const vpx_codec_enc_cfg_t *cfg) {
   vpx_codec_err_t res;
-  int force_key = 0;
+  volatile int force_key = 0;
 
   if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h) {
     if (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS)
       ERROR("Cannot change width or height after initialization");
-    if (!valid_ref_frame_size(ctx->cfg.g_w, ctx->cfg.g_h, cfg->g_w, cfg->g_h) ||
+    // Note: function encoder_set_config() is allowed to be called multiple
+    // times. However, when the original frame width or height is less than two
+    // times of the new frame width or height, a forced key frame should be
+    // used. To make sure the correct detection of a forced key frame, we need
+    // to update the frame width and height only when the actual encoding is
+    // performed. cpi->last_coded_width and cpi->last_coded_height are used to
+    // track the actual coded frame size.
+    if ((ctx->cpi->last_coded_width && ctx->cpi->last_coded_height &&
+         !valid_ref_frame_size(ctx->cpi->last_coded_width,
+                               ctx->cpi->last_coded_height, cfg->g_w,
+                               cfg->g_h)) ||
         (ctx->cpi->initial_width && (int)cfg->g_w > ctx->cpi->initial_width) ||
-        (ctx->cpi->initial_height && (int)cfg->g_h > ctx->cpi->initial_height))
+        (ctx->cpi->initial_height &&
+         (int)cfg->g_h > ctx->cpi->initial_height)) {
       force_key = 1;
+    }
   }
 
   // Prevent increasing lag_in_frames. This check is stricter than it needs
@@ -659,18 +826,29 @@ static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx,
     ERROR("Cannot increase lag_in_frames");
 
   res = validate_config(ctx, cfg, &ctx->extra_cfg);
+  if (res != VPX_CODEC_OK) return res;
 
-  if (res == VPX_CODEC_OK) {
-    ctx->cfg = *cfg;
-    set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
-    // On profile change, request a key frame
-    force_key |= ctx->cpi->common.profile != ctx->oxcf.profile;
-    vp9_change_config(ctx->cpi, &ctx->oxcf);
+  if (setjmp(ctx->cpi->common.error.jmp)) {
+    const vpx_codec_err_t codec_err =
+        update_error_state(ctx, &ctx->cpi->common.error);
+    ctx->cpi->common.error.setjmp = 0;
+    vpx_clear_system_state();
+    assert(codec_err != VPX_CODEC_OK);
+    return codec_err;
   }
+  ctx->cpi->common.error.setjmp = 1;
+
+  ctx->cfg = *cfg;
+  set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
+  set_twopass_params_from_config(&ctx->cfg, ctx->cpi);
+  // On profile change, request a key frame
+  force_key |= ctx->cpi->common.profile != ctx->oxcf.profile;
+  vp9_change_config(ctx->cpi, &ctx->oxcf);
 
   if (force_key) ctx->next_frame_flags |= VPX_EFLAG_FORCE_KF;
 
-  return res;
+  ctx->cpi->common.error.setjmp = 0;
+  return VPX_CODEC_OK;
 }
 
 static vpx_codec_err_t ctrl_get_quantizer(vpx_codec_alg_priv_t *ctx,
@@ -689,12 +867,32 @@ static vpx_codec_err_t ctrl_get_quantizer64(vpx_codec_alg_priv_t *ctx,
   return VPX_CODEC_OK;
 }
 
+static vpx_codec_err_t ctrl_get_quantizer_svc_layers(vpx_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  int *const arg = va_arg(args, int *);
+  int i;
+  if (arg == NULL) return VPX_CODEC_INVALID_PARAM;
+  for (i = 0; i < VPX_SS_MAX_LAYERS; i++) {
+    arg[i] = ctx->cpi->svc.base_qindex[i];
+  }
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_get_loopfilter_level(vpx_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  int *const arg = va_arg(args, int *);
+  if (arg == NULL) return VPX_CODEC_INVALID_PARAM;
+  *arg = ctx->cpi->common.lf.filter_level;
+  return VPX_CODEC_OK;
+}
+
 static vpx_codec_err_t update_extra_cfg(vpx_codec_alg_priv_t *ctx,
                                         const struct vp9_extracfg *extra_cfg) {
   const vpx_codec_err_t res = validate_config(ctx, &ctx->cfg, extra_cfg);
   if (res == VPX_CODEC_OK) {
     ctx->extra_cfg = *extra_cfg;
     set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
+    set_twopass_params_from_config(&ctx->cfg, ctx->cpi);
     vp9_change_config(ctx->cpi, &ctx->oxcf);
   }
   return res;
@@ -840,6 +1038,7 @@ static vpx_codec_err_t ctrl_set_aq_mode(vpx_codec_alg_priv_t *ctx,
                                         va_list args) {
   struct vp9_extracfg extra_cfg = ctx->extra_cfg;
   extra_cfg.aq_mode = CAST(VP9E_SET_AQ_MODE, args);
+  if (ctx->cpi->fixed_qp_onepass) extra_cfg.aq_mode = 0;
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
@@ -885,6 +1084,19 @@ static vpx_codec_err_t ctrl_set_row_mt(vpx_codec_alg_priv_t *ctx,
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static vpx_codec_err_t ctrl_set_rtc_external_ratectrl(vpx_codec_alg_priv_t *ctx,
+                                                      va_list args) {
+  VP9_COMP *const cpi = ctx->cpi;
+  const unsigned int data = va_arg(args, unsigned int);
+  if (data) {
+    cpi->compute_frame_low_motion_onepass = 0;
+    cpi->rc.constrain_gf_key_freq_onepass_vbr = 0;
+    cpi->cyclic_refresh->content_mode = 0;
+    cpi->disable_scene_detection_rtc_ratectrl = 1;
+  }
+  return VPX_CODEC_OK;
+}
+
 static vpx_codec_err_t ctrl_enable_motion_vector_unit_test(
     vpx_codec_alg_priv_t *ctx, va_list args) {
   struct vp9_extracfg extra_cfg = ctx->extra_cfg;
@@ -922,7 +1134,7 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
     }
 
     priv->extra_cfg = default_extra_cfg;
-    once(vp9_initialize_enc);
+    vp9_initialize_enc();
 
     res = validate_config(priv, &priv->cfg, &priv->extra_cfg);
 
@@ -939,6 +1151,7 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
 #endif
       priv->cpi = vp9_create_compressor(&priv->oxcf, priv->buffer_pool);
       if (priv->cpi == NULL) res = VPX_CODEC_MEM_ERROR;
+      set_twopass_params_from_config(&priv->cfg, priv->cpi);
     }
   }
 
@@ -955,7 +1168,7 @@ static vpx_codec_err_t encoder_destroy(vpx_codec_alg_priv_t *ctx) {
 
 static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
                                     unsigned long duration,
-                                    unsigned long deadline) {
+                                    vpx_enc_deadline_t deadline) {
   MODE new_mode = BEST;
 
 #if CONFIG_REALTIME_ONLY
@@ -1100,7 +1313,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
                                       vpx_codec_pts_t pts_val,
                                       unsigned long duration,
                                       vpx_enc_frame_flags_t enc_flags,
-                                      unsigned long deadline) {
+                                      vpx_enc_deadline_t deadline) {
   volatile vpx_codec_err_t res = VPX_CODEC_OK;
   volatile vpx_enc_frame_flags_t flags = enc_flags;
   volatile vpx_codec_pts_t pts = pts_val;
@@ -1112,6 +1325,9 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
 
   if (cpi == NULL) return VPX_CODEC_INVALID_PARAM;
 
+  cpi->last_coded_width = ctx->oxcf.width;
+  cpi->last_coded_height = ctx->oxcf.height;
+
   if (img != NULL) {
     res = validate_img(ctx, img);
     if (res == VPX_CODEC_OK) {
@@ -1170,8 +1386,6 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
     unsigned int lib_flags = 0;
     YV12_BUFFER_CONFIG sd;
     int64_t dst_time_stamp = timebase_units_to_ticks(timestamp_ratio, pts);
-    int64_t dst_end_time_stamp =
-        timebase_units_to_ticks(timestamp_ratio, pts + duration);
     size_t size, cx_data_sz;
     unsigned char *cx_data;
 
@@ -1182,6 +1396,8 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
     if (ctx->base.init_flags & VPX_CODEC_USE_PSNR) cpi->b_calculate_psnr = 1;
 
     if (img != NULL) {
+      const int64_t dst_end_time_stamp =
+          timebase_units_to_ticks(timestamp_ratio, pts + duration);
       res = image2yuvconfig(img, &sd);
 
       // Store the original flags in to the frame buffer. Will extract the
@@ -1218,6 +1434,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
       // compute first pass stats
       if (img) {
         int ret;
+        int64_t dst_end_time_stamp;
         vpx_codec_cx_pkt_t fps_pkt;
         ENCODE_FRAME_RESULT encode_frame_result;
         vp9_init_encode_frame_result(&encode_frame_result);
@@ -1243,6 +1460,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
 #endif  // !CONFIG_REALTIME_ONLY
     } else {
       ENCODE_FRAME_RESULT encode_frame_result;
+      int64_t dst_end_time_stamp;
       vp9_init_encode_frame_result(&encode_frame_result);
       while (cx_data_sz >= ctx->cx_data_sz / 2 &&
              -1 != vp9_get_compressed_data(cpi, &lib_flags, &size, cx_data,
@@ -1338,9 +1556,8 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
 
           cx_data += size;
           cx_data_sz -= size;
-          if (is_one_pass_cbr_svc(cpi) &&
-              (cpi->svc.spatial_layer_id ==
-               cpi->svc.number_spatial_layers - 1)) {
+          if (is_one_pass_svc(cpi) && (cpi->svc.spatial_layer_id ==
+                                       cpi->svc.number_spatial_layers - 1)) {
             // Encoded all spatial layers; exit loop.
             break;
           }
@@ -1442,13 +1659,9 @@ static vpx_codec_err_t ctrl_set_roi_map(vpx_codec_alg_priv_t *ctx,
 
   if (data) {
     vpx_roi_map_t *roi = (vpx_roi_map_t *)data;
-
-    if (!vp9_set_roi_map(ctx->cpi, roi->roi_map, roi->rows, roi->cols,
-                         roi->delta_q, roi->delta_lf, roi->skip,
-                         roi->ref_frame)) {
-      return VPX_CODEC_OK;
-    }
-    return VPX_CODEC_INVALID_PARAM;
+    return vp9_set_roi_map(ctx->cpi, roi->roi_map, roi->rows, roi->cols,
+                           roi->delta_q, roi->delta_lf, roi->skip,
+                           roi->ref_frame);
   }
   return VPX_CODEC_INVALID_PARAM;
 }
@@ -1486,9 +1699,8 @@ static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx,
   vpx_scaling_mode_t *const mode = va_arg(args, vpx_scaling_mode_t *);
 
   if (mode) {
-    const int res =
-        vp9_set_internal_size(ctx->cpi, (VPX_SCALING)mode->h_scaling_mode,
-                              (VPX_SCALING)mode->v_scaling_mode);
+    const int res = vp9_set_internal_size(ctx->cpi, mode->h_scaling_mode,
+                                          mode->v_scaling_mode);
     return (res == 0) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM;
   }
   return VPX_CODEC_INVALID_PARAM;
@@ -1572,6 +1784,7 @@ static vpx_codec_err_t ctrl_set_svc_parameters(vpx_codec_alg_priv_t *ctx,
       lc->scaling_factor_num = params->scaling_factor_num[sl];
       lc->scaling_factor_den = params->scaling_factor_den[sl];
       lc->speed = params->speed_per_layer[sl];
+      lc->loopfilter_ctrl = params->loopfilter_ctrl[sl];
     }
   }
 
@@ -1716,6 +1929,82 @@ static vpx_codec_err_t ctrl_set_postencode_drop(vpx_codec_alg_priv_t *ctx,
   return VPX_CODEC_OK;
 }
 
+static vpx_codec_err_t ctrl_set_disable_overshoot_maxq_cbr(
+    vpx_codec_alg_priv_t *ctx, va_list args) {
+  VP9_COMP *const cpi = ctx->cpi;
+  const unsigned int data = va_arg(args, unsigned int);
+  cpi->rc.disable_overshoot_maxq_cbr = data;
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_set_disable_loopfilter(vpx_codec_alg_priv_t *ctx,
+                                                   va_list args) {
+  VP9_COMP *const cpi = ctx->cpi;
+  const unsigned int data = va_arg(args, unsigned int);
+  cpi->loopfilter_ctrl = data;
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx,
+                                                      va_list args) {
+  vpx_rc_funcs_t funcs = *CAST(VP9E_SET_EXTERNAL_RATE_CONTROL, args);
+  VP9_COMP *cpi = ctx->cpi;
+  EXT_RATECTRL *ext_ratectrl = &cpi->ext_ratectrl;
+  const VP9EncoderConfig *oxcf = &cpi->oxcf;
+  // TODO(angiebird): Check the possibility of this flag being set at pass == 1
+  if (oxcf->pass == 2) {
+    const FRAME_INFO *frame_info = &cpi->frame_info;
+    vpx_rc_config_t ratectrl_config;
+    vpx_codec_err_t codec_status;
+    memset(&ratectrl_config, 0, sizeof(ratectrl_config));
+
+    ratectrl_config.frame_width = frame_info->frame_width;
+    ratectrl_config.frame_height = frame_info->frame_height;
+    ratectrl_config.show_frame_count = cpi->twopass.first_pass_info.num_frames;
+    ratectrl_config.max_gf_interval = oxcf->max_gf_interval;
+    ratectrl_config.min_gf_interval = oxcf->min_gf_interval;
+    // TODO(angiebird): Double check whether this is the proper way to set up
+    // target_bitrate and frame_rate.
+    ratectrl_config.target_bitrate_kbps = (int)(oxcf->target_bandwidth / 1000);
+    ratectrl_config.frame_rate_num = oxcf->g_timebase.den;
+    ratectrl_config.frame_rate_den = oxcf->g_timebase.num;
+    ratectrl_config.overshoot_percent = oxcf->over_shoot_pct;
+    ratectrl_config.undershoot_percent = oxcf->under_shoot_pct;
+
+    if (oxcf->rc_mode == VPX_VBR) {
+      ratectrl_config.rc_mode = VPX_RC_VBR;
+    } else if (oxcf->rc_mode == VPX_Q) {
+      ratectrl_config.rc_mode = VPX_RC_QMODE;
+    } else if (oxcf->rc_mode == VPX_CQ) {
+      ratectrl_config.rc_mode = VPX_RC_CQ;
+    }
+
+    codec_status = vp9_extrc_create(funcs, ratectrl_config, ext_ratectrl);
+    if (codec_status != VPX_CODEC_OK) {
+      return codec_status;
+    }
+  }
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_set_quantizer_one_pass(vpx_codec_alg_priv_t *ctx,
+                                                   va_list args) {
+  VP9_COMP *const cpi = ctx->cpi;
+  const int qp = va_arg(args, int);
+  vpx_codec_enc_cfg_t *cfg = &ctx->cfg;
+  struct vp9_extracfg extra_cfg = ctx->extra_cfg;
+  vpx_codec_err_t res;
+
+  if (qp < 0 || qp > 63) return VPX_CODEC_INVALID_PARAM;
+
+  cfg->rc_min_quantizer = cfg->rc_max_quantizer = qp;
+  extra_cfg.aq_mode = 0;
+  cpi->fixed_qp_onepass = 1;
+
+  res = update_extra_cfg(ctx, &extra_cfg);
+  return res;
+}
+
 static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   { VP8_COPY_REFERENCE, ctrl_copy_reference },
 
@@ -1760,16 +2049,23 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   { VP9E_SET_TARGET_LEVEL, ctrl_set_target_level },
   { VP9E_SET_ROW_MT, ctrl_set_row_mt },
   { VP9E_SET_POSTENCODE_DROP, ctrl_set_postencode_drop },
+  { VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, ctrl_set_disable_overshoot_maxq_cbr },
   { VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, ctrl_enable_motion_vector_unit_test },
   { VP9E_SET_SVC_INTER_LAYER_PRED, ctrl_set_svc_inter_layer_pred },
   { VP9E_SET_SVC_FRAME_DROP_LAYER, ctrl_set_svc_frame_drop_layer },
   { VP9E_SET_SVC_GF_TEMPORAL_REF, ctrl_set_svc_gf_temporal_ref },
   { VP9E_SET_SVC_SPATIAL_LAYER_SYNC, ctrl_set_svc_spatial_layer_sync },
   { VP9E_SET_DELTA_Q_UV, ctrl_set_delta_q_uv },
+  { VP9E_SET_DISABLE_LOOPFILTER, ctrl_set_disable_loopfilter },
+  { VP9E_SET_RTC_EXTERNAL_RATECTRL, ctrl_set_rtc_external_ratectrl },
+  { VP9E_SET_EXTERNAL_RATE_CONTROL, ctrl_set_external_rate_control },
+  { VP9E_SET_QUANTIZER_ONE_PASS, ctrl_set_quantizer_one_pass },
 
   // Getters
   { VP8E_GET_LAST_QUANTIZER, ctrl_get_quantizer },
   { VP8E_GET_LAST_QUANTIZER_64, ctrl_get_quantizer64 },
+  { VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, ctrl_get_quantizer_svc_layers },
+  { VP9E_GET_LOOPFILTER_LEVEL, ctrl_get_loopfilter_level },
   { VP9_GET_REFERENCE, ctrl_get_reference },
   { VP9E_GET_SVC_LAYER_ID, ctrl_get_svc_layer_id },
   { VP9E_GET_ACTIVEMAP, ctrl_get_active_map },
@@ -1804,8 +2100,8 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
         0,   // rc_resize_allowed
         0,   // rc_scaled_width
         0,   // rc_scaled_height
-        60,  // rc_resize_down_thresold
-        30,  // rc_resize_up_thresold
+        60,  // rc_resize_down_thresh
+        30,  // rc_resize_up_thresh
 
         VPX_VBR,      // rc_end_usage
         { NULL, 0 },  // rc_twopass_stats_in
@@ -1832,14 +2128,30 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
 
         VPX_SS_DEFAULT_LAYERS,  // ss_number_layers
         { 0 },
-        { 0 },  // ss_target_bitrate
-        1,      // ts_number_layers
-        { 0 },  // ts_target_bitrate
-        { 0 },  // ts_rate_decimator
-        0,      // ts_periodicity
-        { 0 },  // ts_layer_id
-        { 0 },  // layer_taget_bitrate
-        0       // temporal_layering_mode
+        { 0 },     // ss_target_bitrate
+        1,         // ts_number_layers
+        { 0 },     // ts_target_bitrate
+        { 0 },     // ts_rate_decimator
+        0,         // ts_periodicity
+        { 0 },     // ts_layer_id
+        { 0 },     // layer_target_bitrate
+        0,         // temporal_layering_mode
+        0,         // use_vizier_rc_params
+        { 1, 1 },  // active_wq_factor
+        { 1, 1 },  // err_per_mb_factor
+        { 1, 1 },  // sr_default_decay_limit
+        { 1, 1 },  // sr_diff_factor
+        { 1, 1 },  // kf_err_per_mb_factor
+        { 1, 1 },  // kf_frame_min_boost_factor
+        { 1, 1 },  // kf_frame_max_boost_first_factor
+        { 1, 1 },  // kf_frame_max_boost_subs_factor
+        { 1, 1 },  // kf_max_total_boost_factor
+        { 1, 1 },  // gf_max_total_boost_factor
+        { 1, 1 },  // gf_frame_max_boost_factor
+        { 1, 1 },  // zm_factor
+        { 1, 1 },  // rd_mult_inter_qp_fac
+        { 1, 1 },  // rd_mult_arf_qp_fac
+        { 1, 1 },  // rd_mult_key_qp_fac
     } },
 };
 
@@ -1899,7 +2211,8 @@ static vp9_extracfg get_extra_cfg() {
 
 VP9EncoderConfig vp9_get_encoder_config(int frame_width, int frame_height,
                                         vpx_rational_t frame_rate,
-                                        int target_bitrate,
+                                        int target_bitrate, int encode_speed,
+                                        int target_level,
                                         vpx_enc_pass enc_pass) {
   /* This function will generate the same VP9EncoderConfig used by the
    * vpxenc command given below.
@@ -1910,6 +2223,8 @@ VP9EncoderConfig vp9_get_encoder_config(int frame_width, int frame_height,
    * HEIGHT:  frame_height
    * FPS:     frame_rate
    * BITRATE: target_bitrate
+   * CPU_USED:encode_speed
+   * TARGET_LEVEL: target_level
    *
    * INPUT, OUTPUT, LIMIT will not affect VP9EncoderConfig
    *
@@ -1921,15 +2236,17 @@ VP9EncoderConfig vp9_get_encoder_config(int frame_width, int frame_height,
    * BITRATE=600
    * FPS=30/1
    * LIMIT=150
+   * CPU_USED=0
+   * TARGET_LEVEL=0
    * ./vpxenc --limit=$LIMIT --width=$WIDTH --height=$HEIGHT --fps=$FPS
    * --lag-in-frames=25 \
-   *  --codec=vp9 --good --cpu-used=0 --threads=0 --profile=0 \
+   *  --codec=vp9 --good --cpu-used=CPU_USED --threads=0 --profile=0 \
    *  --min-q=0 --max-q=63 --auto-alt-ref=1 --passes=2 --kf-max-dist=150 \
    *  --kf-min-dist=0 --drop-frame=0 --static-thresh=0 --bias-pct=50 \
    *  --minsection-pct=0 --maxsection-pct=150 --arnr-maxframes=7 --psnr \
    *  --arnr-strength=5 --sharpness=0 --undershoot-pct=100 --overshoot-pct=100 \
    *  --frame-parallel=0 --tile-columns=0 --cpu-used=0 --end-usage=vbr \
-   *  --target-bitrate=$BITRATE -o $OUTPUT $INPUT
+   *  --target-bitrate=$BITRATE --target-level=0 -o $OUTPUT $INPUT
    */
 
   VP9EncoderConfig oxcf;
@@ -1946,49 +2263,51 @@ VP9EncoderConfig vp9_get_encoder_config(int frame_width, int frame_height,
   oxcf.tile_columns = 0;
   oxcf.frame_parallel_decoding_mode = 0;
   oxcf.two_pass_vbrmax_section = 150;
+  oxcf.speed = abs(encode_speed);
+  oxcf.target_level = target_level;
   return oxcf;
 }
 
-#define DUMP_STRUCT_VALUE(struct, value) \
-  printf(#value " %" PRId64 "\n", (int64_t)(struct)->value)
+#define DUMP_STRUCT_VALUE(fp, structure, value) \
+  fprintf(fp, #value " %" PRId64 "\n", (int64_t)(structure)->value)
 
-void vp9_dump_encoder_config(const VP9EncoderConfig *oxcf) {
-  DUMP_STRUCT_VALUE(oxcf, profile);
-  DUMP_STRUCT_VALUE(oxcf, bit_depth);
-  DUMP_STRUCT_VALUE(oxcf, width);
-  DUMP_STRUCT_VALUE(oxcf, height);
-  DUMP_STRUCT_VALUE(oxcf, input_bit_depth);
-  DUMP_STRUCT_VALUE(oxcf, init_framerate);
+void vp9_dump_encoder_config(const VP9EncoderConfig *oxcf, FILE *fp) {
+  DUMP_STRUCT_VALUE(fp, oxcf, profile);
+  DUMP_STRUCT_VALUE(fp, oxcf, bit_depth);
+  DUMP_STRUCT_VALUE(fp, oxcf, width);
+  DUMP_STRUCT_VALUE(fp, oxcf, height);
+  DUMP_STRUCT_VALUE(fp, oxcf, input_bit_depth);
+  DUMP_STRUCT_VALUE(fp, oxcf, init_framerate);
   // TODO(angiebird): dump g_timebase
   // TODO(angiebird): dump g_timebase_in_ts
 
-  DUMP_STRUCT_VALUE(oxcf, target_bandwidth);
+  DUMP_STRUCT_VALUE(fp, oxcf, target_bandwidth);
 
-  DUMP_STRUCT_VALUE(oxcf, noise_sensitivity);
-  DUMP_STRUCT_VALUE(oxcf, sharpness);
-  DUMP_STRUCT_VALUE(oxcf, speed);
-  DUMP_STRUCT_VALUE(oxcf, rc_max_intra_bitrate_pct);
-  DUMP_STRUCT_VALUE(oxcf, rc_max_inter_bitrate_pct);
-  DUMP_STRUCT_VALUE(oxcf, gf_cbr_boost_pct);
+  DUMP_STRUCT_VALUE(fp, oxcf, noise_sensitivity);
+  DUMP_STRUCT_VALUE(fp, oxcf, sharpness);
+  DUMP_STRUCT_VALUE(fp, oxcf, speed);
+  DUMP_STRUCT_VALUE(fp, oxcf, rc_max_intra_bitrate_pct);
+  DUMP_STRUCT_VALUE(fp, oxcf, rc_max_inter_bitrate_pct);
+  DUMP_STRUCT_VALUE(fp, oxcf, gf_cbr_boost_pct);
 
-  DUMP_STRUCT_VALUE(oxcf, mode);
-  DUMP_STRUCT_VALUE(oxcf, pass);
+  DUMP_STRUCT_VALUE(fp, oxcf, mode);
+  DUMP_STRUCT_VALUE(fp, oxcf, pass);
 
   // Key Framing Operations
-  DUMP_STRUCT_VALUE(oxcf, auto_key);
-  DUMP_STRUCT_VALUE(oxcf, key_freq);
+  DUMP_STRUCT_VALUE(fp, oxcf, auto_key);
+  DUMP_STRUCT_VALUE(fp, oxcf, key_freq);
 
-  DUMP_STRUCT_VALUE(oxcf, lag_in_frames);
+  DUMP_STRUCT_VALUE(fp, oxcf, lag_in_frames);
 
   // ----------------------------------------------------------------
   // DATARATE CONTROL OPTIONS
 
   // vbr, cbr, constrained quality or constant quality
-  DUMP_STRUCT_VALUE(oxcf, rc_mode);
+  DUMP_STRUCT_VALUE(fp, oxcf, rc_mode);
 
   // buffer targeting aggressiveness
-  DUMP_STRUCT_VALUE(oxcf, under_shoot_pct);
-  DUMP_STRUCT_VALUE(oxcf, over_shoot_pct);
+  DUMP_STRUCT_VALUE(fp, oxcf, under_shoot_pct);
+  DUMP_STRUCT_VALUE(fp, oxcf, over_shoot_pct);
 
   // buffering parameters
   // TODO(angiebird): dump tarting_buffer_level_ms
@@ -1996,37 +2315,37 @@ void vp9_dump_encoder_config(const VP9EncoderConfig *oxcf) {
   // TODO(angiebird): dump maximum_buffer_size_ms
 
   // Frame drop threshold.
-  DUMP_STRUCT_VALUE(oxcf, drop_frames_water_mark);
+  DUMP_STRUCT_VALUE(fp, oxcf, drop_frames_water_mark);
 
   // controlling quality
-  DUMP_STRUCT_VALUE(oxcf, fixed_q);
-  DUMP_STRUCT_VALUE(oxcf, worst_allowed_q);
-  DUMP_STRUCT_VALUE(oxcf, best_allowed_q);
-  DUMP_STRUCT_VALUE(oxcf, cq_level);
-  DUMP_STRUCT_VALUE(oxcf, aq_mode);
+  DUMP_STRUCT_VALUE(fp, oxcf, fixed_q);
+  DUMP_STRUCT_VALUE(fp, oxcf, worst_allowed_q);
+  DUMP_STRUCT_VALUE(fp, oxcf, best_allowed_q);
+  DUMP_STRUCT_VALUE(fp, oxcf, cq_level);
+  DUMP_STRUCT_VALUE(fp, oxcf, aq_mode);
 
   // Special handling of Adaptive Quantization for AltRef frames
-  DUMP_STRUCT_VALUE(oxcf, alt_ref_aq);
+  DUMP_STRUCT_VALUE(fp, oxcf, alt_ref_aq);
 
   // Internal frame size scaling.
-  DUMP_STRUCT_VALUE(oxcf, resize_mode);
-  DUMP_STRUCT_VALUE(oxcf, scaled_frame_width);
-  DUMP_STRUCT_VALUE(oxcf, scaled_frame_height);
+  DUMP_STRUCT_VALUE(fp, oxcf, resize_mode);
+  DUMP_STRUCT_VALUE(fp, oxcf, scaled_frame_width);
+  DUMP_STRUCT_VALUE(fp, oxcf, scaled_frame_height);
 
   // Enable feature to reduce the frame quantization every x frames.
-  DUMP_STRUCT_VALUE(oxcf, frame_periodic_boost);
+  DUMP_STRUCT_VALUE(fp, oxcf, frame_periodic_boost);
 
   // two pass datarate control
-  DUMP_STRUCT_VALUE(oxcf, two_pass_vbrbias);
-  DUMP_STRUCT_VALUE(oxcf, two_pass_vbrmin_section);
-  DUMP_STRUCT_VALUE(oxcf, two_pass_vbrmax_section);
-  DUMP_STRUCT_VALUE(oxcf, vbr_corpus_complexity);
+  DUMP_STRUCT_VALUE(fp, oxcf, two_pass_vbrbias);
+  DUMP_STRUCT_VALUE(fp, oxcf, two_pass_vbrmin_section);
+  DUMP_STRUCT_VALUE(fp, oxcf, two_pass_vbrmax_section);
+  DUMP_STRUCT_VALUE(fp, oxcf, vbr_corpus_complexity);
   // END DATARATE CONTROL OPTIONS
   // ----------------------------------------------------------------
 
   // Spatial and temporal scalability.
-  DUMP_STRUCT_VALUE(oxcf, ss_number_layers);
-  DUMP_STRUCT_VALUE(oxcf, ts_number_layers);
+  DUMP_STRUCT_VALUE(fp, oxcf, ss_number_layers);
+  DUMP_STRUCT_VALUE(fp, oxcf, ts_number_layers);
 
   // Bitrate allocation for spatial layers.
   // TODO(angiebird): dump layer_target_bitrate[VPX_MAX_LAYERS]
@@ -2034,45 +2353,42 @@ void vp9_dump_encoder_config(const VP9EncoderConfig *oxcf) {
   // TODO(angiebird): dump ss_enable_auto_arf[VPX_SS_MAX_LAYERS]
   // TODO(angiebird): dump ts_rate_decimator[VPX_TS_MAX_LAYERS]
 
-  DUMP_STRUCT_VALUE(oxcf, enable_auto_arf);
-  DUMP_STRUCT_VALUE(oxcf, encode_breakout);
-  DUMP_STRUCT_VALUE(oxcf, error_resilient_mode);
-  DUMP_STRUCT_VALUE(oxcf, frame_parallel_decoding_mode);
+  DUMP_STRUCT_VALUE(fp, oxcf, enable_auto_arf);
+  DUMP_STRUCT_VALUE(fp, oxcf, encode_breakout);
+  DUMP_STRUCT_VALUE(fp, oxcf, error_resilient_mode);
+  DUMP_STRUCT_VALUE(fp, oxcf, frame_parallel_decoding_mode);
 
-  DUMP_STRUCT_VALUE(oxcf, arnr_max_frames);
-  DUMP_STRUCT_VALUE(oxcf, arnr_strength);
+  DUMP_STRUCT_VALUE(fp, oxcf, arnr_max_frames);
+  DUMP_STRUCT_VALUE(fp, oxcf, arnr_strength);
 
-  DUMP_STRUCT_VALUE(oxcf, min_gf_interval);
-  DUMP_STRUCT_VALUE(oxcf, max_gf_interval);
+  DUMP_STRUCT_VALUE(fp, oxcf, min_gf_interval);
+  DUMP_STRUCT_VALUE(fp, oxcf, max_gf_interval);
 
-  DUMP_STRUCT_VALUE(oxcf, tile_columns);
-  DUMP_STRUCT_VALUE(oxcf, tile_rows);
+  DUMP_STRUCT_VALUE(fp, oxcf, tile_columns);
+  DUMP_STRUCT_VALUE(fp, oxcf, tile_rows);
 
-  DUMP_STRUCT_VALUE(oxcf, enable_tpl_model);
+  DUMP_STRUCT_VALUE(fp, oxcf, enable_tpl_model);
 
-  DUMP_STRUCT_VALUE(oxcf, max_threads);
+  DUMP_STRUCT_VALUE(fp, oxcf, max_threads);
 
-  DUMP_STRUCT_VALUE(oxcf, target_level);
+  DUMP_STRUCT_VALUE(fp, oxcf, target_level);
 
   // TODO(angiebird): dump two_pass_stats_in
-
-#if CONFIG_FP_MB_STATS
-  // TODO(angiebird): dump firstpass_mb_stats_in
-#endif
-
-  DUMP_STRUCT_VALUE(oxcf, tuning);
-  DUMP_STRUCT_VALUE(oxcf, content);
+  DUMP_STRUCT_VALUE(fp, oxcf, tuning);
+  DUMP_STRUCT_VALUE(fp, oxcf, content);
 #if CONFIG_VP9_HIGHBITDEPTH
-  DUMP_STRUCT_VALUE(oxcf, use_highbitdepth);
+  DUMP_STRUCT_VALUE(fp, oxcf, use_highbitdepth);
 #endif
-  DUMP_STRUCT_VALUE(oxcf, color_space);
-  DUMP_STRUCT_VALUE(oxcf, color_range);
-  DUMP_STRUCT_VALUE(oxcf, render_width);
-  DUMP_STRUCT_VALUE(oxcf, render_height);
-  DUMP_STRUCT_VALUE(oxcf, temporal_layering_mode);
-
-  DUMP_STRUCT_VALUE(oxcf, row_mt);
-  DUMP_STRUCT_VALUE(oxcf, motion_vector_unit_test);
+  DUMP_STRUCT_VALUE(fp, oxcf, color_space);
+  DUMP_STRUCT_VALUE(fp, oxcf, color_range);
+  DUMP_STRUCT_VALUE(fp, oxcf, render_width);
+  DUMP_STRUCT_VALUE(fp, oxcf, render_height);
+  DUMP_STRUCT_VALUE(fp, oxcf, temporal_layering_mode);
+
+  DUMP_STRUCT_VALUE(fp, oxcf, row_mt);
+  DUMP_STRUCT_VALUE(fp, oxcf, motion_vector_unit_test);
+  DUMP_STRUCT_VALUE(fp, oxcf, delta_q_uv);
+  DUMP_STRUCT_VALUE(fp, oxcf, use_simple_encode_api);
 }
 
 FRAME_INFO vp9_get_frame_info(const VP9EncoderConfig *oxcf) {
diff --git a/vp9/vp9_cx_iface.h b/vp9/vp9_cx_iface.h
index 08569fcc96..f2de8507ff 100644
--- a/vp9/vp9_cx_iface.h
+++ b/vp9/vp9_cx_iface.h
@@ -19,10 +19,11 @@ extern "C" {
 
 VP9EncoderConfig vp9_get_encoder_config(int frame_width, int frame_height,
                                         vpx_rational_t frame_rate,
-                                        int target_bitrate,
+                                        int target_bitrate, int encode_speed,
+                                        int target_level,
                                         vpx_enc_pass enc_pass);
 
-void vp9_dump_encoder_config(const VP9EncoderConfig *oxcf);
+void vp9_dump_encoder_config(const VP9EncoderConfig *oxcf, FILE *fp);
 
 FRAME_INFO vp9_get_frame_info(const VP9EncoderConfig *oxcf);
 
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 35ecbaff37..860f721dc5 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -201,7 +201,7 @@ static vpx_codec_err_t update_error_state(
   return error->error_code;
 }
 
-static void init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) {
+static vpx_codec_err_t init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) {
   VP9_COMMON *const cm = &ctx->pbi->common;
   BufferPool *const pool = cm->buffer_pool;
 
@@ -217,12 +217,16 @@ static void init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) {
     pool->get_fb_cb = vp9_get_frame_buffer;
     pool->release_fb_cb = vp9_release_frame_buffer;
 
-    if (vp9_alloc_internal_frame_buffers(&pool->int_frame_buffers))
+    if (vp9_alloc_internal_frame_buffers(&pool->int_frame_buffers)) {
       vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                          "Failed to initialize internal frame buffers");
+      return VPX_CODEC_MEM_ERROR;
+    }
 
     pool->cb_priv = &pool->int_frame_buffers;
   }
+
+  return VPX_CODEC_OK;
 }
 
 static void set_default_ppflags(vp8_postproc_cfg_t *cfg) {
@@ -252,6 +256,7 @@ static void set_ppflags(const vpx_codec_alg_priv_t *ctx, vp9_ppflags_t *flags) {
   } while (0)
 
 static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) {
+  vpx_codec_err_t res;
   ctx->last_show_frame = -1;
   ctx->need_resync = 1;
   ctx->flushed = 0;
@@ -261,6 +266,8 @@ static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) {
 
   ctx->pbi = vp9_decoder_create(ctx->buffer_pool);
   if (ctx->pbi == NULL) {
+    vpx_free(ctx->buffer_pool);
+    ctx->buffer_pool = NULL;
     set_error_detail(ctx, "Failed to allocate decoder");
     return VPX_CODEC_MEM_ERROR;
   }
@@ -278,9 +285,14 @@ static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) {
   if (!ctx->postproc_cfg_set && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC))
     set_default_ppflags(&ctx->postproc_cfg);
 
-  init_buffer_callbacks(ctx);
-
-  return VPX_CODEC_OK;
+  res = init_buffer_callbacks(ctx);
+  if (res != VPX_CODEC_OK) {
+    vpx_free(ctx->buffer_pool);
+    ctx->buffer_pool = NULL;
+    vp9_decoder_remove(ctx->pbi);
+    ctx->pbi = NULL;
+  }
+  return res;
 }
 
 static INLINE void check_resync(vpx_codec_alg_priv_t *const ctx,
@@ -293,9 +305,7 @@ static INLINE void check_resync(vpx_codec_alg_priv_t *const ctx,
 
 static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
                                   const uint8_t **data, unsigned int data_sz,
-                                  void *user_priv, int64_t deadline) {
-  (void)deadline;
-
+                                  void *user_priv) {
   // Determine the stream parameters. Note that we rely on peek_si to
   // validate that we have a buffer that does not wrap around the top
   // of the heap.
@@ -330,9 +340,8 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
 
 static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
                                       const uint8_t *data, unsigned int data_sz,
-                                      void *user_priv, long deadline) {
+                                      void *user_priv) {
   const uint8_t *data_start = data;
-  const uint8_t *const data_end = data + data_sz;
   vpx_codec_err_t res;
   uint32_t frame_sizes[8];
   int frame_count;
@@ -347,7 +356,7 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
 
   // Initialize the decoder on the first frame.
   if (ctx->pbi == NULL) {
-    const vpx_codec_err_t res = init_decoder(ctx);
+    res = init_decoder(ctx);
     if (res != VPX_CODEC_OK) return res;
   }
 
@@ -360,27 +369,27 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
 
   // Decode in serial mode.
   if (frame_count > 0) {
+    const uint8_t *const data_end = data + data_sz;
     int i;
 
     for (i = 0; i < frame_count; ++i) {
       const uint8_t *data_start_copy = data_start;
       const uint32_t frame_size = frame_sizes[i];
-      vpx_codec_err_t res;
       if (data_start < data || frame_size > (uint32_t)(data_end - data_start)) {
         set_error_detail(ctx, "Invalid frame size in index");
         return VPX_CODEC_CORRUPT_FRAME;
       }
 
-      res = decode_one(ctx, &data_start_copy, frame_size, user_priv, deadline);
+      res = decode_one(ctx, &data_start_copy, frame_size, user_priv);
       if (res != VPX_CODEC_OK) return res;
 
       data_start += frame_size;
     }
   } else {
+    const uint8_t *const data_end = data + data_sz;
     while (data_start < data_end) {
       const uint32_t frame_size = (uint32_t)(data_end - data_start);
-      const vpx_codec_err_t res =
-          decode_one(ctx, &data_start, frame_size, user_priv, deadline);
+      res = decode_one(ctx, &data_start, frame_size, user_priv);
       if (res != VPX_CODEC_OK) return res;
 
       // Account for suboptimal termination by the encoder.
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 666f228827..44790ef6a4 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -40,6 +40,7 @@ VP9_CX_SRCS-yes += encoder/vp9_encodemb.h
 VP9_CX_SRCS-yes += encoder/vp9_encodemv.h
 VP9_CX_SRCS-yes += encoder/vp9_extend.h
 VP9_CX_SRCS-yes += encoder/vp9_firstpass.h
+VP9_CX_SRCS-yes += encoder/vp9_firstpass_stats.h
 VP9_CX_SRCS-yes += encoder/vp9_frame_scale.c
 VP9_CX_SRCS-yes += encoder/vp9_job_queue.h
 VP9_CX_SRCS-yes += encoder/vp9_lookahead.c
@@ -96,35 +97,40 @@ VP9_CX_SRCS-yes += encoder/vp9_skin_detection.c
 VP9_CX_SRCS-yes += encoder/vp9_skin_detection.h
 VP9_CX_SRCS-yes += encoder/vp9_noise_estimate.c
 VP9_CX_SRCS-yes += encoder/vp9_noise_estimate.h
+VP9_CX_SRCS-yes += encoder/vp9_ext_ratectrl.c
+VP9_CX_SRCS-yes += encoder/vp9_ext_ratectrl.h
 ifeq ($(CONFIG_VP9_POSTPROC),yes)
 VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.h
 VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.c
 endif
 VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.c
 VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h
+VP9_CX_SRCS-yes += encoder/vp9_tpl_model.c
+VP9_CX_SRCS-yes += encoder/vp9_tpl_model.h
 VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
 VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
 
 VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_sse4.c
-VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_constants.h
+VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/vp9_temporal_filter_constants.h
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_temporal_filter_neon.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/vp9_temporal_filter_constants.h
 
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
+VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3.c
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_quantize_avx2.c
-VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_diamond_search_sad_neon.c
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c
 VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/highbd_temporal_filter_sse4.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_highbd_temporal_filter_neon.c
 endif
 
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
 
-ifeq ($(VPX_ARCH_X86_64),yes)
-VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm
-endif
-
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_intrin_sse2.c
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_frame_scale_ssse3.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c
 
 ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes)
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_denoiser_sse2.c
@@ -133,11 +139,12 @@ endif
 
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_avx2.c
 
-ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_error_neon.c
-endif
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_frame_scale_neon.c
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_highbd_error_neon.c
+endif
 
 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_error_msa.c
 
@@ -155,8 +162,10 @@ VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_firstpass.c
 VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_mbgraph.c
 VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_temporal_filter.c
 VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/temporal_filter_sse4.c
-VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/temporal_filter_constants.h
+VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_temporal_filter_constants.h
 VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/highbd_temporal_filter_sse4.c
+VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/arm/neon/vp9_temporal_filter_neon.c
+VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/arm/neon/vp9_highbd_temporal_filter_neon.c
 VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_alt_ref_aq.h
 VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_alt_ref_aq.c
 VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_aq_variance.c
diff --git a/vpx/exports_com b/vpx/exports_com
index 2ab05099f8..f0b46aa175 100644
--- a/vpx/exports_com
+++ b/vpx/exports_com
@@ -14,3 +14,6 @@ text vpx_img_flip
 text vpx_img_free
 text vpx_img_set_rect
 text vpx_img_wrap
+text vpx_free_tpl_gop_stats
+text vpx_read_tpl_gop_stats
+text vpx_write_tpl_gop_stats
diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h
index 4ef93057f5..275b6a436c 100644
--- a/vpx/internal/vpx_codec_internal.h
+++ b/vpx/internal/vpx_codec_internal.h
@@ -48,6 +48,8 @@
 #include "../vpx_encoder.h"
 #include <stdarg.h>
 
+#include "vpx_config.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -196,8 +198,7 @@ typedef const struct vpx_codec_ctrl_fn_map {
 typedef vpx_codec_err_t (*vpx_codec_decode_fn_t)(vpx_codec_alg_priv_t *ctx,
                                                  const uint8_t *data,
                                                  unsigned int data_sz,
-                                                 void *user_priv,
-                                                 long deadline);
+                                                 void *user_priv);
 
 /*!\brief Decoded frames iterator
  *
@@ -253,7 +254,7 @@ typedef vpx_codec_err_t (*vpx_codec_encode_fn_t)(vpx_codec_alg_priv_t *ctx,
                                                  vpx_codec_pts_t pts,
                                                  unsigned long duration,
                                                  vpx_enc_frame_flags_t flags,
-                                                 unsigned long deadline);
+                                                 vpx_enc_deadline_t deadline);
 typedef const vpx_codec_cx_pkt_t *(*vpx_codec_get_cx_data_fn_t)(
     vpx_codec_alg_priv_t *ctx, vpx_codec_iter_t *iter);
 
@@ -283,7 +284,7 @@ typedef const struct vpx_codec_enc_cfg_map {
   vpx_codec_enc_cfg_t cfg;
 } vpx_codec_enc_cfg_map_t;
 
-/*!\brief Decoder algorithm interface interface
+/*!\brief Decoder algorithm interface
  *
  * All decoders \ref MUST expose a variable of this type.
  */
@@ -427,6 +428,27 @@ struct vpx_internal_error_info {
   jmp_buf jmp;
 };
 
+#if CONFIG_DEBUG
+#define CHECK_MEM_ERROR(error, lval, expr)                                  \
+  do {                                                                      \
+    assert((error)->setjmp);                                                \
+    (lval) = (expr);                                                        \
+    if (!(lval))                                                            \
+      vpx_internal_error(error, VPX_CODEC_MEM_ERROR,                        \
+                         "Failed to allocate " #lval " at %s:%d", __FILE__, \
+                         __LINE__);                                         \
+  } while (0)
+#else
+#define CHECK_MEM_ERROR(error, lval, expr)             \
+  do {                                                 \
+    assert((error)->setjmp);                           \
+    (lval) = (expr);                                   \
+    if (!(lval))                                       \
+      vpx_internal_error(error, VPX_CODEC_MEM_ERROR,   \
+                         "Failed to allocate " #lval); \
+  } while (0)
+#endif
+
 #define CLANG_ANALYZER_NORETURN
 #if defined(__has_feature)
 #if __has_feature(attribute_analyzer_noreturn)
@@ -435,9 +457,21 @@ struct vpx_internal_error_info {
 #endif
 #endif
 
+// Tells the compiler to perform `printf` format string checking if the
+// compiler supports it; see the 'format' attribute in
+// <https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html>.
+#define LIBVPX_FORMAT_PRINTF(string_index, first_to_check)
+#if defined(__has_attribute)
+#if __has_attribute(format)
+#undef LIBVPX_FORMAT_PRINTF
+#define LIBVPX_FORMAT_PRINTF(string_index, first_to_check) \
+  __attribute__((__format__(__printf__, string_index, first_to_check)))
+#endif
+#endif
+
 void vpx_internal_error(struct vpx_internal_error_info *info,
-                        vpx_codec_err_t error, const char *fmt,
-                        ...) CLANG_ANALYZER_NORETURN;
+                        vpx_codec_err_t error, const char *fmt, ...)
+    LIBVPX_FORMAT_PRINTF(3, 4) CLANG_ANALYZER_NORETURN;
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vpx/internal/vpx_ratectrl_rtc.h b/vpx/internal/vpx_ratectrl_rtc.h
new file mode 100644
index 0000000000..01d64b14b7
--- /dev/null
+++ b/vpx/internal/vpx_ratectrl_rtc.h
@@ -0,0 +1,74 @@
+/*
+ *  Copyright (c) 2021 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_INTERNAL_VPX_RATECTRL_RTC_H_
+#define VPX_VPX_INTERNAL_VPX_RATECTRL_RTC_H_
+
+#include "vpx/vpx_encoder.h"
+
+namespace libvpx {
+
+enum class RcFrameType { kKeyFrame = 0, kInterFrame = 1 };
+
+enum class FrameDropDecision {
+  kOk,    // Frame is encoded.
+  kDrop,  // Frame is dropped.
+};
+
+struct VpxRateControlRtcConfig {
+ public:
+  VpxRateControlRtcConfig() {
+    width = 1280;
+    height = 720;
+    max_quantizer = 63;
+    min_quantizer = 2;
+    target_bandwidth = 1000;
+    buf_initial_sz = 600;
+    buf_optimal_sz = 600;
+    buf_sz = 1000;
+    undershoot_pct = overshoot_pct = 50;
+    max_intra_bitrate_pct = 50;
+    max_inter_bitrate_pct = 0;
+    framerate = 30.0;
+    ts_number_layers = 1;
+    rc_mode = VPX_CBR;
+    aq_mode = 0;
+    layer_target_bitrate[0] = static_cast<int>(target_bandwidth);
+    ts_rate_decimator[0] = 1;
+    frame_drop_thresh = 0;
+    is_screen = false;
+  }
+
+  int width;
+  int height;
+  // 0-63
+  int max_quantizer;
+  int min_quantizer;
+  int64_t target_bandwidth;
+  int64_t buf_initial_sz;
+  int64_t buf_optimal_sz;
+  int64_t buf_sz;
+  int undershoot_pct;
+  int overshoot_pct;
+  int max_intra_bitrate_pct;
+  int max_inter_bitrate_pct;
+  double framerate;
+  // Number of temporal layers
+  int ts_number_layers;
+  int layer_target_bitrate[VPX_MAX_LAYERS];
+  int ts_rate_decimator[VPX_TS_MAX_LAYERS];
+  // vbr, cbr
+  enum vpx_rc_mode rc_mode;
+  int aq_mode;
+  int frame_drop_thresh;
+  bool is_screen;
+};
+}  // namespace libvpx
+#endif  // VPX_VPX_INTERNAL_VPX_RATECTRL_RTC_H_
diff --git a/vpx/src/vpx_codec.c b/vpx/src/vpx_codec.c
index 114b94e194..24528d860a 100644
--- a/vpx/src/vpx_codec.c
+++ b/vpx/src/vpx_codec.c
@@ -50,12 +50,12 @@ const char *vpx_codec_err_to_string(vpx_codec_err_t err) {
   return "Unrecognized error code";
 }
 
-const char *vpx_codec_error(vpx_codec_ctx_t *ctx) {
+const char *vpx_codec_error(const vpx_codec_ctx_t *ctx) {
   return (ctx) ? vpx_codec_err_to_string(ctx->err)
                : vpx_codec_err_to_string(VPX_CODEC_INVALID_PARAM);
 }
 
-const char *vpx_codec_error_detail(vpx_codec_ctx_t *ctx) {
+const char *vpx_codec_error_detail(const vpx_codec_ctx_t *ctx) {
   if (ctx && ctx->err)
     return ctx->priv ? ctx->priv->err_detail : ctx->err_detail;
 
@@ -82,7 +82,7 @@ vpx_codec_err_t vpx_codec_destroy(vpx_codec_ctx_t *ctx) {
 }
 
 vpx_codec_caps_t vpx_codec_get_caps(vpx_codec_iface_t *iface) {
-  return (iface) ? iface->caps : 0;
+  return iface ? iface->caps : 0;
 }
 
 vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...) {
diff --git a/vpx/src/vpx_decoder.c b/vpx/src/vpx_decoder.c
index 427cd1bf43..c79cc708cd 100644
--- a/vpx/src/vpx_decoder.c
+++ b/vpx/src/vpx_decoder.c
@@ -105,6 +105,7 @@ vpx_codec_err_t vpx_codec_decode(vpx_codec_ctx_t *ctx, const uint8_t *data,
                                  unsigned int data_sz, void *user_priv,
                                  long deadline) {
   vpx_codec_err_t res;
+  (void)deadline;
 
   /* Sanity checks */
   /* NULL data ptr allowed if data_sz is 0 too */
@@ -112,10 +113,8 @@ vpx_codec_err_t vpx_codec_decode(vpx_codec_ctx_t *ctx, const uint8_t *data,
     res = VPX_CODEC_INVALID_PARAM;
   else if (!ctx->iface || !ctx->priv)
     res = VPX_CODEC_ERROR;
-  else {
-    res = ctx->iface->dec.decode(get_alg_priv(ctx), data, data_sz, user_priv,
-                                 deadline);
-  }
+  else
+    res = ctx->iface->dec.decode(get_alg_priv(ctx), data, data_sz, user_priv);
 
   return SAVE_STATUS(ctx, res);
 }
diff --git a/vpx/src/vpx_encoder.c b/vpx/src/vpx_encoder.c
index f636b54a33..017525aeee 100644
--- a/vpx/src/vpx_encoder.c
+++ b/vpx/src/vpx_encoder.c
@@ -54,6 +54,10 @@ vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx,
     res = ctx->iface->init(ctx, NULL);
 
     if (res) {
+      // IMPORTANT: ctx->priv->err_detail must be null or point to a string
+      // that remains valid after ctx->priv is destroyed, such as a C string
+      // literal. This makes it safe to call vpx_codec_error_detail() after
+      // vpx_codec_enc_init_ver() failed.
       ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL;
       vpx_codec_destroy(ctx);
     }
@@ -173,7 +177,7 @@ vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface,
 #include "vpx_ports/x86.h"
 #define FLOATING_POINT_INIT() \
   do {                        \
-    unsigned short x87_orig_mode = x87_set_double_precision();
+  unsigned short x87_orig_mode = x87_set_double_precision()
 #define FLOATING_POINT_RESTORE()       \
   x87_set_control_word(x87_orig_mode); \
   }                                    \
@@ -187,7 +191,7 @@ static void FLOATING_POINT_RESTORE() {}
 vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img,
                                  vpx_codec_pts_t pts, unsigned long duration,
                                  vpx_enc_frame_flags_t flags,
-                                 unsigned long deadline) {
+                                 vpx_enc_deadline_t deadline) {
   vpx_codec_err_t res = VPX_CODEC_OK;
 
   if (!ctx || (img && !duration))
diff --git a/vpx/src/vpx_image.c b/vpx/src/vpx_image.c
index ff496b5d34..f9f0dd6025 100644
--- a/vpx/src/vpx_image.c
+++ b/vpx/src/vpx_image.c
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <limits.h>
 #include <stdlib.h>
 #include <string.h>
 
@@ -22,7 +23,9 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt,
                                      unsigned char *img_data) {
   unsigned int h, w, s, xcs, ycs, bps;
   unsigned int stride_in_bytes;
-  int align;
+  unsigned int align;
+
+  if (img != NULL) memset(img, 0, sizeof(vpx_image_t));
 
   /* Treat align==0 like align==1 */
   if (!buf_align) buf_align = 1;
@@ -88,8 +91,6 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt,
     if (!img) goto fail;
 
     img->self_allocd = 1;
-  } else {
-    memset(img, 0, sizeof(vpx_image_t));
   }
 
   img->img_data = img_data;
@@ -152,9 +153,8 @@ vpx_image_t *vpx_img_wrap(vpx_image_t *img, vpx_img_fmt_t fmt, unsigned int d_w,
 
 int vpx_img_set_rect(vpx_image_t *img, unsigned int x, unsigned int y,
                      unsigned int w, unsigned int h) {
-  unsigned char *data;
-
-  if (x + w <= img->w && y + h <= img->h) {
+  if (x <= UINT_MAX - w && x + w <= img->w && y <= UINT_MAX - h &&
+      y + h <= img->h) {
     img->d_w = w;
     img->d_h = h;
 
@@ -165,7 +165,7 @@ int vpx_img_set_rect(vpx_image_t *img, unsigned int x, unsigned int y,
     } else {
       const int bytes_per_sample =
           (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
-      data = img->img_data;
+      unsigned char *data = img->img_data;
 
       if (img->fmt & VPX_IMG_FMT_HAS_ALPHA) {
         img->planes[VPX_PLANE_ALPHA] =
diff --git a/vpx/src/vpx_tpl.c b/vpx/src/vpx_tpl.c
new file mode 100644
index 0000000000..62c2a9c857
--- /dev/null
+++ b/vpx/src/vpx_tpl.c
@@ -0,0 +1,107 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_tpl.h"
+#include "vpx_mem/vpx_mem.h"
+
+#define CHECK_FPRINTF_ERROR(expr) \
+  do {                            \
+    if (expr < 0) {               \
+      return VPX_CODEC_ERROR;     \
+    }                             \
+  } while (0)
+
+#define CHECK_FSCANF_ERROR(expr, expected_value) \
+  do {                                           \
+    if (expr != expected_value) {                \
+      return VPX_CODEC_ERROR;                    \
+    }                                            \
+  } while (0)
+
+vpx_codec_err_t vpx_write_tpl_gop_stats(FILE *tpl_file,
+                                        const VpxTplGopStats *tpl_gop_stats) {
+  int i;
+  if (tpl_file == NULL || tpl_gop_stats == NULL) return VPX_CODEC_INVALID_PARAM;
+  CHECK_FPRINTF_ERROR(fprintf(tpl_file, "%d\n", tpl_gop_stats->size));
+
+  for (i = 0; i < tpl_gop_stats->size; i++) {
+    VpxTplFrameStats frame_stats = tpl_gop_stats->frame_stats_list[i];
+    const int num_blocks = frame_stats.num_blocks;
+    int block;
+    CHECK_FPRINTF_ERROR(fprintf(tpl_file, "%d %d %d\n", frame_stats.frame_width,
+                                frame_stats.frame_height, num_blocks));
+    for (block = 0; block < num_blocks; block++) {
+      VpxTplBlockStats block_stats = frame_stats.block_stats_list[block];
+      CHECK_FPRINTF_ERROR(
+          fprintf(tpl_file,
+                  "%" PRId64 " %" PRId64 " %" PRId16 " %" PRId16 " %" PRId64
+                  " %" PRId64 " %d\n",
+                  block_stats.inter_cost, block_stats.intra_cost,
+                  block_stats.mv_c, block_stats.mv_r, block_stats.recrf_dist,
+                  block_stats.recrf_rate, block_stats.ref_frame_index));
+    }
+  }
+
+  return VPX_CODEC_OK;
+}
+
+vpx_codec_err_t vpx_read_tpl_gop_stats(FILE *tpl_file,
+                                       VpxTplGopStats *tpl_gop_stats) {
+  int i, frame_list_size;
+  if (tpl_file == NULL || tpl_gop_stats == NULL) return VPX_CODEC_INVALID_PARAM;
+  CHECK_FSCANF_ERROR(fscanf(tpl_file, "%d\n", &frame_list_size), 1);
+  tpl_gop_stats->size = frame_list_size;
+  tpl_gop_stats->frame_stats_list = (VpxTplFrameStats *)vpx_calloc(
+      frame_list_size, sizeof(tpl_gop_stats->frame_stats_list[0]));
+  if (tpl_gop_stats->frame_stats_list == NULL) {
+    return VPX_CODEC_MEM_ERROR;
+  }
+  for (i = 0; i < frame_list_size; i++) {
+    VpxTplFrameStats *frame_stats = &tpl_gop_stats->frame_stats_list[i];
+    int num_blocks, width, height, block;
+    CHECK_FSCANF_ERROR(
+        fscanf(tpl_file, "%d %d %d\n", &width, &height, &num_blocks), 3);
+    frame_stats->num_blocks = num_blocks;
+    frame_stats->frame_width = width;
+    frame_stats->frame_height = height;
+    frame_stats->block_stats_list = (VpxTplBlockStats *)vpx_calloc(
+        num_blocks, sizeof(frame_stats->block_stats_list[0]));
+    if (frame_stats->block_stats_list == NULL) {
+      vpx_free_tpl_gop_stats(tpl_gop_stats);
+      return VPX_CODEC_MEM_ERROR;
+    }
+    for (block = 0; block < num_blocks; block++) {
+      VpxTplBlockStats *block_stats = &frame_stats->block_stats_list[block];
+      CHECK_FSCANF_ERROR(
+          fscanf(tpl_file,
+                 "%" SCNd64 " %" SCNd64 " %" SCNd16 " %" SCNd16 " %" SCNd64
+                 " %" SCNd64 " %d\n",
+                 &block_stats->inter_cost, &block_stats->intra_cost,
+                 &block_stats->mv_c, &block_stats->mv_r,
+                 &block_stats->recrf_dist, &block_stats->recrf_rate,
+                 &block_stats->ref_frame_index),
+          7);
+    }
+  }
+
+  return VPX_CODEC_OK;
+}
+
+void vpx_free_tpl_gop_stats(VpxTplGopStats *tpl_gop_stats) {
+  int frame;
+  if (tpl_gop_stats == NULL) return;
+  for (frame = 0; frame < tpl_gop_stats->size; frame++) {
+    vpx_free(tpl_gop_stats->frame_stats_list[frame].block_stats_list);
+  }
+  vpx_free(tpl_gop_stats->frame_stats_list);
+}
diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
index dcdd710c0b..2875e185e6 100644
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -17,6 +17,7 @@
  */
 #include "./vp8.h"
 #include "./vpx_encoder.h"
+#include "./vpx_ext_ratectrl.h"
 
 /*!\file
  * \brief Provides definitions for using VP8 or VP9 encoder algorithm within the
@@ -32,7 +33,15 @@ extern "C" {
  * This interface provides the capability to encode raw VP8 streams.
  * @{
  */
+
+/*!\brief A single instance of the VP8 encoder.
+ *\deprecated This access mechanism is provided for backwards compatibility;
+ * prefer vpx_codec_vp8_cx().
+ */
 extern vpx_codec_iface_t vpx_codec_vp8_cx_algo;
+
+/*!\brief The interface to the VP8 encoder.
+ */
 extern vpx_codec_iface_t *vpx_codec_vp8_cx(void);
 /*!@} - end algorithm interface member group*/
 
@@ -41,7 +50,15 @@ extern vpx_codec_iface_t *vpx_codec_vp8_cx(void);
  * This interface provides the capability to encode raw VP9 streams.
  * @{
  */
+
+/*!\brief A single instance of the VP9 encoder.
+ *\deprecated This access mechanism is provided for backwards compatibility;
+ * prefer vpx_codec_vp9_cx().
+ */
 extern vpx_codec_iface_t vpx_codec_vp9_cx_algo;
+
+/*!\brief The interface to the VP9 encoder.
+ */
 extern vpx_codec_iface_t *vpx_codec_vp9_cx(void);
 /*!@} - end algorithm interface member group*/
 
@@ -149,6 +166,7 @@ enum vp8e_enc_control_id {
    *
    * \note Valid range for VP8: -16..16
    * \note Valid range for VP9: -9..9
+   * \note A negative value (-n) is treated as its absolute value (n) in VP9.
    *
    * Supported in codecs: VP8, VP9
    */
@@ -285,7 +303,7 @@ enum vp8e_enc_control_id {
    * the feature is off, i.e., no golden frame boost in CBR mode and
    * average bitrate target is used.
    *
-   * For example, to allow 100% more bits, i.e, 2X, in a golden frame
+   * For example, to allow 100% more bits, i.e., 2X, in a golden frame
    * than average frame, set this to 100.
    *
    * Supported in codecs: VP9
@@ -493,25 +511,13 @@ enum vp8e_enc_control_id {
    */
   VP9E_SET_COLOR_SPACE,
 
-  /*!\brief Codec control function to set temporal layering mode.
-   * \note Valid ranges: 0..3, default is "0"
-   * (VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING).
-   *                     0 = VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING
-   *                     1 = VP9E_TEMPORAL_LAYERING_MODE_BYPASS
-   *                     2 = VP9E_TEMPORAL_LAYERING_MODE_0101
-   *                     3 = VP9E_TEMPORAL_LAYERING_MODE_0212
-   *
-   * Supported in codecs: VP9
-   */
-  VP9E_SET_TEMPORAL_LAYERING_MODE,
-
   /*!\brief Codec control function to set minimum interval between GF/ARF frames
    *
    * By default the value is set as 4.
    *
    * Supported in codecs: VP9
    */
-  VP9E_SET_MIN_GF_INTERVAL,
+  VP9E_SET_MIN_GF_INTERVAL = 48,
 
   /*!\brief Codec control function to set minimum interval between GF/ARF frames
    *
@@ -593,7 +599,7 @@ enum vp8e_enc_control_id {
    * the feature is off, i.e., no golden frame boost in CBR mode and
    * average bitrate target is used.
    *
-   * For example, to allow 100% more bits, i.e, 2X, in a golden frame
+   * For example, to allow 100% more bits, i.e., 2X, in a golden frame
    * than average frame, set this to 100.
    *
    * Supported in codecs: VP8
@@ -684,6 +690,84 @@ enum vp8e_enc_control_id {
    * Supported in codecs: VP9
    */
   VP9E_SET_DELTA_Q_UV,
+
+  /*!\brief Codec control function to disable increase Q on overshoot in CBR.
+   *
+   * 0: On (default), 1: Disable.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR,
+
+  /*!\brief Codec control function to disable loopfilter.
+   *
+   * 0: Loopfilter on all frames, 1: Disable on non reference frames.
+   * 2: Disable on all frames.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_DISABLE_LOOPFILTER,
+
+  /*!\brief Codec control function to enable external rate control library.
+   *
+   * args[0]: path of the rate control library
+   *
+   * args[1]: private config of the rate control library
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_EXTERNAL_RATE_CONTROL,
+
+  /*!\brief Codec control to disable internal features in rate control.
+   *
+   * This will do 3 things, only for 1 pass:
+   *  - Turn off low motion computation
+   *  - Turn off gf update constraint on key frame frequency
+   *  - Turn off content mode for cyclic refresh
+   *
+   * With those, the rate control is expected to work exactly the same as the
+   * interface provided in ratectrl_rtc.cc/h
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_RTC_EXTERNAL_RATECTRL,
+
+  /*!\brief Codec control function to get loopfilter level in the encoder.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_GET_LOOPFILTER_LEVEL,
+
+  /*!\brief Codec control to get last quantizers for all spatial layers.
+   *
+   * Return value uses an array of internal quantizers scale defined by the
+   * codec, for all spatial layers.
+   * The size of the array passed in should be #VPX_SS_MAX_LAYERS.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_GET_LAST_QUANTIZER_SVC_LAYERS,
+
+  /*!\brief Codec control to disable internal features in rate control.
+   *
+   * This will turn off cyclic refresh for vp8.
+   *
+   * With this, the rate control is expected to work exactly the same as the
+   * interface provided in vp8_ratectrl_rtc.cc/h
+   *
+   * Supported in codecs: VP8
+   */
+  VP8E_SET_RTC_EXTERNAL_RATECTRL,
+
+  /*!\brief Codec control to set quantizer for the next frame.
+   *
+   * This will turn off cyclic refresh. Only applicable to 1-pass without
+   * spatial layers.
+   *
+   * Supported in codecs: VP9
+   *
+   */
+  VP9E_SET_QUANTIZER_ONE_PASS,
 };
 
 /*!\brief vpx 1-D scaling mode
@@ -739,8 +823,8 @@ typedef struct vpx_roi_map {
   unsigned int rows; /**< Number of rows. */
   unsigned int cols; /**< Number of columns. */
   /*! VP8 only uses the first 4 segments. VP9 uses 8 segments. */
-  int delta_q[8];  /**< Quantizer deltas. */
-  int delta_lf[8]; /**< Loop filter deltas. */
+  int delta_q[8];  /**< Quantizer deltas. Valid range: [-63, 63].*/
+  int delta_lf[8]; /**< Loop filter deltas. Valid range: [-63, 63].*/
   /*! skip and ref frame segment is only used in VP9. */
   int skip[8];      /**< Skip this block. */
   int ref_frame[8]; /**< Reference frame for this block. */
@@ -883,28 +967,12 @@ typedef struct vpx_svc_spatial_layer_sync {
  *
  */
 
-VPX_CTRL_USE_TYPE(VP8E_SET_FRAME_FLAGS, int)
-#define VPX_CTRL_VP8E_SET_FRAME_FLAGS
-VPX_CTRL_USE_TYPE(VP8E_SET_TEMPORAL_LAYER_ID, int)
-#define VPX_CTRL_VP8E_SET_TEMPORAL_LAYER_ID
 VPX_CTRL_USE_TYPE(VP8E_SET_ROI_MAP, vpx_roi_map_t *)
 #define VPX_CTRL_VP8E_SET_ROI_MAP
-VPX_CTRL_USE_TYPE(VP9E_SET_ROI_MAP, vpx_roi_map_t *)
-#define VPX_CTRL_VP9E_SET_ROI_MAP
 VPX_CTRL_USE_TYPE(VP8E_SET_ACTIVEMAP, vpx_active_map_t *)
 #define VPX_CTRL_VP8E_SET_ACTIVEMAP
 VPX_CTRL_USE_TYPE(VP8E_SET_SCALEMODE, vpx_scaling_mode_t *)
 #define VPX_CTRL_VP8E_SET_SCALEMODE
-
-VPX_CTRL_USE_TYPE(VP9E_SET_SVC, int)
-#define VPX_CTRL_VP9E_SET_SVC
-VPX_CTRL_USE_TYPE(VP9E_SET_SVC_PARAMETERS, void *)
-#define VPX_CTRL_VP9E_SET_SVC_PARAMETERS
-VPX_CTRL_USE_TYPE(VP9E_REGISTER_CX_CALLBACK, void *)
-#define VPX_CTRL_VP9E_REGISTER_CX_CALLBACK
-VPX_CTRL_USE_TYPE(VP9E_SET_SVC_LAYER_ID, vpx_svc_layer_id_t *)
-#define VPX_CTRL_VP9E_SET_SVC_LAYER_ID
-
 VPX_CTRL_USE_TYPE(VP8E_SET_CPUUSED, int)
 #define VPX_CTRL_VP8E_SET_CPUUSED
 VPX_CTRL_USE_TYPE(VP8E_SET_ENABLEAUTOALTREF, unsigned int)
@@ -917,7 +985,10 @@ VPX_CTRL_USE_TYPE(VP8E_SET_STATIC_THRESHOLD, unsigned int)
 #define VPX_CTRL_VP8E_SET_STATIC_THRESHOLD
 VPX_CTRL_USE_TYPE(VP8E_SET_TOKEN_PARTITIONS, int) /* vp8e_token_partitions */
 #define VPX_CTRL_VP8E_SET_TOKEN_PARTITIONS
-
+VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER, int *)
+#define VPX_CTRL_VP8E_GET_LAST_QUANTIZER
+VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER_64, int *)
+#define VPX_CTRL_VP8E_GET_LAST_QUANTIZER_64
 VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_MAXFRAMES, unsigned int)
 #define VPX_CTRL_VP8E_SET_ARNR_MAXFRAMES
 VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_STRENGTH, unsigned int)
@@ -928,111 +999,105 @@ VPX_CTRL_USE_TYPE(VP8E_SET_TUNING, int) /* vp8e_tuning */
 #define VPX_CTRL_VP8E_SET_TUNING
 VPX_CTRL_USE_TYPE(VP8E_SET_CQ_LEVEL, unsigned int)
 #define VPX_CTRL_VP8E_SET_CQ_LEVEL
-
-VPX_CTRL_USE_TYPE(VP9E_SET_TILE_COLUMNS, int)
-#define VPX_CTRL_VP9E_SET_TILE_COLUMNS
-VPX_CTRL_USE_TYPE(VP9E_SET_TILE_ROWS, int)
-#define VPX_CTRL_VP9E_SET_TILE_ROWS
-
-VPX_CTRL_USE_TYPE(VP9E_SET_TPL, int)
-#define VPX_CTRL_VP9E_SET_TPL
-
-VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER, int *)
-#define VPX_CTRL_VP8E_GET_LAST_QUANTIZER
-VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER_64, int *)
-#define VPX_CTRL_VP8E_GET_LAST_QUANTIZER_64
-VPX_CTRL_USE_TYPE(VP9E_GET_SVC_LAYER_ID, vpx_svc_layer_id_t *)
-#define VPX_CTRL_VP9E_GET_SVC_LAYER_ID
-
 VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTRA_BITRATE_PCT, unsigned int)
 #define VPX_CTRL_VP8E_SET_MAX_INTRA_BITRATE_PCT
+VPX_CTRL_USE_TYPE(VP8E_SET_FRAME_FLAGS, int)
+#define VPX_CTRL_VP8E_SET_FRAME_FLAGS
 VPX_CTRL_USE_TYPE(VP9E_SET_MAX_INTER_BITRATE_PCT, unsigned int)
 #define VPX_CTRL_VP9E_SET_MAX_INTER_BITRATE_PCT
-
-VPX_CTRL_USE_TYPE(VP8E_SET_GF_CBR_BOOST_PCT, unsigned int)
-#define VPX_CTRL_VP8E_SET_GF_CBR_BOOST_PCT
-
-VPX_CTRL_USE_TYPE(VP8E_SET_SCREEN_CONTENT_MODE, unsigned int)
-#define VPX_CTRL_VP8E_SET_SCREEN_CONTENT_MODE
-
 VPX_CTRL_USE_TYPE(VP9E_SET_GF_CBR_BOOST_PCT, unsigned int)
 #define VPX_CTRL_VP9E_SET_GF_CBR_BOOST_PCT
-
+VPX_CTRL_USE_TYPE(VP8E_SET_TEMPORAL_LAYER_ID, int)
+#define VPX_CTRL_VP8E_SET_TEMPORAL_LAYER_ID
+VPX_CTRL_USE_TYPE(VP8E_SET_SCREEN_CONTENT_MODE, unsigned int)
+#define VPX_CTRL_VP8E_SET_SCREEN_CONTENT_MODE
 VPX_CTRL_USE_TYPE(VP9E_SET_LOSSLESS, unsigned int)
 #define VPX_CTRL_VP9E_SET_LOSSLESS
-
+VPX_CTRL_USE_TYPE(VP9E_SET_TILE_COLUMNS, int)
+#define VPX_CTRL_VP9E_SET_TILE_COLUMNS
+VPX_CTRL_USE_TYPE(VP9E_SET_TILE_ROWS, int)
+#define VPX_CTRL_VP9E_SET_TILE_ROWS
 VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PARALLEL_DECODING, unsigned int)
 #define VPX_CTRL_VP9E_SET_FRAME_PARALLEL_DECODING
-
 VPX_CTRL_USE_TYPE(VP9E_SET_AQ_MODE, unsigned int)
 #define VPX_CTRL_VP9E_SET_AQ_MODE
-
-VPX_CTRL_USE_TYPE(VP9E_SET_ALT_REF_AQ, int)
-#define VPX_CTRL_VP9E_SET_ALT_REF_AQ
-
 VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PERIODIC_BOOST, unsigned int)
 #define VPX_CTRL_VP9E_SET_FRAME_PERIODIC_BOOST
-
 VPX_CTRL_USE_TYPE(VP9E_SET_NOISE_SENSITIVITY, unsigned int)
 #define VPX_CTRL_VP9E_SET_NOISE_SENSITIVITY
-
+VPX_CTRL_USE_TYPE(VP9E_SET_SVC, int)
+#define VPX_CTRL_VP9E_SET_SVC
+VPX_CTRL_USE_TYPE(VP9E_SET_ROI_MAP, vpx_roi_map_t *)
+#define VPX_CTRL_VP9E_SET_ROI_MAP
+VPX_CTRL_USE_TYPE(VP9E_SET_SVC_PARAMETERS, void *)
+#define VPX_CTRL_VP9E_SET_SVC_PARAMETERS
+VPX_CTRL_USE_TYPE(VP9E_SET_SVC_LAYER_ID, vpx_svc_layer_id_t *)
+#define VPX_CTRL_VP9E_SET_SVC_LAYER_ID
 VPX_CTRL_USE_TYPE(VP9E_SET_TUNE_CONTENT, int) /* vp9e_tune_content */
 #define VPX_CTRL_VP9E_SET_TUNE_CONTENT
-
+VPX_CTRL_USE_TYPE(VP9E_GET_SVC_LAYER_ID, vpx_svc_layer_id_t *)
+#define VPX_CTRL_VP9E_GET_SVC_LAYER_ID
+VPX_CTRL_USE_TYPE(VP9E_REGISTER_CX_CALLBACK, void *)
+#define VPX_CTRL_VP9E_REGISTER_CX_CALLBACK
 VPX_CTRL_USE_TYPE(VP9E_SET_COLOR_SPACE, int)
 #define VPX_CTRL_VP9E_SET_COLOR_SPACE
-
 VPX_CTRL_USE_TYPE(VP9E_SET_MIN_GF_INTERVAL, unsigned int)
 #define VPX_CTRL_VP9E_SET_MIN_GF_INTERVAL
-
 VPX_CTRL_USE_TYPE(VP9E_SET_MAX_GF_INTERVAL, unsigned int)
 #define VPX_CTRL_VP9E_SET_MAX_GF_INTERVAL
-
 VPX_CTRL_USE_TYPE(VP9E_GET_ACTIVEMAP, vpx_active_map_t *)
 #define VPX_CTRL_VP9E_GET_ACTIVEMAP
-
 VPX_CTRL_USE_TYPE(VP9E_SET_COLOR_RANGE, int)
 #define VPX_CTRL_VP9E_SET_COLOR_RANGE
-
 VPX_CTRL_USE_TYPE(VP9E_SET_SVC_REF_FRAME_CONFIG, vpx_svc_ref_frame_config_t *)
 #define VPX_CTRL_VP9E_SET_SVC_REF_FRAME_CONFIG
-
 VPX_CTRL_USE_TYPE(VP9E_SET_RENDER_SIZE, int *)
 #define VPX_CTRL_VP9E_SET_RENDER_SIZE
-
 VPX_CTRL_USE_TYPE(VP9E_SET_TARGET_LEVEL, unsigned int)
 #define VPX_CTRL_VP9E_SET_TARGET_LEVEL
-
 VPX_CTRL_USE_TYPE(VP9E_SET_ROW_MT, unsigned int)
 #define VPX_CTRL_VP9E_SET_ROW_MT
-
 VPX_CTRL_USE_TYPE(VP9E_GET_LEVEL, int *)
 #define VPX_CTRL_VP9E_GET_LEVEL
-
+VPX_CTRL_USE_TYPE(VP9E_SET_ALT_REF_AQ, int)
+#define VPX_CTRL_VP9E_SET_ALT_REF_AQ
+VPX_CTRL_USE_TYPE(VP8E_SET_GF_CBR_BOOST_PCT, unsigned int)
+#define VPX_CTRL_VP8E_SET_GF_CBR_BOOST_PCT
 VPX_CTRL_USE_TYPE(VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, unsigned int)
 #define VPX_CTRL_VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST
-
 VPX_CTRL_USE_TYPE(VP9E_SET_SVC_INTER_LAYER_PRED, unsigned int)
 #define VPX_CTRL_VP9E_SET_SVC_INTER_LAYER_PRED
-
 VPX_CTRL_USE_TYPE(VP9E_SET_SVC_FRAME_DROP_LAYER, vpx_svc_frame_drop_t *)
 #define VPX_CTRL_VP9E_SET_SVC_FRAME_DROP_LAYER
-
 VPX_CTRL_USE_TYPE(VP9E_GET_SVC_REF_FRAME_CONFIG, vpx_svc_ref_frame_config_t *)
 #define VPX_CTRL_VP9E_GET_SVC_REF_FRAME_CONFIG
-
 VPX_CTRL_USE_TYPE(VP9E_SET_SVC_GF_TEMPORAL_REF, unsigned int)
 #define VPX_CTRL_VP9E_SET_SVC_GF_TEMPORAL_REF
-
 VPX_CTRL_USE_TYPE(VP9E_SET_SVC_SPATIAL_LAYER_SYNC,
                   vpx_svc_spatial_layer_sync_t *)
 #define VPX_CTRL_VP9E_SET_SVC_SPATIAL_LAYER_SYNC
-
+VPX_CTRL_USE_TYPE(VP9E_SET_TPL, int)
+#define VPX_CTRL_VP9E_SET_TPL
 VPX_CTRL_USE_TYPE(VP9E_SET_POSTENCODE_DROP, unsigned int)
 #define VPX_CTRL_VP9E_SET_POSTENCODE_DROP
-
 VPX_CTRL_USE_TYPE(VP9E_SET_DELTA_Q_UV, int)
 #define VPX_CTRL_VP9E_SET_DELTA_Q_UV
+VPX_CTRL_USE_TYPE(VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, int)
+#define VPX_CTRL_VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR
+VPX_CTRL_USE_TYPE(VP9E_SET_DISABLE_LOOPFILTER, int)
+#define VPX_CTRL_VP9E_SET_DISABLE_LOOPFILTER
+VPX_CTRL_USE_TYPE(VP9E_SET_EXTERNAL_RATE_CONTROL, vpx_rc_funcs_t *)
+#define VPX_CTRL_VP9E_SET_EXTERNAL_RATE_CONTROL
+VPX_CTRL_USE_TYPE(VP9E_SET_RTC_EXTERNAL_RATECTRL, int)
+#define VPX_CTRL_VP9E_SET_RTC_EXTERNAL_RATECTRL
+VPX_CTRL_USE_TYPE(VP9E_GET_LOOPFILTER_LEVEL, int *)
+#define VPX_CTRL_VP9E_GET_LOOPFILTER_LEVEL
+VPX_CTRL_USE_TYPE(VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, int *)
+#define VPX_CTRL_VP9E_GET_LAST_QUANTIZER_SVC_LAYERS
+VPX_CTRL_USE_TYPE(VP8E_SET_RTC_EXTERNAL_RATECTRL, int)
+#define VPX_CTRL_VP8E_SET_RTC_EXTERNAL_RATECTRL
+VPX_CTRL_USE_TYPE(VP9E_SET_QUANTIZER_ONE_PASS, int)
+#define VPX_CTRL_VP9E_SET_QUANTIZER_ONE_PASS
 
 /*!\endcond */
 /*! @} - end defgroup vp8_encoder */
diff --git a/vpx/vp8dx.h b/vpx/vp8dx.h
index af92f21ae3..8c13649f4a 100644
--- a/vpx/vp8dx.h
+++ b/vpx/vp8dx.h
@@ -32,7 +32,15 @@ extern "C" {
  * This interface provides the capability to decode VP8 streams.
  * @{
  */
+
+/*!\brief A single instance of the VP8 decoder.
+ *\deprecated This access mechanism is provided for backwards compatibility;
+ * prefer vpx_codec_vp8_dx().
+ */
 extern vpx_codec_iface_t vpx_codec_vp8_dx_algo;
+
+/*!\brief The interface to the VP8 decoder.
+ */
 extern vpx_codec_iface_t *vpx_codec_vp8_dx(void);
 /*!@} - end algorithm interface member group*/
 
@@ -41,7 +49,15 @@ extern vpx_codec_iface_t *vpx_codec_vp8_dx(void);
  * This interface provides the capability to decode VP9 streams.
  * @{
  */
+
+/*!\brief A single instance of the VP9 decoder.
+ *\deprecated This access mechanism is provided for backwards compatibility;
+ * prefer vpx_codec_vp9_dx().
+ */
 extern vpx_codec_iface_t vpx_codec_vp9_dx_algo;
+
+/*!\brief The interface to the VP9 decoder.
+ */
 extern vpx_codec_iface_t *vpx_codec_vp9_dx(void);
 /*!@} - end algorithm interface member group*/
 
@@ -177,28 +193,30 @@ VPX_CTRL_USE_TYPE(VP8D_GET_FRAME_CORRUPTED, int *)
 #define VPX_CTRL_VP8D_GET_FRAME_CORRUPTED
 VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_USED, int *)
 #define VPX_CTRL_VP8D_GET_LAST_REF_USED
-VPX_CTRL_USE_TYPE(VPXD_GET_LAST_QUANTIZER, int *)
-#define VPX_CTRL_VPXD_GET_LAST_QUANTIZER
 VPX_CTRL_USE_TYPE(VPXD_SET_DECRYPTOR, vpx_decrypt_init *)
 #define VPX_CTRL_VPXD_SET_DECRYPTOR
 VPX_CTRL_USE_TYPE(VP8D_SET_DECRYPTOR, vpx_decrypt_init *)
 #define VPX_CTRL_VP8D_SET_DECRYPTOR
+VPX_CTRL_USE_TYPE(VP9D_GET_FRAME_SIZE, int *)
+#define VPX_CTRL_VP9D_GET_FRAME_SIZE
 VPX_CTRL_USE_TYPE(VP9D_GET_DISPLAY_SIZE, int *)
 #define VPX_CTRL_VP9D_GET_DISPLAY_SIZE
 VPX_CTRL_USE_TYPE(VP9D_GET_BIT_DEPTH, unsigned int *)
 #define VPX_CTRL_VP9D_GET_BIT_DEPTH
-VPX_CTRL_USE_TYPE(VP9D_GET_FRAME_SIZE, int *)
-#define VPX_CTRL_VP9D_GET_FRAME_SIZE
+VPX_CTRL_USE_TYPE(VP9_SET_BYTE_ALIGNMENT, int)
+#define VPX_CTRL_VP9_SET_BYTE_ALIGNMENT
 VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int)
 #define VPX_CTRL_VP9_INVERT_TILE_DECODE_ORDER
-#define VPX_CTRL_VP9_DECODE_SVC_SPATIAL_LAYER
-VPX_CTRL_USE_TYPE(VP9_DECODE_SVC_SPATIAL_LAYER, int)
-#define VPX_CTRL_VP9_SET_SKIP_LOOP_FILTER
 VPX_CTRL_USE_TYPE(VP9_SET_SKIP_LOOP_FILTER, int)
-#define VPX_CTRL_VP9_DECODE_SET_ROW_MT
+#define VPX_CTRL_VP9_SET_SKIP_LOOP_FILTER
+VPX_CTRL_USE_TYPE(VP9_DECODE_SVC_SPATIAL_LAYER, int)
+#define VPX_CTRL_VP9_DECODE_SVC_SPATIAL_LAYER
+VPX_CTRL_USE_TYPE(VPXD_GET_LAST_QUANTIZER, int *)
+#define VPX_CTRL_VPXD_GET_LAST_QUANTIZER
 VPX_CTRL_USE_TYPE(VP9D_SET_ROW_MT, int)
-#define VPX_CTRL_VP9_SET_LOOP_FILTER_OPT
+#define VPX_CTRL_VP9_DECODE_SET_ROW_MT
 VPX_CTRL_USE_TYPE(VP9D_SET_LOOP_FILTER_OPT, int)
+#define VPX_CTRL_VP9_SET_LOOP_FILTER_OPT
 
 /*!\endcond */
 /*! @} - end defgroup vp8_decoder */
diff --git a/vpx/vpx_codec.h b/vpx/vpx_codec.h
index b0a931e019..0d61b07381 100644
--- a/vpx/vpx_codec.h
+++ b/vpx/vpx_codec.h
@@ -318,19 +318,21 @@ const char *vpx_codec_err_to_string(vpx_codec_err_t err);
  * \param[in]    ctx     Pointer to this instance's context.
  *
  */
-const char *vpx_codec_error(vpx_codec_ctx_t *ctx);
+const char *vpx_codec_error(const vpx_codec_ctx_t *ctx);
 
 /*!\brief Retrieve detailed error information for codec context
  *
  * Returns a human readable string providing detailed information about
- * the last error.
+ * the last error. The returned string is only valid until the next
+ * vpx_codec_* function call (except vpx_codec_error and
+ * vpx_codec_error_detail) on the codec context.
  *
  * \param[in]    ctx     Pointer to this instance's context.
  *
  * \retval NULL
  *     No detailed information is available.
  */
-const char *vpx_codec_error_detail(vpx_codec_ctx_t *ctx);
+const char *vpx_codec_error_detail(const vpx_codec_ctx_t *ctx);
 
 /* REQUIRED FUNCTIONS
  *
@@ -345,9 +347,11 @@ const char *vpx_codec_error_detail(vpx_codec_ctx_t *ctx);
  * \param[in] ctx   Pointer to this instance's context
  *
  * \retval #VPX_CODEC_OK
- *     The codec algorithm initialized.
- * \retval #VPX_CODEC_MEM_ERROR
- *     Memory allocation failed.
+ *     The codec instance has been destroyed.
+ * \retval #VPX_CODEC_INVALID_PARAM
+ *     ctx is a null pointer.
+ * \retval #VPX_CODEC_ERROR
+ *     Codec context not initialized.
  */
 vpx_codec_err_t vpx_codec_destroy(vpx_codec_ctx_t *ctx);
 
diff --git a/vpx/vpx_codec.mk b/vpx/vpx_codec.mk
index 4ed77ad6d9..25c815ef51 100644
--- a/vpx/vpx_codec.mk
+++ b/vpx/vpx_codec.mk
@@ -24,18 +24,24 @@ API_DOC_SRCS-$(CONFIG_VP8_DECODER) += vp8dx.h
 API_DOC_SRCS-yes += vpx_codec.h
 API_DOC_SRCS-yes += vpx_decoder.h
 API_DOC_SRCS-yes += vpx_encoder.h
+API_DOC_SRCS-yes += vpx_ext_ratectrl.h
 API_DOC_SRCS-yes += vpx_frame_buffer.h
 API_DOC_SRCS-yes += vpx_image.h
+API_DOC_SRCS-yes += vpx_tpl.h
 
 API_SRCS-yes += src/vpx_decoder.c
 API_SRCS-yes += vpx_decoder.h
 API_SRCS-yes += src/vpx_encoder.c
 API_SRCS-yes += vpx_encoder.h
 API_SRCS-yes += internal/vpx_codec_internal.h
+API_SRCS-yes += internal/vpx_ratectrl_rtc.h
 API_SRCS-yes += src/vpx_codec.c
 API_SRCS-yes += src/vpx_image.c
+API_SRCS-yes += src/vpx_tpl.c
 API_SRCS-yes += vpx_codec.h
 API_SRCS-yes += vpx_codec.mk
 API_SRCS-yes += vpx_frame_buffer.h
 API_SRCS-yes += vpx_image.h
 API_SRCS-yes += vpx_integer.h
+API_SRCS-yes += vpx_ext_ratectrl.h
+API_SRCS-yes += vpx_tpl.h
diff --git a/vpx/vpx_decoder.h b/vpx/vpx_decoder.h
index 39e5f585f6..0e9611e31f 100644
--- a/vpx/vpx_decoder.h
+++ b/vpx/vpx_decoder.h
@@ -29,7 +29,7 @@
 extern "C" {
 #endif
 
-#include "./vpx_codec.h"
+#include "./vpx_codec.h"  // IWYU pragma: export
 #include "./vpx_frame_buffer.h"
 
 /*!\brief Current ABI version number
@@ -127,7 +127,7 @@ typedef struct vpx_codec_dec_cfg {
  * \param[in]    ver     ABI version number. Must be set to
  *                       VPX_DECODER_ABI_VERSION
  * \retval #VPX_CODEC_OK
- *     The decoder algorithm initialized.
+ *     The decoder algorithm has been initialized.
  * \retval #VPX_CODEC_MEM_ERROR
  *     Memory allocation failed.
  */
@@ -205,6 +205,8 @@ vpx_codec_err_t vpx_codec_get_stream_info(vpx_codec_ctx_t *ctx,
  *                         this frame.
  * \param[in] deadline     Soft deadline the decoder should attempt to meet,
  *                         in us. Set to zero for unlimited.
+ *                         NOTE: The deadline parameter is ignored. Always
+ *                         pass 0.
  *
  * \return Returns #VPX_CODEC_OK if the coded data was processed completely
  *         and future pictures can be decoded without error. Otherwise,
diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h
index c84d40f7f7..18e3862bd7 100644
--- a/vpx/vpx_encoder.h
+++ b/vpx/vpx_encoder.h
@@ -29,7 +29,9 @@
 extern "C" {
 #endif
 
-#include "./vpx_codec.h"
+#include "./vpx_codec.h"  // IWYU pragma: export
+#include "./vpx_ext_ratectrl.h"
+#include "./vpx_tpl.h"
 
 /*! Temporal Scalability: Maximum length of the sequence defining frame
  * layer membership
@@ -56,8 +58,9 @@ extern "C" {
  * types, removing or reassigning enums, adding/removing/rearranging
  * fields to structures
  */
-#define VPX_ENCODER_ABI_VERSION \
-  (14 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/
+#define VPX_ENCODER_ABI_VERSION                                \
+  (16 + VPX_CODEC_ABI_VERSION + VPX_EXT_RATECTRL_ABI_VERSION + \
+   VPX_TPL_ABI_VERSION) /**<\hideinitializer*/
 
 /*! \brief Encoder capabilities bitfield
  *
@@ -113,14 +116,14 @@ typedef int64_t vpx_codec_pts_t;
  * support frame types that are codec specific (MPEG-1 D-frames for example)
  */
 typedef uint32_t vpx_codec_frame_flags_t;
-#define VPX_FRAME_IS_KEY 0x1 /**< frame is the start of a GOP */
+#define VPX_FRAME_IS_KEY 0x1u /**< frame is the start of a GOP */
 /*!\brief frame can be dropped without affecting the stream (no future frame
  * depends on this one) */
-#define VPX_FRAME_IS_DROPPABLE 0x2
+#define VPX_FRAME_IS_DROPPABLE 0x2u
 /*!\brief frame should be decoded but will not be shown */
-#define VPX_FRAME_IS_INVISIBLE 0x4
+#define VPX_FRAME_IS_INVISIBLE 0x4u
 /*!\brief this is a fragment of the encoded frame */
-#define VPX_FRAME_IS_FRAGMENT 0x8
+#define VPX_FRAME_IS_FRAGMENT 0x8u
 
 /*!\brief Error Resilient flags
  *
@@ -130,12 +133,13 @@ typedef uint32_t vpx_codec_frame_flags_t;
  */
 typedef uint32_t vpx_codec_er_flags_t;
 /*!\brief Improve resiliency against losses of whole frames */
-#define VPX_ERROR_RESILIENT_DEFAULT 0x1
+#define VPX_ERROR_RESILIENT_DEFAULT 0x1u
 /*!\brief The frame partitions are independently decodable by the bool decoder,
  * meaning that partitions can be decoded even though earlier partitions have
  * been lost. Note that intra prediction is still done over the partition
- * boundary. */
-#define VPX_ERROR_RESILIENT_PARTITIONS 0x2
+ * boundary.
+ * \note This is only supported by VP8.*/
+#define VPX_ERROR_RESILIENT_PARTITIONS 0x2u
 
 /*!\brief Encoder output packet variants
  *
@@ -455,7 +459,7 @@ typedef struct vpx_codec_enc_cfg {
 
   /*!\brief Target data rate
    *
-   * Target bandwidth to use for this stream, in kilobits per second.
+   * Target bitrate to use for this stream, in kilobits per second.
    */
   unsigned int rc_target_bitrate;
 
@@ -496,7 +500,7 @@ typedef struct vpx_codec_enc_cfg {
    * undershoot level (current rate vs target) beyond which more aggressive
    * corrective measures are taken.
    *   *
-   * Valid values in the range VP8:0-1000 VP9: 0-100.
+   * Valid values in the range VP8:0-100 VP9: 0-100.
    */
   unsigned int rc_undershoot_pct;
 
@@ -511,7 +515,7 @@ typedef struct vpx_codec_enc_cfg {
    * overshoot level (current rate vs target) beyond which more aggressive
    * corrective measures are taken.
    *
-   * Valid values in the range VP8:0-1000 VP9: 0-100.
+   * Valid values in the range VP8:0-100 VP9: 0-100.
    */
   unsigned int rc_overshoot_pct;
 
@@ -691,6 +695,151 @@ typedef struct vpx_codec_enc_cfg {
    *
    */
   int temporal_layering_mode;
+
+  /*!\brief A flag indicating whether to use external rate control parameters.
+   * By default is 0. If set to 1, the following parameters will be used in the
+   * rate control system.
+   */
+  int use_vizier_rc_params;
+
+  /*!\brief Active worst quality factor.
+   *
+   * Rate control parameters, set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the pass in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t active_wq_factor;
+
+  /*!\brief Error per macroblock adjustment factor.
+   *
+   * Rate control parameters, set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the pass in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t err_per_mb_factor;
+
+  /*!\brief Second reference default decay limit.
+   *
+   * Rate control parameters, set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the pass in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t sr_default_decay_limit;
+
+  /*!\brief Second reference difference factor.
+   *
+   * Rate control parameters, set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the pass in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t sr_diff_factor;
+
+  /*!\brief Keyframe error per macroblock adjustment factor.
+   *
+   * Rate control parameters, set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the pass in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t kf_err_per_mb_factor;
+
+  /*!\brief Keyframe minimum boost adjustment factor.
+   *
+   * Rate control parameters, set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the pass in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t kf_frame_min_boost_factor;
+
+  /*!\brief Keyframe maximum boost adjustment factor, for the first keyframe
+   * in a chunk.
+   *
+   * Rate control parameters, set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the pass in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t kf_frame_max_boost_first_factor;
+
+  /*!\brief Keyframe maximum boost adjustment factor, for subsequent keyframes.
+   *
+   * Rate control parameters, set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the pass in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t kf_frame_max_boost_subs_factor;
+
+  /*!\brief Keyframe maximum total boost adjustment factor.
+   *
+   * Rate control parameters, set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the pass in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t kf_max_total_boost_factor;
+
+  /*!\brief Golden frame maximum total boost adjustment factor.
+   *
+   * Rate control parameters, set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the pass in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t gf_max_total_boost_factor;
+
+  /*!\brief Golden frame maximum boost adjustment factor.
+   *
+   * Rate control parameters, set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the pass in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t gf_frame_max_boost_factor;
+
+  /*!\brief Zero motion power factor.
+   *
+   * Rate control parameters, set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the pass in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t zm_factor;
+
+  /*!\brief Rate-distortion multiplier for inter frames.
+   * The multiplier is a crucial parameter in the calculation of rate distortion
+   * cost. It is often related to the qp (qindex) value.
+   * Rate control parameters, could be set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the pass in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t rd_mult_inter_qp_fac;
+
+  /*!\brief Rate-distortion multiplier for alt-ref frames.
+   * The multiplier is a crucial parameter in the calculation of rate distortion
+   * cost. It is often related to the qp (qindex) value.
+   * Rate control parameters, could be set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the pass in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t rd_mult_arf_qp_fac;
+
+  /*!\brief Rate-distortion multiplier for key frames.
+   * The multiplier is a crucial parameter in the calculation of rate distortion
+   * cost. It is often related to the qp (qindex) value.
+   * Rate control parameters, could be set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the pass in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t rd_mult_key_qp_fac;
 } vpx_codec_enc_cfg_t; /**< alias for struct vpx_codec_enc_cfg */
 
 /*!\brief  vp9 svc extra configure parameters
@@ -705,11 +854,12 @@ typedef struct vpx_svc_parameters {
   int scaling_factor_den[VPX_MAX_LAYERS]; /**< Scaling factor-denominator */
   int speed_per_layer[VPX_MAX_LAYERS];    /**< Speed setting for each sl */
   int temporal_layering_mode;             /**< Temporal layering mode */
+  int loopfilter_ctrl[VPX_MAX_LAYERS];    /**< Loopfilter ctrl for each sl */
 } vpx_svc_extra_cfg_t;
 
 /*!\brief Initialize an encoder instance
  *
- * Initializes a encoder context using the given interface. Applications
+ * Initializes an encoder context using the given interface. Applications
  * should call the vpx_codec_enc_init convenience macro instead of this
  * function directly, to ensure that the ABI version number parameter
  * is properly initialized.
@@ -718,6 +868,9 @@ typedef struct vpx_svc_parameters {
  * is not thread safe and should be guarded with a lock if being used
  * in a multithreaded context.
  *
+ * If vpx_codec_enc_init_ver() fails, it is not necessary to call
+ * vpx_codec_destroy() on the encoder context.
+ *
  * \param[in]    ctx     Pointer to this instance's context.
  * \param[in]    iface   Pointer to the algorithm interface to use.
  * \param[in]    cfg     Configuration to use, if known. May be NULL.
@@ -757,7 +910,7 @@ vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx,
  * \param[in]    ver     ABI version number. Must be set to
  *                       VPX_ENCODER_ABI_VERSION
  * \retval #VPX_CODEC_OK
- *     The decoder algorithm initialized.
+ *     The encoder algorithm has been initialized.
  * \retval #VPX_CODEC_MEM_ERROR
  *     Memory allocation failed.
  */
@@ -826,12 +979,18 @@ vpx_codec_err_t vpx_codec_enc_config_set(vpx_codec_ctx_t *ctx,
  */
 vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx);
 
+/*!\brief Encode Deadline
+ *
+ * This type indicates a deadline, in microseconds, to be passed to
+ * vpx_codec_encode().
+ */
+typedef unsigned long vpx_enc_deadline_t;
 /*!\brief deadline parameter analogous to VPx REALTIME mode. */
-#define VPX_DL_REALTIME (1)
+#define VPX_DL_REALTIME 1ul
 /*!\brief deadline parameter analogous to  VPx GOOD QUALITY mode. */
-#define VPX_DL_GOOD_QUALITY (1000000)
+#define VPX_DL_GOOD_QUALITY 1000000ul
 /*!\brief deadline parameter analogous to VPx BEST QUALITY mode. */
-#define VPX_DL_BEST_QUALITY (0)
+#define VPX_DL_BEST_QUALITY 0ul
 /*!\brief Encode a frame
  *
  * Encodes a video frame at the given "presentation time." The presentation
@@ -871,7 +1030,7 @@ vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx);
 vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img,
                                  vpx_codec_pts_t pts, unsigned long duration,
                                  vpx_enc_frame_flags_t flags,
-                                 unsigned long deadline);
+                                 vpx_enc_deadline_t deadline);
 
 /*!\brief Set compressed data output buffer
  *
diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h
new file mode 100644
index 0000000000..46d290dff4
--- /dev/null
+++ b/vpx/vpx_ext_ratectrl.h
@@ -0,0 +1,558 @@
+/*
+ *  Copyright (c) 2020 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_VPX_EXT_RATECTRL_H_
+#define VPX_VPX_VPX_EXT_RATECTRL_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "./vpx_integer.h"
+#include "./vpx_tpl.h"
+
+/*!\brief Current ABI version number
+ *
+ * \internal
+ * If this file is altered in any way that changes the ABI, this value
+ * must be bumped. Examples include, but are not limited to, changing
+ * types, removing or reassigning enums, adding/removing/rearranging
+ * fields to structures.
+ */
+#define VPX_EXT_RATECTRL_ABI_VERSION (7)
+
+/*!\brief The control type of the inference API.
+ * In VPX_RC_QP mode, the external rate control model determines the
+ * quantization parameter (QP) for each frame.
+ * In VPX_RC_GOP mode, the external rate control model determines the
+ * group of picture (GOP) of the video sequence.
+ * In VPX_RC_RDMULT mode, the external rate control model determines the
+ * rate-distortion multiplier (rdmult) for the current frame.
+ * In VPX_RC_GOP_QP mode, the external rate control model determines
+ * both the QP and the GOP.
+ * In VPX_RC_GOP_QP_RDMULT mode, the external rate control model determines
+ * the QP, GOP and the rdmult.
+ */
+typedef enum vpx_rc_type {
+  VPX_RC_QP = 1 << 0,
+  VPX_RC_GOP = 1 << 1,
+  VPX_RC_RDMULT = 1 << 2,
+  VPX_RC_GOP_QP = VPX_RC_QP | VPX_RC_GOP,
+  VPX_RC_GOP_QP_RDMULT = VPX_RC_QP | VPX_RC_GOP | VPX_RC_RDMULT
+} vpx_rc_type_t;
+
+/*!\brief The rate control mode for the external rate control model.
+ */
+typedef enum vpx_ext_rc_mode {
+  VPX_RC_QMODE = 0,
+  VPX_RC_VBR = 1,
+  VPX_RC_CQ = 2,
+} vpx_ext_rc_mode_t;
+
+/*!\brief Abstract rate control model handler
+ *
+ * The encoder will receive the model handler from create_model() defined in
+ * vpx_rc_funcs_t.
+ */
+typedef void *vpx_rc_model_t;
+
+/*!\brief A reserved value for the q index.
+ * If the external rate control model returns this value,
+ * the encoder will use the default q selected by libvpx's rate control
+ * system.
+ */
+#define VPX_DEFAULT_Q -1
+
+/*!\brief A reserved value for the rdmult.
+ * If the external rate control model returns this value,
+ * the encoder will use the default rdmult selected by libvpx's rate control
+ * system.
+ */
+#define VPX_DEFAULT_RDMULT -1
+
+/*!\brief Encode frame decision made by the external rate control model
+ *
+ * The encoder will receive the decision from the external rate control model
+ * through get_encodeframe_decision() defined in vpx_rc_funcs_t.
+ *
+ * If q_index = VPX_DEFAULT_Q, the encoder will use libvpx's default q.
+ *
+ * If max_frame_size = 0, the encoding ignores max frame size limit.
+ * If max_frame_size = -1, the encoding uses VP9's max frame size as the limit.
+ * If the encoded frame size is larger than max_frame_size, the frame is
+ * recoded to meet the size limit, following VP9's recoding principles.
+ */
+typedef struct vpx_rc_encodeframe_decision {
+  int q_index;        /**< Quantizer step index [0..255]*/
+  int max_frame_size; /**< Maximal frame size allowed to encode a frame*/
+} vpx_rc_encodeframe_decision_t;
+
+/*!\brief Information for the frame to be encoded.
+ *
+ * The encoder will send the information to external rate control model through
+ * get_encodeframe_decision() defined in vpx_rc_funcs_t.
+ *
+ */
+typedef struct vpx_rc_encodeframe_info {
+  /*!
+   * 0: Key frame
+   * 1: Inter frame
+   * 2: Alternate reference frame
+   * 3: Overlay frame
+   * 4: Golden frame
+   */
+  int frame_type;
+  int show_index;   /**< display index, starts from zero*/
+  int coding_index; /**< coding index, starts from zero*/
+  /*!
+   * index of the current frame in this group of picture, starts from zero.
+   */
+  int gop_index;
+  int ref_frame_coding_indexes[3]; /**< three reference frames' coding indices*/
+  /*!
+   * The validity of the three reference frames.
+   * 0: Invalid
+   * 1: Valid
+   */
+  int ref_frame_valid_list[3];
+  /*!
+   * The length of the current GOP.
+   */
+  int gop_size;
+  /*!
+   * Whether the current GOP uses an alt ref.
+   */
+  int use_alt_ref;
+} vpx_rc_encodeframe_info_t;
+
+/*!\brief Frame coding result
+ *
+ * The encoder will send the result to the external rate control model through
+ * update_encodeframe_result() defined in vpx_rc_funcs_t.
+ */
+typedef struct vpx_rc_encodeframe_result {
+  int64_t sse;         /**< sum of squared error of the reconstructed frame */
+  int64_t bit_count;   /**< number of bits spent on coding the frame*/
+  int64_t pixel_count; /**< number of pixels in YUV planes of the frame*/
+  int actual_encoding_qindex; /**< the actual qindex used to encode the frame*/
+} vpx_rc_encodeframe_result_t;
+
+/*!\brief Status returned by rate control callback functions.
+ */
+typedef enum vpx_rc_status {
+  VPX_RC_OK = 0,
+  VPX_RC_ERROR = 1,
+} vpx_rc_status_t;
+
+/*!\brief First pass frame stats
+ * This is a mirror of vp9's FIRSTPASS_STATS except that spatial_layer_id is
+ * omitted
+ */
+typedef struct vpx_rc_frame_stats {
+  /*!
+   * Frame number in display order, if stats are for a single frame.
+   * No real meaning for a collection of frames.
+   */
+  double frame;
+  /*!
+   * Weight assigned to this frame (or total weight for the collection of
+   * frames) currently based on intra factor and brightness factor. This is used
+   * to distribute bits between easier and harder frames.
+   */
+  double weight;
+  /*!
+   * Intra prediction error.
+   */
+  double intra_error;
+  /*!
+   * Best of intra pred error and inter pred error using last frame as ref.
+   */
+  double coded_error;
+  /*!
+   * Best of intra pred error and inter pred error using golden frame as ref.
+   */
+  double sr_coded_error;
+  /*!
+   * Estimate the noise energy of the current frame.
+   */
+  double frame_noise_energy;
+  /*!
+   * Percentage of blocks with inter pred error < intra pred error.
+   */
+  double pcnt_inter;
+  /*!
+   * Percentage of blocks using (inter prediction and) non-zero motion vectors.
+   */
+  double pcnt_motion;
+  /*!
+   * Percentage of blocks where golden frame was better than last or intra:
+   * inter pred error using golden frame < inter pred error using last frame and
+   * inter pred error using golden frame < intra pred error
+   */
+  double pcnt_second_ref;
+  /*!
+   * Percentage of blocks where intra and inter prediction errors were very
+   * close.
+   */
+  double pcnt_neutral;
+  /*!
+   * Percentage of blocks that have intra error < inter error and inter error <
+   * LOW_I_THRESH
+   * - bit_depth 8: LOW_I_THRESH = 24000
+   * - bit_depth 10: LOW_I_THRESH = 24000 << 4
+   * - bit_depth 12: LOW_I_THRESH = 24000 << 8
+   */
+  double pcnt_intra_low;
+  /*!
+   * Percentage of blocks that have intra error < inter error and intra error <
+   * LOW_I_THRESH but inter error >= LOW_I_THRESH LOW_I_THRESH
+   * - bit_depth 8: LOW_I_THRESH = 24000
+   * - bit_depth 10: LOW_I_THRESH = 24000 << 4
+   * - bit_depth 12: LOW_I_THRESH = 24000 << 8
+   */
+  double pcnt_intra_high;
+  /*!
+   * Percentage of blocks that have almost no intra error residual
+   * (i.e. are in effect completely flat and untextured in the intra
+   * domain). In natural videos this is uncommon, but it is much more
+   * common in animations, graphics and screen content, so may be used
+   * as a signal to detect these types of content.
+   */
+  double intra_skip_pct;
+  /*!
+   * Percentage of blocks that have intra error < SMOOTH_INTRA_THRESH
+   * - bit_depth 8:  SMOOTH_INTRA_THRESH = 4000
+   * - bit_depth 10: SMOOTH_INTRA_THRESH = 4000 << 4
+   * - bit_depth 12: SMOOTH_INTRA_THRESH = 4000 << 8
+   */
+  double intra_smooth_pct;
+  /*!
+   * Image mask rows top and bottom.
+   */
+  double inactive_zone_rows;
+  /*!
+   * Image mask columns at left and right edges.
+   */
+  double inactive_zone_cols;
+  /*!
+   * Mean of row motion vectors.
+   */
+  double MVr;
+  /*!
+   * Mean of absolute value of row motion vectors.
+   */
+  double mvr_abs;
+  /*!
+   * Mean of column motion vectors.
+   */
+  double MVc;
+  /*!
+   * Mean of absolute value of column motion vectors.
+   */
+  double mvc_abs;
+  /*!
+   * Variance of row motion vectors.
+   */
+  double MVrv;
+  /*!
+   * Variance of column motion vectors.
+   */
+  double MVcv;
+  /*!
+   * Value in range [-1,1] indicating fraction of row and column motion vectors
+   * that point inwards (negative MV value) or outwards (positive MV value).
+   * For example, value of 1 indicates, all row/column MVs are inwards.
+   */
+  double mv_in_out_count;
+  /*!
+   * Duration of the frame / collection of frames.
+   */
+  double duration;
+  /*!
+   * 1.0 if stats are for a single frame, or
+   * number of frames whose stats are accumulated.
+   */
+  double count;
+  /*!
+   * Number of new mv in a frame.
+   */
+  double new_mv_count;
+} vpx_rc_frame_stats_t;
+
+/*!\brief Collection of first pass frame stats
+ */
+typedef struct vpx_rc_firstpass_stats {
+  /*!
+   * Pointer to first pass frame stats.
+   * The pointed array of vpx_rc_frame_stats_t should have length equal to
+   * number of show frames in the video.
+   */
+  vpx_rc_frame_stats_t *frame_stats;
+  /*!
+   * Number of show frames in the video.
+   */
+  int num_frames;
+} vpx_rc_firstpass_stats_t;
+
+/*!\brief Encode config sent to external rate control model
+ */
+typedef struct vpx_rc_config {
+  int frame_width;      /**< frame width */
+  int frame_height;     /**< frame height */
+  int show_frame_count; /**< number of visible frames in the video */
+  int max_gf_interval;  /**< max GOP size in number of show frames */
+  int min_gf_interval;  /**< min GOP size in number of show frames */
+  /*!
+   * Target bitrate in kilobytes per second
+   */
+  int target_bitrate_kbps;
+  int frame_rate_num; /**< numerator of frame rate */
+  int frame_rate_den; /**< denominator of frame rate */
+  /*!
+   * The following fields are only for external rate control models that support
+   * different rate control modes.
+   */
+  vpx_ext_rc_mode_t rc_mode; /**< Q mode or VBR mode */
+  int overshoot_percent;     /**< for VBR mode only */
+  int undershoot_percent;    /**< for VBR mode only */
+} vpx_rc_config_t;
+
+/*!\brief Information passed to the external rate control model to
+ * help make GOP decisions.
+ */
+typedef struct vpx_rc_gop_info {
+  /*!
+   * Minimum allowed gf interval, fixed for the whole clip.
+   * Note that it will be modified to match vp9's level constraints
+   * in the encoder.
+   * The level constraint is defined in vp9_encoder.c:
+   * const Vp9LevelSpec vp9_level_defs[VP9_LEVELS].
+   */
+  int min_gf_interval;
+  /*!
+   * Maximum allowed gf interval, fixed for the whole clip.
+   */
+  int max_gf_interval;
+  /*!
+   * Minimum allowed gf interval for the current GOP, determined
+   * by the encoder.
+   */
+  int active_min_gf_interval;
+  /*!
+   * Maximum allowed gf interval for the current GOP, determined
+   * by the encoder.
+   */
+  int active_max_gf_interval;
+  /*!
+   * Whether to allow the use of alt ref, determined by the encoder.
+   * It is fixed for the entire encode.
+   * See function "is_altref_enabled" in vp9_encoder.h.
+   */
+  int allow_alt_ref;
+  /*!
+   * Is the current frame a key frame.
+   */
+  int is_key_frame;
+  /*!
+   * Does the previous gop use alt ref or not.
+   */
+  int last_gop_use_alt_ref;
+  /*!
+   * Current frame distance to the last keyframe, e.g., if Nth frame is a key,
+   * then the value of the N+1 th frame is 1.
+   */
+  int frames_since_key;
+  /*!
+   * Current frame distance to the next keyframe, e.g. if Nth frame is a key,
+   * then the value of frame N - 1 is 1.
+   */
+  int frames_to_key;
+  /*!
+   * Number of lookahead source frames.
+   */
+  int lag_in_frames;
+  /*!
+   * Display index (temporal stamp) of this frame in the whole clip,
+   * starts from zero.
+   */
+  int show_index;
+  /*!
+   * Coding index of this frame in the whole clip, starts from zero.
+   */
+  int coding_index;
+  /*!
+   * The index of the current gop, starts from zero, resets to zero
+   * when a keyframe is set.
+   */
+  int gop_global_index;
+} vpx_rc_gop_info_t;
+
+/*!\brief The decision made by the external rate control model to set the
+ * group of picture.
+ */
+typedef struct vpx_rc_gop_decision {
+  int gop_coding_frames; /**< The number of frames of this GOP */
+  int use_alt_ref;       /**< Whether to use alt ref for this GOP */
+} vpx_rc_gop_decision_t;
+
+/*!\brief Create an external rate control model callback prototype
+ *
+ * This callback is invoked by the encoder to create an external rate control
+ * model.
+ *
+ * \param[in]  priv                Callback's private data
+ * \param[in]  ratectrl_config     Pointer to vpx_rc_config_t
+ * \param[out] rate_ctrl_model_ptr Pointer to vpx_rc_model_t
+ */
+typedef vpx_rc_status_t (*vpx_rc_create_model_cb_fn_t)(
+    void *priv, const vpx_rc_config_t *ratectrl_config,
+    vpx_rc_model_t *rate_ctrl_model_ptr);
+
+/*!\brief Send first pass stats to the external rate control model callback
+ * prototype
+ *
+ * This callback is invoked by the encoder to send first pass stats to the
+ * external rate control model.
+ *
+ * \param[in]  rate_ctrl_model    rate control model
+ * \param[in]  first_pass_stats   first pass stats
+ */
+typedef vpx_rc_status_t (*vpx_rc_send_firstpass_stats_cb_fn_t)(
+    vpx_rc_model_t rate_ctrl_model,
+    const vpx_rc_firstpass_stats_t *first_pass_stats);
+
+/*!\brief Send TPL stats for the current GOP to the external rate control model
+ * callback prototype
+ *
+ * This callback is invoked by the encoder to send TPL stats for the GOP to the
+ * external rate control model.
+ *
+ * \param[in]  rate_ctrl_model  rate control model
+ * \param[in]  tpl_gop_stats    TPL stats for current GOP
+ */
+typedef vpx_rc_status_t (*vpx_rc_send_tpl_gop_stats_cb_fn_t)(
+    vpx_rc_model_t rate_ctrl_model, const VpxTplGopStats *tpl_gop_stats);
+
+/*!\brief Receive encode frame decision callback prototype
+ *
+ * This callback is invoked by the encoder to receive encode frame decision from
+ * the external rate control model.
+ *
+ * \param[in]  rate_ctrl_model    rate control model
+ * \param[in]  encode_frame_info  information of the coding frame
+ * \param[out] frame_decision     encode decision of the coding frame
+ */
+typedef vpx_rc_status_t (*vpx_rc_get_encodeframe_decision_cb_fn_t)(
+    vpx_rc_model_t rate_ctrl_model,
+    const vpx_rc_encodeframe_info_t *encode_frame_info,
+    vpx_rc_encodeframe_decision_t *frame_decision);
+
+/*!\brief Update encode frame result callback prototype
+ *
+ * This callback is invoked by the encoder to update encode frame result to the
+ * external rate control model.
+ *
+ * \param[in]  rate_ctrl_model     rate control model
+ * \param[out] encode_frame_result encode result of the coding frame
+ */
+typedef vpx_rc_status_t (*vpx_rc_update_encodeframe_result_cb_fn_t)(
+    vpx_rc_model_t rate_ctrl_model,
+    const vpx_rc_encodeframe_result_t *encode_frame_result);
+
+/*!\brief Get the GOP structure from the external rate control model.
+ *
+ * This callback is invoked by the encoder to get GOP decisions from
+ * the external rate control model.
+ *
+ * \param[in]  rate_ctrl_model  rate control model
+ * \param[in]  gop_info         information collected from the encoder
+ * \param[out] gop_decision     GOP decision from the model
+ */
+typedef vpx_rc_status_t (*vpx_rc_get_gop_decision_cb_fn_t)(
+    vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info,
+    vpx_rc_gop_decision_t *gop_decision);
+
+/*!\brief Get the frame rdmult from the external rate control model.
+ *
+ * This callback is invoked by the encoder to get rdmult from
+ * the external rate control model.
+ *
+ * \param[in]  rate_ctrl_model  rate control model
+ * \param[in]  frame_info       information collected from the encoder
+ * \param[out] rdmult           frame rate-distortion multiplier from the model
+ */
+typedef vpx_rc_status_t (*vpx_rc_get_frame_rdmult_cb_fn_t)(
+    vpx_rc_model_t rate_ctrl_model, const vpx_rc_encodeframe_info_t *frame_info,
+    int *rdmult);
+
+/*!\brief Delete the external rate control model callback prototype
+ *
+ * This callback is invoked by the encoder to delete the external rate control
+ * model.
+ *
+ * \param[in]  rate_ctrl_model     rate control model
+ */
+typedef vpx_rc_status_t (*vpx_rc_delete_model_cb_fn_t)(
+    vpx_rc_model_t rate_ctrl_model);
+
+/*!\brief Callback function set for external rate control.
+ *
+ * The user can enable external rate control by registering
+ * a set of callback functions with the codec control flag
+ * VP9E_SET_EXTERNAL_RATE_CONTROL.
+ */
+typedef struct vpx_rc_funcs {
+  /*!
+   * The rate control type of this API.
+   */
+  vpx_rc_type_t rc_type;
+  /*!
+   * Create an external rate control model.
+   */
+  vpx_rc_create_model_cb_fn_t create_model;
+  /*!
+   * Send first pass stats to the external rate control model.
+   */
+  vpx_rc_send_firstpass_stats_cb_fn_t send_firstpass_stats;
+  /*!
+   * Send TPL stats for current GOP to the external rate control model.
+   */
+  vpx_rc_send_tpl_gop_stats_cb_fn_t send_tpl_gop_stats;
+  /*!
+   * Get encodeframe decision from the external rate control model.
+   */
+  vpx_rc_get_encodeframe_decision_cb_fn_t get_encodeframe_decision;
+  /*!
+   * Update encodeframe result to the external rate control model.
+   */
+  vpx_rc_update_encodeframe_result_cb_fn_t update_encodeframe_result;
+  /*!
+   * Get GOP decisions from the external rate control model.
+   */
+  vpx_rc_get_gop_decision_cb_fn_t get_gop_decision;
+  /*!
+   * Get rdmult for the frame from the external rate control model.
+   */
+  vpx_rc_get_frame_rdmult_cb_fn_t get_frame_rdmult;
+  /*!
+   * Delete the external rate control model.
+   */
+  vpx_rc_delete_model_cb_fn_t delete_model;
+  /*!
+   * Private data for the external rate control model.
+   */
+  void *priv;
+} vpx_rc_funcs_t;
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_VPX_VPX_EXT_RATECTRL_H_
diff --git a/vpx/vpx_image.h b/vpx/vpx_image.h
index bc23be50c5..1adc9b9d9e 100644
--- a/vpx/vpx_image.h
+++ b/vpx/vpx_image.h
@@ -171,7 +171,8 @@ vpx_image_t *vpx_img_wrap(vpx_image_t *img, vpx_img_fmt_t fmt, unsigned int d_w,
 /*!\brief Set the rectangle identifying the displayed portion of the image
  *
  * Updates the displayed rectangle (aka viewport) on the image surface to
- * match the specified coordinates and size.
+ * match the specified coordinates and size. Specifically, sets img->d_w,
+ * img->d_h, and elements of the img->planes[] array.
  *
  * \param[in]    img       Image descriptor
  * \param[in]    x         leftmost column
@@ -179,7 +180,7 @@ vpx_image_t *vpx_img_wrap(vpx_image_t *img, vpx_img_fmt_t fmt, unsigned int d_w,
  * \param[in]    w         width
  * \param[in]    h         height
  *
- * \return 0 if the requested rectangle is valid, nonzero otherwise.
+ * \return 0 if the requested rectangle is valid, nonzero (-1) otherwise.
  */
 int vpx_img_set_rect(vpx_image_t *img, unsigned int x, unsigned int y,
                      unsigned int w, unsigned int h);
diff --git a/vpx/vpx_integer.h b/vpx/vpx_integer.h
index 4129d156f8..34e3796411 100644
--- a/vpx/vpx_integer.h
+++ b/vpx/vpx_integer.h
@@ -12,7 +12,7 @@
 #define VPX_VPX_VPX_INTEGER_H_
 
 /* get ptrdiff_t, size_t, wchar_t, NULL */
-#include <stddef.h>
+#include <stddef.h>  // IWYU pragma: export
 
 #if defined(_MSC_VER)
 #define VPX_FORCE_INLINE __forceinline
@@ -34,7 +34,7 @@
 #endif
 #endif  // __cplusplus
 
-#include <inttypes.h>
-#include <stdint.h>
+#include <inttypes.h>  // IWYU pragma: export
+#include <stdint.h>    // IWYU pragma: export
 
 #endif  // VPX_VPX_VPX_INTEGER_H_
diff --git a/vpx/vpx_tpl.h b/vpx/vpx_tpl.h
new file mode 100644
index 0000000000..a250aada60
--- /dev/null
+++ b/vpx/vpx_tpl.h
@@ -0,0 +1,102 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*!\file
+ * \brief Describes the TPL stats descriptor and associated operations
+ *
+ */
+#ifndef VPX_VPX_VPX_TPL_H_
+#define VPX_VPX_VPX_TPL_H_
+
+#include <stdio.h>
+
+#include "./vpx_integer.h"
+#include "./vpx_codec.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\brief Current ABI version number
+ *
+ * \internal
+ * If this file is altered in any way that changes the ABI, this value
+ * must be bumped.  Examples include, but are not limited to, changing
+ * types, removing or reassigning enums, adding/removing/rearranging
+ * fields to structures
+ */
+#define VPX_TPL_ABI_VERSION (2) /**<\hideinitializer*/
+
+/*!\brief Temporal dependency model stats for each block before propagation */
+typedef struct VpxTplBlockStats {
+  int16_t row;         /**< Pixel row of the top left corner */
+  int16_t col;         /**< Pixel col of the top left corner */
+  int64_t intra_cost;  /**< Intra cost */
+  int64_t inter_cost;  /**< Inter cost */
+  int16_t mv_r;        /**< Motion vector row */
+  int16_t mv_c;        /**< Motion vector col */
+  int64_t recrf_rate;  /**< Rate from reconstructed ref frame */
+  int64_t recrf_dist;  /**< Distortion from reconstructed ref frame */
+  int ref_frame_index; /**< Ref frame index in the ref frame buffer */
+} VpxTplBlockStats;
+
+/*!\brief Temporal dependency model stats for each frame before propagation */
+typedef struct VpxTplFrameStats {
+  int frame_width;  /**< Frame width */
+  int frame_height; /**< Frame height */
+  int num_blocks;   /**< Number of blocks. Size of block_stats_list */
+  VpxTplBlockStats *block_stats_list; /**< List of tpl stats for each block */
+} VpxTplFrameStats;
+
+/*!\brief Temporal dependency model stats for each GOP before propagation */
+typedef struct VpxTplGopStats {
+  int size; /**< GOP size, also the size of frame_stats_list. */
+  VpxTplFrameStats *frame_stats_list; /**< List of tpl stats for each frame */
+} VpxTplGopStats;
+
+/*!\brief Write VpxTplGopStats to file
+ *
+ * Accepts an opened file handle and writes \p tpl_gop_stats.
+ *
+ * \param[in]    tpl_file       A FILE pointer that's already been opened.
+ * \param[in]    tpl_gop_stats  VpxTplGopStats that contains TPL stats for the
+ *                              whole GOP.
+ *
+ * \return VPX_CODEC_OK if TPL stats are successfully written.
+ */
+vpx_codec_err_t vpx_write_tpl_gop_stats(FILE *tpl_file,
+                                        const VpxTplGopStats *tpl_gop_stats);
+
+/*!\brief Read VpxTplGopStats from file
+ *
+ * Accepts an opened file handle and reads TPL stats and stores them into
+ * \p tpl_gop_stats. Allocates memory for TPL stats.
+ *
+ * \param[in]     tpl_file       A FILE pointer that's already been opened.
+ * \param[out]    tpl_gop_stats  VpxTplGopStats that contains TPL stats for the
+ *                               whole GOP.
+ *
+ * \return VPX_CODEC_OK if TPL stats are successfully read from file.
+ */
+vpx_codec_err_t vpx_read_tpl_gop_stats(FILE *tpl_file,
+                                       VpxTplGopStats *tpl_gop_stats);
+
+/*!\brief Free the memory allocated for VpxTplGopStats
+ *
+ * \param[in]    tpl_gop_stats  VpxTplGopStats that contains TPL stats for the
+ *                              whole GOP.
+ */
+void vpx_free_tpl_gop_stats(VpxTplGopStats *tpl_gop_stats);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_VPX_VPX_TPL_H_
diff --git a/vpx_dsp/arm/avg_neon.c b/vpx_dsp/arm/avg_neon.c
index fa7dd09600..1b17a326b4 100644
--- a/vpx_dsp/arm/avg_neon.c
+++ b/vpx_dsp/arm/avg_neon.c
@@ -22,15 +22,13 @@
 uint32_t vpx_avg_4x4_neon(const uint8_t *a, int a_stride) {
   const uint8x16_t b = load_unaligned_u8q(a, a_stride);
   const uint16x8_t c = vaddl_u8(vget_low_u8(b), vget_high_u8(b));
-  const uint32x2_t d = horizontal_add_uint16x8(c);
-  return vget_lane_u32(vrshr_n_u32(d, 4), 0);
+  return (horizontal_add_uint16x8(c) + (1 << 3)) >> 4;
 }
 
 uint32_t vpx_avg_8x8_neon(const uint8_t *a, int a_stride) {
   int i;
   uint8x8_t b, c;
   uint16x8_t sum;
-  uint32x2_t d;
   b = vld1_u8(a);
   a += a_stride;
   c = vld1_u8(a);
@@ -43,104 +41,98 @@ uint32_t vpx_avg_8x8_neon(const uint8_t *a, int a_stride) {
     sum = vaddw_u8(sum, d);
   }
 
-  d = horizontal_add_uint16x8(sum);
-
-  return vget_lane_u32(vrshr_n_u32(d, 6), 0);
+  return (horizontal_add_uint16x8(sum) + (1 << 5)) >> 6;
 }
 
 // coeff: 16 bits, dynamic range [-32640, 32640].
 // length: value range {16, 64, 256, 1024}.
+// satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
 int vpx_satd_neon(const tran_low_t *coeff, int length) {
-  const int16x4_t zero = vdup_n_s16(0);
-  int32x4_t accum = vdupq_n_s32(0);
+  int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
 
   do {
-    const int16x8_t src0 = load_tran_low_to_s16q(coeff);
-    const int16x8_t src8 = load_tran_low_to_s16q(coeff + 8);
-    accum = vabal_s16(accum, vget_low_s16(src0), zero);
-    accum = vabal_s16(accum, vget_high_s16(src0), zero);
-    accum = vabal_s16(accum, vget_low_s16(src8), zero);
-    accum = vabal_s16(accum, vget_high_s16(src8), zero);
+    int16x8_t abs0, abs1;
+    const int16x8_t s0 = load_tran_low_to_s16q(coeff);
+    const int16x8_t s1 = load_tran_low_to_s16q(coeff + 8);
+
+    abs0 = vabsq_s16(s0);
+    sum_s32[0] = vpadalq_s16(sum_s32[0], abs0);
+    abs1 = vabsq_s16(s1);
+    sum_s32[1] = vpadalq_s16(sum_s32[1], abs1);
+
     length -= 16;
     coeff += 16;
   } while (length != 0);
 
-  {
-    // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
-    const int64x2_t s0 = vpaddlq_s32(accum);  // cascading summation of 'accum'.
-    const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),
-                                  vreinterpret_s32_s64(vget_high_s64(s0)));
-    const int satd = vget_lane_s32(s1, 0);
-    return satd;
-  }
+  return horizontal_add_int32x4(vaddq_s32(sum_s32[0], sum_s32[1]));
 }
 
 void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
                           const int ref_stride, const int height) {
   int i;
-  uint16x8_t vec_sum_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_hi = vdupq_n_u16(0);
-  const int shift_factor = ((height >> 5) + 3) * -1;
-  const int16x8_t vec_shift = vdupq_n_s16(shift_factor);
-
-  for (i = 0; i < height; i += 8) {
-    const uint8x16_t vec_row1 = vld1q_u8(ref);
-    const uint8x16_t vec_row2 = vld1q_u8(ref + ref_stride);
-    const uint8x16_t vec_row3 = vld1q_u8(ref + ref_stride * 2);
-    const uint8x16_t vec_row4 = vld1q_u8(ref + ref_stride * 3);
-    const uint8x16_t vec_row5 = vld1q_u8(ref + ref_stride * 4);
-    const uint8x16_t vec_row6 = vld1q_u8(ref + ref_stride * 5);
-    const uint8x16_t vec_row7 = vld1q_u8(ref + ref_stride * 6);
-    const uint8x16_t vec_row8 = vld1q_u8(ref + ref_stride * 7);
+  uint8x16_t r0, r1, r2, r3;
+  uint16x8_t sum_lo[2], sum_hi[2];
+  uint16x8_t tmp_lo[2], tmp_hi[2];
+  int16x8_t avg_lo, avg_hi;
 
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row1));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row1));
+  const int norm_factor = (height >> 5) + 3;
+  const int16x8_t neg_norm_factor = vdupq_n_s16(-norm_factor);
 
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row2));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row2));
+  assert(height >= 4 && height % 4 == 0);
 
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row3));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row3));
+  r0 = vld1q_u8(ref + 0 * ref_stride);
+  r1 = vld1q_u8(ref + 1 * ref_stride);
+  r2 = vld1q_u8(ref + 2 * ref_stride);
+  r3 = vld1q_u8(ref + 3 * ref_stride);
 
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row4));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row4));
+  sum_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1));
+  sum_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1));
+  sum_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3));
+  sum_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3));
 
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row5));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row5));
+  ref += 4 * ref_stride;
 
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row6));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row6));
+  for (i = 4; i < height; i += 4) {
+    r0 = vld1q_u8(ref + 0 * ref_stride);
+    r1 = vld1q_u8(ref + 1 * ref_stride);
+    r2 = vld1q_u8(ref + 2 * ref_stride);
+    r3 = vld1q_u8(ref + 3 * ref_stride);
 
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row7));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row7));
+    tmp_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1));
+    tmp_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1));
+    tmp_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3));
+    tmp_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3));
 
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row8));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row8));
+    sum_lo[0] = vaddq_u16(sum_lo[0], tmp_lo[0]);
+    sum_hi[0] = vaddq_u16(sum_hi[0], tmp_hi[0]);
+    sum_lo[1] = vaddq_u16(sum_lo[1], tmp_lo[1]);
+    sum_hi[1] = vaddq_u16(sum_hi[1], tmp_hi[1]);
 
-    ref += ref_stride * 8;
+    ref += 4 * ref_stride;
   }
 
-  vec_sum_lo = vshlq_u16(vec_sum_lo, vec_shift);
-  vec_sum_hi = vshlq_u16(vec_sum_hi, vec_shift);
+  sum_lo[0] = vaddq_u16(sum_lo[0], sum_lo[1]);
+  sum_hi[0] = vaddq_u16(sum_hi[0], sum_hi[1]);
+
+  avg_lo = vshlq_s16(vreinterpretq_s16_u16(sum_lo[0]), neg_norm_factor);
+  avg_hi = vshlq_s16(vreinterpretq_s16_u16(sum_hi[0]), neg_norm_factor);
 
-  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_lo));
-  hbuf += 8;
-  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi));
+  vst1q_s16(hbuf, avg_lo);
+  vst1q_s16(hbuf + 8, avg_hi);
 }
 
 int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) {
+  uint16x8_t sum;
   int i;
-  uint16x8_t vec_sum = vdupq_n_u16(0);
 
-  for (i = 0; i < width; i += 16) {
-    const uint8x16_t vec_row = vld1q_u8(ref);
-    vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row));
-    vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row));
-    ref += 16;
+  assert(width >= 16 && width % 16 == 0);
+
+  sum = vpaddlq_u8(vld1q_u8(ref));
+  for (i = 16; i < width; i += 16) {
+    sum = vpadalq_u8(sum, vld1q_u8(ref + i));
   }
 
-  return vget_lane_s16(vreinterpret_s16_u32(horizontal_add_uint16x8(vec_sum)),
-                       0);
+  return (int16_t)horizontal_add_uint16x8(sum);
 }
 
 // ref, src = [0, 510] - max diff = 16-bits
@@ -219,11 +211,16 @@ void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b,
   const uint8x16_t ab07_max = vmaxq_u8(ab0123_max, ab4567_max);
   const uint8x16_t ab07_min = vminq_u8(ab0123_min, ab4567_min);
 
-  // Split to D and start doing pairwise.
+#if VPX_ARCH_AARCH64
+  *min = *max = 0;  // Clear high bits
+  *((uint8_t *)max) = vmaxvq_u8(ab07_max);
+  *((uint8_t *)min) = vminvq_u8(ab07_min);
+#else
+  // Split into 64-bit vectors and execute pairwise min/max.
   uint8x8_t ab_max = vmax_u8(vget_high_u8(ab07_max), vget_low_u8(ab07_max));
   uint8x8_t ab_min = vmin_u8(vget_high_u8(ab07_min), vget_low_u8(ab07_min));
 
-  // Enough runs of vpmax/min propogate the max/min values to every position.
+  // Enough runs of vpmax/min propagate the max/min values to every position.
   ab_max = vpmax_u8(ab_max, ab_max);
   ab_min = vpmin_u8(ab_min, ab_min);
 
@@ -237,4 +234,5 @@ void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b,
   // Store directly to avoid costly neon->gpr transfer.
   vst1_lane_u8((uint8_t *)max, ab_max, 0);
   vst1_lane_u8((uint8_t *)min, ab_min, 0);
+#endif
 }
diff --git a/vpx_dsp/arm/fdct16x16_neon.c b/vpx_dsp/arm/fdct16x16_neon.c
index 6b2bebd097..fde71ff30d 100644
--- a/vpx_dsp/arm/fdct16x16_neon.c
+++ b/vpx_dsp/arm/fdct16x16_neon.c
@@ -15,6 +15,7 @@
 #include "vpx_dsp/txfm_common.h"
 #include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct16x16_neon.h"
 
 // Some builds of gcc 4.9.2 and .3 have trouble with some of the inline
 // functions.
@@ -27,205 +28,9 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
 
 #else
 
-static INLINE void load(const int16_t *a, int stride, int16x8_t *b /*[16]*/) {
-  b[0] = vld1q_s16(a);
-  a += stride;
-  b[1] = vld1q_s16(a);
-  a += stride;
-  b[2] = vld1q_s16(a);
-  a += stride;
-  b[3] = vld1q_s16(a);
-  a += stride;
-  b[4] = vld1q_s16(a);
-  a += stride;
-  b[5] = vld1q_s16(a);
-  a += stride;
-  b[6] = vld1q_s16(a);
-  a += stride;
-  b[7] = vld1q_s16(a);
-  a += stride;
-  b[8] = vld1q_s16(a);
-  a += stride;
-  b[9] = vld1q_s16(a);
-  a += stride;
-  b[10] = vld1q_s16(a);
-  a += stride;
-  b[11] = vld1q_s16(a);
-  a += stride;
-  b[12] = vld1q_s16(a);
-  a += stride;
-  b[13] = vld1q_s16(a);
-  a += stride;
-  b[14] = vld1q_s16(a);
-  a += stride;
-  b[15] = vld1q_s16(a);
-}
-
-// Store 8 16x8 values, assuming stride == 16.
-static INLINE void store(tran_low_t *a, const int16x8_t *b /*[8]*/) {
-  store_s16q_to_tran_low(a, b[0]);
-  a += 16;
-  store_s16q_to_tran_low(a, b[1]);
-  a += 16;
-  store_s16q_to_tran_low(a, b[2]);
-  a += 16;
-  store_s16q_to_tran_low(a, b[3]);
-  a += 16;
-  store_s16q_to_tran_low(a, b[4]);
-  a += 16;
-  store_s16q_to_tran_low(a, b[5]);
-  a += 16;
-  store_s16q_to_tran_low(a, b[6]);
-  a += 16;
-  store_s16q_to_tran_low(a, b[7]);
-}
-
-// Load step of each pass. Add and subtract clear across the input, requiring
-// all 16 values to be loaded. For the first pass it also multiplies by 4.
-
-// To maybe reduce register usage this could be combined with the load() step to
-// get the first 4 and last 4 values, cross those, then load the middle 8 values
-// and cross them.
-static INLINE void cross_input(const int16x8_t *a /*[16]*/,
-                               int16x8_t *b /*[16]*/, const int pass) {
-  if (pass == 0) {
-    b[0] = vshlq_n_s16(vaddq_s16(a[0], a[15]), 2);
-    b[1] = vshlq_n_s16(vaddq_s16(a[1], a[14]), 2);
-    b[2] = vshlq_n_s16(vaddq_s16(a[2], a[13]), 2);
-    b[3] = vshlq_n_s16(vaddq_s16(a[3], a[12]), 2);
-    b[4] = vshlq_n_s16(vaddq_s16(a[4], a[11]), 2);
-    b[5] = vshlq_n_s16(vaddq_s16(a[5], a[10]), 2);
-    b[6] = vshlq_n_s16(vaddq_s16(a[6], a[9]), 2);
-    b[7] = vshlq_n_s16(vaddq_s16(a[7], a[8]), 2);
-
-    b[8] = vshlq_n_s16(vsubq_s16(a[7], a[8]), 2);
-    b[9] = vshlq_n_s16(vsubq_s16(a[6], a[9]), 2);
-    b[10] = vshlq_n_s16(vsubq_s16(a[5], a[10]), 2);
-    b[11] = vshlq_n_s16(vsubq_s16(a[4], a[11]), 2);
-    b[12] = vshlq_n_s16(vsubq_s16(a[3], a[12]), 2);
-    b[13] = vshlq_n_s16(vsubq_s16(a[2], a[13]), 2);
-    b[14] = vshlq_n_s16(vsubq_s16(a[1], a[14]), 2);
-    b[15] = vshlq_n_s16(vsubq_s16(a[0], a[15]), 2);
-  } else {
-    b[0] = vaddq_s16(a[0], a[15]);
-    b[1] = vaddq_s16(a[1], a[14]);
-    b[2] = vaddq_s16(a[2], a[13]);
-    b[3] = vaddq_s16(a[3], a[12]);
-    b[4] = vaddq_s16(a[4], a[11]);
-    b[5] = vaddq_s16(a[5], a[10]);
-    b[6] = vaddq_s16(a[6], a[9]);
-    b[7] = vaddq_s16(a[7], a[8]);
-
-    b[8] = vsubq_s16(a[7], a[8]);
-    b[9] = vsubq_s16(a[6], a[9]);
-    b[10] = vsubq_s16(a[5], a[10]);
-    b[11] = vsubq_s16(a[4], a[11]);
-    b[12] = vsubq_s16(a[3], a[12]);
-    b[13] = vsubq_s16(a[2], a[13]);
-    b[14] = vsubq_s16(a[1], a[14]);
-    b[15] = vsubq_s16(a[0], a[15]);
-  }
-}
-
-// Quarter round at the beginning of the second pass. Can't use vrshr (rounding)
-// because this only adds 1, not 1 << 2.
-static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) {
-  const int16x8_t one = vdupq_n_s16(1);
-  a[0] = vshrq_n_s16(vaddq_s16(a[0], one), 2);
-  a[1] = vshrq_n_s16(vaddq_s16(a[1], one), 2);
-  a[2] = vshrq_n_s16(vaddq_s16(a[2], one), 2);
-  a[3] = vshrq_n_s16(vaddq_s16(a[3], one), 2);
-  a[4] = vshrq_n_s16(vaddq_s16(a[4], one), 2);
-  a[5] = vshrq_n_s16(vaddq_s16(a[5], one), 2);
-  a[6] = vshrq_n_s16(vaddq_s16(a[6], one), 2);
-  a[7] = vshrq_n_s16(vaddq_s16(a[7], one), 2);
-  a[8] = vshrq_n_s16(vaddq_s16(a[8], one), 2);
-  a[9] = vshrq_n_s16(vaddq_s16(a[9], one), 2);
-  a[10] = vshrq_n_s16(vaddq_s16(a[10], one), 2);
-  a[11] = vshrq_n_s16(vaddq_s16(a[11], one), 2);
-  a[12] = vshrq_n_s16(vaddq_s16(a[12], one), 2);
-  a[13] = vshrq_n_s16(vaddq_s16(a[13], one), 2);
-  a[14] = vshrq_n_s16(vaddq_s16(a[14], one), 2);
-  a[15] = vshrq_n_s16(vaddq_s16(a[15], one), 2);
-}
-
-// fdct_round_shift((a +/- b) * c)
-static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b,
-                                       const tran_high_t c, int16x8_t *add,
-                                       int16x8_t *sub) {
-  const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c);
-  const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c);
-  const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), c);
-  const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), c);
-  const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c);
-  const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c);
-  const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14);
-  const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14);
-  const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14);
-  const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14);
-  *add = vcombine_s16(rounded0, rounded1);
-  *sub = vcombine_s16(rounded2, rounded3);
-}
-
-// fdct_round_shift(a * c0 +/- b * c1)
-static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
-                                       const tran_coef_t c0,
-                                       const tran_coef_t c1, int16x8_t *add,
-                                       int16x8_t *sub) {
-  const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c0);
-  const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c0);
-  const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), c1);
-  const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), c1);
-  const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), c0);
-  const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), c0);
-  const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c1);
-  const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c1);
-  const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14);
-  const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14);
-  const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14);
-  const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14);
-  *add = vcombine_s16(rounded0, rounded1);
-  *sub = vcombine_s16(rounded2, rounded3);
-}
-
-// Transpose 8x8 to a new location. Don't use transpose_neon.h because those
-// are all in-place.
-static INLINE void transpose_8x8(const int16x8_t *a /*[8]*/,
-                                 int16x8_t *b /*[8]*/) {
-  // Swap 16 bit elements.
-  const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]);
-  const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]);
-  const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]);
-  const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]);
-
-  // Swap 32 bit elements.
-  const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
-                                   vreinterpretq_s32_s16(c1.val[0]));
-  const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
-                                   vreinterpretq_s32_s16(c1.val[1]));
-  const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]),
-                                   vreinterpretq_s32_s16(c3.val[0]));
-  const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]),
-                                   vreinterpretq_s32_s16(c3.val[1]));
-
-  // Swap 64 bit elements
-  const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]);
-  const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]);
-  const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]);
-  const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]);
-
-  b[0] = e0.val[0];
-  b[1] = e1.val[0];
-  b[2] = e2.val[0];
-  b[3] = e3.val[0];
-  b[4] = e0.val[1];
-  b[5] = e1.val[1];
-  b[6] = e2.val[1];
-  b[7] = e3.val[1];
-}
-
 // Main body of fdct16x16.
-static void dct_body(const int16x8_t *in /*[16]*/, int16x8_t *out /*[16]*/) {
+static void vpx_fdct8x16_body(const int16x8_t *in /*[16]*/,
+                              int16x8_t *out /*[16]*/) {
   int16x8_t s[8];
   int16x8_t x[4];
   int16x8_t step[8];
@@ -250,16 +55,17 @@ static void dct_body(const int16x8_t *in /*[16]*/, int16x8_t *out /*[16]*/) {
 
   // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64)
   // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64)
-  butterfly_one_coeff(x[0], x[1], cospi_16_64, &out[0], &out[8]);
-  // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64);
+  butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0],
+                                          &out[8]);
+  // out[4]  = fdct_round_shift(x3 * cospi_8_64  + x2 * cospi_24_64);
   // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64);
-  butterfly_two_coeff(x[3], x[2], cospi_24_64, cospi_8_64, &out[4], &out[12]);
+  butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[4], &out[12]);
 
   //  Stage 2
   // Re-using source s5/s6
   // s5 = fdct_round_shift((s6 - s5) * cospi_16_64)
   // s6 = fdct_round_shift((s6 + s5) * cospi_16_64)
-  butterfly_one_coeff(s[6], s[5], cospi_16_64, &s[6], &s[5]);
+  butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &s[6], &s[5]);
 
   //  Stage 3
   x[0] = vaddq_s16(s[4], s[5]);
@@ -268,12 +74,12 @@ static void dct_body(const int16x8_t *in /*[16]*/, int16x8_t *out /*[16]*/) {
   x[3] = vaddq_s16(s[7], s[6]);
 
   // Stage 4
-  // out[2] = fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
-  // out[14] = fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
-  butterfly_two_coeff(x[3], x[0], cospi_28_64, cospi_4_64, &out[2], &out[14]);
-  // out[6] = fdct_round_shift(x1 * cospi_12_64 + x2 *  cospi_20_64)
-  // out[10] = fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
-  butterfly_two_coeff(x[2], x[1], cospi_12_64, cospi_20_64, &out[10], &out[6]);
+  // out[2]  = fdct_round_shift(x3 * cospi_4_64  + x0 * cospi_28_64)
+  // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64)
+  butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[2], &out[14]);
+  // out[6]  = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64)
+  // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64)
+  butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[10], &out[6]);
 
   // step 2
   // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results"
@@ -285,8 +91,8 @@ static void dct_body(const int16x8_t *in /*[16]*/, int16x8_t *out /*[16]*/) {
   // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64)
   // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64)
   // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64)
-  butterfly_one_coeff(in[13], in[10], cospi_16_64, &s[5], &s[2]);
-  butterfly_one_coeff(in[12], in[11], cospi_16_64, &s[4], &s[3]);
+  butterfly_one_coeff_s16_fast(in[13], in[10], cospi_16_64, &s[5], &s[2]);
+  butterfly_one_coeff_s16_fast(in[12], in[11], cospi_16_64, &s[4], &s[3]);
 
   // step 3
   s[0] = vaddq_s16(in[8], s[3]);
@@ -299,13 +105,15 @@ static void dct_body(const int16x8_t *in /*[16]*/, int16x8_t *out /*[16]*/) {
   s[7] = vaddq_s16(in[15], s[4]);
 
   // step 4
-  // step2[1] = fdct_round_shift(step3[1] *-cospi_8_64 + step3[6] * cospi_24_64)
-  // step2[6] = fdct_round_shift(step3[1] * cospi_24_64 + step3[6] * cospi_8_64)
-  butterfly_two_coeff(s[6], s[1], cospi_24_64, cospi_8_64, &s[6], &s[1]);
+  // step2[6] = fdct_round_shift(step3[6] * cospi_8_64  + step3[1] *
+  // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1]
+  // * cospi_8_64)
+  butterfly_two_coeff(s[6], s[1], cospi_8_64, cospi_24_64, &s[6], &s[1]);
 
   // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64)
-  // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64)
-  butterfly_two_coeff(x[0], x[3], cospi_8_64, cospi_24_64, &s[2], &s[5]);
+  // step2[5] = fdct_round_shift(step3[2] * cospi_8_64  - step3[5] *
+  // cospi_24_64)
+  butterfly_two_coeff(x[0], x[3], cospi_24_64, cospi_8_64, &s[2], &s[5]);
 
   // step 5
   step[0] = vaddq_s16(s[0], s[1]);
@@ -318,22 +126,23 @@ static void dct_body(const int16x8_t *in /*[16]*/, int16x8_t *out /*[16]*/) {
   step[7] = vaddq_s16(s[7], s[6]);
 
   // step 6
-  // out[1] = fdct_round_shift(step1[0] * cospi_30_64 + step1[7] * cospi_2_64)
-  // out[9] = fdct_round_shift(step1[1] * cospi_14_64 + step1[6] * cospi_18_64)
-  // out[5] = fdct_round_shift(step1[2] * cospi_22_64 + step1[5] * cospi_10_64)
-  // out[13] = fdct_round_shift(step1[3] * cospi_6_64 + step1[4] * cospi_26_64)
-  // out[3] = fdct_round_shift(step1[3] * -cospi_26_64 + step1[4] * cospi_6_64)
-  // out[11] = fdct_round_shift(step1[2] * -cospi_10_64 + step1[5] *
-  // cospi_22_64)
-  // out[7] = fdct_round_shift(step1[1] * -cospi_18_64 + step1[6] * cospi_14_64)
-  // out[15] = fdct_round_shift(step1[0] * -cospi_2_64 + step1[7] * cospi_30_64)
-  butterfly_two_coeff(step[6], step[1], cospi_14_64, cospi_18_64, &out[9],
+  // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64)
+  // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64)
+  butterfly_two_coeff(step[6], step[1], cospi_18_64, cospi_14_64, &out[9],
                       &out[7]);
-  butterfly_two_coeff(step[7], step[0], cospi_30_64, cospi_2_64, &out[1],
+  // out[1]  = fdct_round_shift(step1[7] * cospi_2_64  + step1[0] * cospi_30_64)
+  // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64)
+  butterfly_two_coeff(step[7], step[0], cospi_2_64, cospi_30_64, &out[1],
                       &out[15]);
-  butterfly_two_coeff(step[4], step[3], cospi_6_64, cospi_26_64, &out[13],
+
+  // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64)
+  // out[3]  = fdct_round_shift(step1[4] * cospi_6_64  - step1[3] * cospi_26_64)
+  butterfly_two_coeff(step[4], step[3], cospi_26_64, cospi_6_64, &out[13],
                       &out[3]);
-  butterfly_two_coeff(step[5], step[2], cospi_22_64, cospi_10_64, &out[5],
+
+  // out[5]  = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64)
+  // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64)
+  butterfly_two_coeff(step[5], step[2], cospi_10_64, cospi_22_64, &out[5],
                       &out[11]);
 }
 
@@ -344,22 +153,23 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
   int16x8_t temp3[16];
 
   // Left half.
-  load(input, stride, temp0);
-  cross_input(temp0, temp1, 0);
-  dct_body(temp1, temp0);
+  load_cross(input, stride, temp0);
+  scale_input(temp0, temp1);
+  vpx_fdct8x16_body(temp1, temp0);
 
   // Right half.
-  load(input + 8, stride, temp1);
-  cross_input(temp1, temp2, 0);
-  dct_body(temp2, temp1);
+  load_cross(input + 8, stride, temp1);
+  scale_input(temp1, temp2);
+  vpx_fdct8x16_body(temp2, temp1);
 
   // Transpose top left and top right quarters into one contiguous location to
   // process to the top half.
-  transpose_8x8(&temp0[0], &temp2[0]);
-  transpose_8x8(&temp1[0], &temp2[8]);
+
+  transpose_s16_8x8q(&temp0[0], &temp2[0]);
+  transpose_s16_8x8q(&temp1[0], &temp2[8]);
   partial_round_shift(temp2);
-  cross_input(temp2, temp3, 1);
-  dct_body(temp3, temp2);
+  cross_input(temp2, temp3);
+  vpx_fdct8x16_body(temp3, temp2);
   transpose_s16_8x8(&temp2[0], &temp2[1], &temp2[2], &temp2[3], &temp2[4],
                     &temp2[5], &temp2[6], &temp2[7]);
   transpose_s16_8x8(&temp2[8], &temp2[9], &temp2[10], &temp2[11], &temp2[12],
@@ -370,12 +180,13 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
 
   // Transpose bottom left and bottom right quarters into one contiguous
   // location to process to the bottom half.
-  transpose_8x8(&temp0[8], &temp1[0]);
+  transpose_s16_8x8q(&temp0[8], &temp1[0]);
+
   transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12],
                     &temp1[13], &temp1[14], &temp1[15]);
   partial_round_shift(temp1);
-  cross_input(temp1, temp0, 1);
-  dct_body(temp0, temp1);
+  cross_input(temp1, temp0);
+  vpx_fdct8x16_body(temp0, temp1);
   transpose_s16_8x8(&temp1[0], &temp1[1], &temp1[2], &temp1[3], &temp1[4],
                     &temp1[5], &temp1[6], &temp1[7]);
   transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12],
@@ -383,5 +194,246 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
   store(output, temp1);
   store(output + 8, temp1 + 8);
 }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+// Main body of fdct8x16 column
+static void vpx_highbd_fdct8x16_body(int32x4_t *left /*[16]*/,
+                                     int32x4_t *right /* [16] */) {
+  int32x4_t sl[8];
+  int32x4_t sr[8];
+  int32x4_t xl[4];
+  int32x4_t xr[4];
+  int32x4_t inl[8];
+  int32x4_t inr[8];
+  int32x4_t stepl[8];
+  int32x4_t stepr[8];
+
+  // stage 1
+  // From fwd_txfm.c: Work on the first eight values; fdct8(input,
+  // even_results);"
+  sl[0] = vaddq_s32(left[0], left[7]);
+  sr[0] = vaddq_s32(right[0], right[7]);
+  sl[1] = vaddq_s32(left[1], left[6]);
+  sr[1] = vaddq_s32(right[1], right[6]);
+  sl[2] = vaddq_s32(left[2], left[5]);
+  sr[2] = vaddq_s32(right[2], right[5]);
+  sl[3] = vaddq_s32(left[3], left[4]);
+  sr[3] = vaddq_s32(right[3], right[4]);
+  sl[4] = vsubq_s32(left[3], left[4]);
+  sr[4] = vsubq_s32(right[3], right[4]);
+  sl[5] = vsubq_s32(left[2], left[5]);
+  sr[5] = vsubq_s32(right[2], right[5]);
+  sl[6] = vsubq_s32(left[1], left[6]);
+  sr[6] = vsubq_s32(right[1], right[6]);
+  sl[7] = vsubq_s32(left[0], left[7]);
+  sr[7] = vsubq_s32(right[0], right[7]);
+
+  // Copy values 8-15 as we're storing in-place
+  inl[0] = left[8];
+  inr[0] = right[8];
+  inl[1] = left[9];
+  inr[1] = right[9];
+  inl[2] = left[10];
+  inr[2] = right[10];
+  inl[3] = left[11];
+  inr[3] = right[11];
+  inl[4] = left[12];
+  inr[4] = right[12];
+  inl[5] = left[13];
+  inr[5] = right[13];
+  inl[6] = left[14];
+  inr[6] = right[14];
+  inl[7] = left[15];
+  inr[7] = right[15];
+
+  // fdct4(step, step);
+  xl[0] = vaddq_s32(sl[0], sl[3]);
+  xr[0] = vaddq_s32(sr[0], sr[3]);
+  xl[1] = vaddq_s32(sl[1], sl[2]);
+  xr[1] = vaddq_s32(sr[1], sr[2]);
+  xl[2] = vsubq_s32(sl[1], sl[2]);
+  xr[2] = vsubq_s32(sr[1], sr[2]);
+  xl[3] = vsubq_s32(sl[0], sl[3]);
+  xr[3] = vsubq_s32(sr[0], sr[3]);
+
+  // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64)
+  // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64)
+  butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64,
+                               &left[0], &right[0], &left[8], &right[8]);
+
+  // out[4]  = fdct_round_shift(x3 * cospi_8_64  + x2 * cospi_24_64);
+  // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64);
+  butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64,
+                                     cospi_24_64, &left[4], &right[4],
+                                     &left[12], &right[12]);
+
+  //  Stage 2
+  // Re-using source s5/s6
+  // s5 = fdct_round_shift((s6 - s5) * cospi_16_64)
+  // s6 = fdct_round_shift((s6 + s5) * cospi_16_64)
+  butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &sl[6],
+                               &sr[6], &sl[5], &sr[5]);
+
+  //  Stage 3
+  xl[0] = vaddq_s32(sl[4], sl[5]);
+  xr[0] = vaddq_s32(sr[4], sr[5]);
+  xl[1] = vsubq_s32(sl[4], sl[5]);
+  xr[1] = vsubq_s32(sr[4], sr[5]);
+  xl[2] = vsubq_s32(sl[7], sl[6]);
+  xr[2] = vsubq_s32(sr[7], sr[6]);
+  xl[3] = vaddq_s32(sl[7], sl[6]);
+  xr[3] = vaddq_s32(sr[7], sr[6]);
+
+  // Stage 4
+  // out[2]  = fdct_round_shift(x3 * cospi_4_64  + x0 * cospi_28_64)
+  // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64)
+  butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64,
+                                     cospi_28_64, &left[2], &right[2],
+                                     &left[14], &right[14]);
+  // out[6]  = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64)
+  // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64)
+  butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64,
+                                     cospi_12_64, &left[10], &right[10],
+                                     &left[6], &right[6]);
+
+  // step 2
+  // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results"
+  // That file distinguished between "in_high" and "step1" but the only
+  // difference is that "in_high" is the first 8 values and "step 1" is the
+  // second. Here, since they are all in one array, "step1" values are += 8.
+
+  // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64)
+  // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64)
+  // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64)
+  // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64)
+  butterfly_one_coeff_s32_fast(inl[5], inr[5], inl[2], inr[2], cospi_16_64,
+                               &sl[5], &sr[5], &sl[2], &sr[2]);
+  butterfly_one_coeff_s32_fast(inl[4], inr[4], inl[3], inr[3], cospi_16_64,
+                               &sl[4], &sr[4], &sl[3], &sr[3]);
+
+  // step 3
+  sl[0] = vaddq_s32(inl[0], sl[3]);
+  sr[0] = vaddq_s32(inr[0], sr[3]);
+  sl[1] = vaddq_s32(inl[1], sl[2]);
+  sr[1] = vaddq_s32(inr[1], sr[2]);
+  xl[0] = vsubq_s32(inl[1], sl[2]);
+  xr[0] = vsubq_s32(inr[1], sr[2]);
+  xl[1] = vsubq_s32(inl[0], sl[3]);
+  xr[1] = vsubq_s32(inr[0], sr[3]);
+  xl[2] = vsubq_s32(inl[7], sl[4]);
+  xr[2] = vsubq_s32(inr[7], sr[4]);
+  xl[3] = vsubq_s32(inl[6], sl[5]);
+  xr[3] = vsubq_s32(inr[6], sr[5]);
+  sl[6] = vaddq_s32(inl[6], sl[5]);
+  sr[6] = vaddq_s32(inr[6], sr[5]);
+  sl[7] = vaddq_s32(inl[7], sl[4]);
+  sr[7] = vaddq_s32(inr[7], sr[4]);
+
+  // step 4
+  // step2[6] = fdct_round_shift(step3[6] * cospi_8_64  + step3[1] *
+  // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1]
+  // * cospi_8_64)
+  butterfly_two_coeff_s32_s64_narrow(sl[6], sr[6], sl[1], sr[1], cospi_8_64,
+                                     cospi_24_64, &sl[6], &sr[6], &sl[1],
+                                     &sr[1]);
+  // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64)
+  // step2[5] = fdct_round_shift(step3[2] * cospi_8_64  - step3[5] *
+  // cospi_24_64)
+  butterfly_two_coeff_s32_s64_narrow(xl[0], xr[0], xl[3], xr[3], cospi_24_64,
+                                     cospi_8_64, &sl[2], &sr[2], &sl[5],
+                                     &sr[5]);
+
+  // step 5
+  stepl[0] = vaddq_s32(sl[0], sl[1]);
+  stepr[0] = vaddq_s32(sr[0], sr[1]);
+  stepl[1] = vsubq_s32(sl[0], sl[1]);
+  stepr[1] = vsubq_s32(sr[0], sr[1]);
+  stepl[2] = vaddq_s32(xl[1], sl[2]);
+  stepr[2] = vaddq_s32(xr[1], sr[2]);
+  stepl[3] = vsubq_s32(xl[1], sl[2]);
+  stepr[3] = vsubq_s32(xr[1], sr[2]);
+  stepl[4] = vsubq_s32(xl[2], sl[5]);
+  stepr[4] = vsubq_s32(xr[2], sr[5]);
+  stepl[5] = vaddq_s32(xl[2], sl[5]);
+  stepr[5] = vaddq_s32(xr[2], sr[5]);
+  stepl[6] = vsubq_s32(sl[7], sl[6]);
+  stepr[6] = vsubq_s32(sr[7], sr[6]);
+  stepl[7] = vaddq_s32(sl[7], sl[6]);
+  stepr[7] = vaddq_s32(sr[7], sr[6]);
+
+  // step 6
+  // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64)
+  // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64)
+  butterfly_two_coeff_s32_s64_narrow(stepl[6], stepr[6], stepl[1], stepr[1],
+                                     cospi_18_64, cospi_14_64, &left[9],
+                                     &right[9], &left[7], &right[7]);
+  // out[1]  = fdct_round_shift(step1[7] * cospi_2_64  + step1[0] * cospi_30_64)
+  // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64)
+  butterfly_two_coeff_s32_s64_narrow(stepl[7], stepr[7], stepl[0], stepr[0],
+                                     cospi_2_64, cospi_30_64, &left[1],
+                                     &right[1], &left[15], &right[15]);
+  // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64)
+  // out[3]  = fdct_round_shift(step1[4] * cospi_6_64  - step1[3] * cospi_26_64)
+  butterfly_two_coeff_s32_s64_narrow(stepl[4], stepr[4], stepl[3], stepr[3],
+                                     cospi_26_64, cospi_6_64, &left[13],
+                                     &right[13], &left[3], &right[3]);
+  // out[5]  = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64)
+  // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64)
+  butterfly_two_coeff_s32_s64_narrow(stepl[5], stepr[5], stepl[2], stepr[2],
+                                     cospi_10_64, cospi_22_64, &left[5],
+                                     &right[5], &left[11], &right[11]);
+}
+
+void vpx_highbd_fdct16x16_neon(const int16_t *input, tran_low_t *output,
+                               int stride) {
+  int16x8_t temp0[16];
+  int32x4_t left1[16], left2[16], left3[16], left4[16], right1[16], right2[16],
+      right3[16], right4[16];
+
+  // Left half.
+  load_cross(input, stride, temp0);
+  highbd_scale_input(temp0, left1, right1);
+  vpx_highbd_fdct8x16_body(left1, right1);
+
+  // right half.
+  load_cross(input + 8, stride, temp0);
+  highbd_scale_input(temp0, left2, right2);
+  vpx_highbd_fdct8x16_body(left2, right2);
+
+  // Transpose top left and top right quarters into one contiguous location to
+  // process to the top half.
+
+  transpose_s32_8x8_2(left1, right1, left3, right3);
+  transpose_s32_8x8_2(left2, right2, left3 + 8, right3 + 8);
+  transpose_s32_8x8_2(left1 + 8, right1 + 8, left4, right4);
+  transpose_s32_8x8_2(left2 + 8, right2 + 8, left4 + 8, right4 + 8);
+
+  highbd_partial_round_shift(left3, right3);
+  highbd_cross_input(left3, right3, left1, right1);
+  vpx_highbd_fdct8x16_body(left1, right1);
+
+  // Transpose bottom left and bottom right quarters into one contiguous
+  // location to process to the bottom half.
+
+  highbd_partial_round_shift(left4, right4);
+  highbd_cross_input(left4, right4, left2, right2);
+  vpx_highbd_fdct8x16_body(left2, right2);
+
+  transpose_s32_8x8_2(left1, right1, left3, right3);
+  transpose_s32_8x8_2(left2, right2, left3 + 8, right3 + 8);
+  transpose_s32_8x8_2(left1 + 8, right1 + 8, left4, right4);
+  transpose_s32_8x8_2(left2 + 8, right2 + 8, left4 + 8, right4 + 8);
+  store16_s32(output, left3);
+  output += 4;
+  store16_s32(output, right3);
+  output += 4;
+
+  store16_s32(output, left4);
+  output += 4;
+  store16_s32(output, right4);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 #endif  // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) &&
         // __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __GNUC_PATCHLEVEL__ < 4
diff --git a/vpx_dsp/arm/fdct16x16_neon.h b/vpx_dsp/arm/fdct16x16_neon.h
new file mode 100644
index 0000000000..cd58675ca4
--- /dev/null
+++ b/vpx_dsp/arm/fdct16x16_neon.h
@@ -0,0 +1,318 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_
+
+#include <arm_neon.h>
+
+#include "fdct_neon.h"
+
+static INLINE void load(const int16_t *a, int stride, int16x8_t *b /*[16]*/) {
+  b[0] = vld1q_s16(a);
+  a += stride;
+  b[1] = vld1q_s16(a);
+  a += stride;
+  b[2] = vld1q_s16(a);
+  a += stride;
+  b[3] = vld1q_s16(a);
+  a += stride;
+  b[4] = vld1q_s16(a);
+  a += stride;
+  b[5] = vld1q_s16(a);
+  a += stride;
+  b[6] = vld1q_s16(a);
+  a += stride;
+  b[7] = vld1q_s16(a);
+  a += stride;
+  b[8] = vld1q_s16(a);
+  a += stride;
+  b[9] = vld1q_s16(a);
+  a += stride;
+  b[10] = vld1q_s16(a);
+  a += stride;
+  b[11] = vld1q_s16(a);
+  a += stride;
+  b[12] = vld1q_s16(a);
+  a += stride;
+  b[13] = vld1q_s16(a);
+  a += stride;
+  b[14] = vld1q_s16(a);
+  a += stride;
+  b[15] = vld1q_s16(a);
+}
+
+// Store 8 16x8 values, assuming stride == 16.
+static INLINE void store(tran_low_t *a, const int16x8_t *b /*[8]*/) {
+  store_s16q_to_tran_low(a, b[0]);
+  a += 16;
+  store_s16q_to_tran_low(a, b[1]);
+  a += 16;
+  store_s16q_to_tran_low(a, b[2]);
+  a += 16;
+  store_s16q_to_tran_low(a, b[3]);
+  a += 16;
+  store_s16q_to_tran_low(a, b[4]);
+  a += 16;
+  store_s16q_to_tran_low(a, b[5]);
+  a += 16;
+  store_s16q_to_tran_low(a, b[6]);
+  a += 16;
+  store_s16q_to_tran_low(a, b[7]);
+}
+
+// Load step of each pass. Add and subtract clear across the input, requiring
+// all 16 values to be loaded. For the first pass it also multiplies by 4.
+
+// To maybe reduce register usage this could be combined with the load() step to
+// get the first 4 and last 4 values, cross those, then load the middle 8 values
+// and cross them.
+static INLINE void scale_input(const int16x8_t *a /*[16]*/,
+                               int16x8_t *b /*[16]*/) {
+  b[0] = vshlq_n_s16(a[0], 2);
+  b[1] = vshlq_n_s16(a[1], 2);
+  b[2] = vshlq_n_s16(a[2], 2);
+  b[3] = vshlq_n_s16(a[3], 2);
+  b[4] = vshlq_n_s16(a[4], 2);
+  b[5] = vshlq_n_s16(a[5], 2);
+  b[6] = vshlq_n_s16(a[6], 2);
+  b[7] = vshlq_n_s16(a[7], 2);
+
+  b[8] = vshlq_n_s16(a[8], 2);
+  b[9] = vshlq_n_s16(a[9], 2);
+  b[10] = vshlq_n_s16(a[10], 2);
+  b[11] = vshlq_n_s16(a[11], 2);
+  b[12] = vshlq_n_s16(a[12], 2);
+  b[13] = vshlq_n_s16(a[13], 2);
+  b[14] = vshlq_n_s16(a[14], 2);
+  b[15] = vshlq_n_s16(a[15], 2);
+}
+
+static INLINE void cross_input(const int16x8_t *a /*[16]*/,
+                               int16x8_t *b /*[16]*/) {
+  b[0] = vaddq_s16(a[0], a[15]);
+  b[1] = vaddq_s16(a[1], a[14]);
+  b[2] = vaddq_s16(a[2], a[13]);
+  b[3] = vaddq_s16(a[3], a[12]);
+  b[4] = vaddq_s16(a[4], a[11]);
+  b[5] = vaddq_s16(a[5], a[10]);
+  b[6] = vaddq_s16(a[6], a[9]);
+  b[7] = vaddq_s16(a[7], a[8]);
+
+  b[8] = vsubq_s16(a[7], a[8]);
+  b[9] = vsubq_s16(a[6], a[9]);
+  b[10] = vsubq_s16(a[5], a[10]);
+  b[11] = vsubq_s16(a[4], a[11]);
+  b[12] = vsubq_s16(a[3], a[12]);
+  b[13] = vsubq_s16(a[2], a[13]);
+  b[14] = vsubq_s16(a[1], a[14]);
+  b[15] = vsubq_s16(a[0], a[15]);
+}
+
+static INLINE void load_cross(const int16_t *a, int stride,
+                              int16x8_t *b /*[16]*/) {
+  b[0] = vaddq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 15 * stride));
+  b[1] = vaddq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 14 * stride));
+  b[2] = vaddq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 13 * stride));
+  b[3] = vaddq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 12 * stride));
+  b[4] = vaddq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 11 * stride));
+  b[5] = vaddq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 10 * stride));
+  b[6] = vaddq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 9 * stride));
+  b[7] = vaddq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 8 * stride));
+
+  b[8] = vsubq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 8 * stride));
+  b[9] = vsubq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 9 * stride));
+  b[10] = vsubq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 10 * stride));
+  b[11] = vsubq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 11 * stride));
+  b[12] = vsubq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 12 * stride));
+  b[13] = vsubq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 13 * stride));
+  b[14] = vsubq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 14 * stride));
+  b[15] = vsubq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 15 * stride));
+}
+
+// Quarter round at the beginning of the second pass. Can't use vrshr (rounding)
+// because this only adds 1, not 1 << 2.
+static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) {
+  const int16x8_t one = vdupq_n_s16(1);
+  a[0] = vshrq_n_s16(vaddq_s16(a[0], one), 2);
+  a[1] = vshrq_n_s16(vaddq_s16(a[1], one), 2);
+  a[2] = vshrq_n_s16(vaddq_s16(a[2], one), 2);
+  a[3] = vshrq_n_s16(vaddq_s16(a[3], one), 2);
+  a[4] = vshrq_n_s16(vaddq_s16(a[4], one), 2);
+  a[5] = vshrq_n_s16(vaddq_s16(a[5], one), 2);
+  a[6] = vshrq_n_s16(vaddq_s16(a[6], one), 2);
+  a[7] = vshrq_n_s16(vaddq_s16(a[7], one), 2);
+  a[8] = vshrq_n_s16(vaddq_s16(a[8], one), 2);
+  a[9] = vshrq_n_s16(vaddq_s16(a[9], one), 2);
+  a[10] = vshrq_n_s16(vaddq_s16(a[10], one), 2);
+  a[11] = vshrq_n_s16(vaddq_s16(a[11], one), 2);
+  a[12] = vshrq_n_s16(vaddq_s16(a[12], one), 2);
+  a[13] = vshrq_n_s16(vaddq_s16(a[13], one), 2);
+  a[14] = vshrq_n_s16(vaddq_s16(a[14], one), 2);
+  a[15] = vshrq_n_s16(vaddq_s16(a[15], one), 2);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+static INLINE void highbd_scale_input(const int16x8_t *a /*[16]*/,
+                                      int32x4_t *left /*[16]*/,
+                                      int32x4_t *right /* [16] */) {
+  left[0] = vshll_n_s16(vget_low_s16(a[0]), 2);
+  left[1] = vshll_n_s16(vget_low_s16(a[1]), 2);
+  left[2] = vshll_n_s16(vget_low_s16(a[2]), 2);
+  left[3] = vshll_n_s16(vget_low_s16(a[3]), 2);
+  left[4] = vshll_n_s16(vget_low_s16(a[4]), 2);
+  left[5] = vshll_n_s16(vget_low_s16(a[5]), 2);
+  left[6] = vshll_n_s16(vget_low_s16(a[6]), 2);
+  left[7] = vshll_n_s16(vget_low_s16(a[7]), 2);
+  left[8] = vshll_n_s16(vget_low_s16(a[8]), 2);
+  left[9] = vshll_n_s16(vget_low_s16(a[9]), 2);
+  left[10] = vshll_n_s16(vget_low_s16(a[10]), 2);
+  left[11] = vshll_n_s16(vget_low_s16(a[11]), 2);
+  left[12] = vshll_n_s16(vget_low_s16(a[12]), 2);
+  left[13] = vshll_n_s16(vget_low_s16(a[13]), 2);
+  left[14] = vshll_n_s16(vget_low_s16(a[14]), 2);
+  left[15] = vshll_n_s16(vget_low_s16(a[15]), 2);
+
+  right[0] = vshll_n_s16(vget_high_s16(a[0]), 2);
+  right[1] = vshll_n_s16(vget_high_s16(a[1]), 2);
+  right[2] = vshll_n_s16(vget_high_s16(a[2]), 2);
+  right[3] = vshll_n_s16(vget_high_s16(a[3]), 2);
+  right[4] = vshll_n_s16(vget_high_s16(a[4]), 2);
+  right[5] = vshll_n_s16(vget_high_s16(a[5]), 2);
+  right[6] = vshll_n_s16(vget_high_s16(a[6]), 2);
+  right[7] = vshll_n_s16(vget_high_s16(a[7]), 2);
+  right[8] = vshll_n_s16(vget_high_s16(a[8]), 2);
+  right[9] = vshll_n_s16(vget_high_s16(a[9]), 2);
+  right[10] = vshll_n_s16(vget_high_s16(a[10]), 2);
+  right[11] = vshll_n_s16(vget_high_s16(a[11]), 2);
+  right[12] = vshll_n_s16(vget_high_s16(a[12]), 2);
+  right[13] = vshll_n_s16(vget_high_s16(a[13]), 2);
+  right[14] = vshll_n_s16(vget_high_s16(a[14]), 2);
+  right[15] = vshll_n_s16(vget_high_s16(a[15]), 2);
+}
+
+static INLINE void highbd_cross_input(const int32x4_t *a_left /*[16]*/,
+                                      int32x4_t *a_right /*[16]*/,
+                                      int32x4_t *b_left /*[16]*/,
+                                      int32x4_t *b_right /*[16]*/) {
+  b_left[0] = vaddq_s32(a_left[0], a_left[15]);
+  b_left[1] = vaddq_s32(a_left[1], a_left[14]);
+  b_left[2] = vaddq_s32(a_left[2], a_left[13]);
+  b_left[3] = vaddq_s32(a_left[3], a_left[12]);
+  b_left[4] = vaddq_s32(a_left[4], a_left[11]);
+  b_left[5] = vaddq_s32(a_left[5], a_left[10]);
+  b_left[6] = vaddq_s32(a_left[6], a_left[9]);
+  b_left[7] = vaddq_s32(a_left[7], a_left[8]);
+
+  b_right[0] = vaddq_s32(a_right[0], a_right[15]);
+  b_right[1] = vaddq_s32(a_right[1], a_right[14]);
+  b_right[2] = vaddq_s32(a_right[2], a_right[13]);
+  b_right[3] = vaddq_s32(a_right[3], a_right[12]);
+  b_right[4] = vaddq_s32(a_right[4], a_right[11]);
+  b_right[5] = vaddq_s32(a_right[5], a_right[10]);
+  b_right[6] = vaddq_s32(a_right[6], a_right[9]);
+  b_right[7] = vaddq_s32(a_right[7], a_right[8]);
+
+  b_left[8] = vsubq_s32(a_left[7], a_left[8]);
+  b_left[9] = vsubq_s32(a_left[6], a_left[9]);
+  b_left[10] = vsubq_s32(a_left[5], a_left[10]);
+  b_left[11] = vsubq_s32(a_left[4], a_left[11]);
+  b_left[12] = vsubq_s32(a_left[3], a_left[12]);
+  b_left[13] = vsubq_s32(a_left[2], a_left[13]);
+  b_left[14] = vsubq_s32(a_left[1], a_left[14]);
+  b_left[15] = vsubq_s32(a_left[0], a_left[15]);
+
+  b_right[8] = vsubq_s32(a_right[7], a_right[8]);
+  b_right[9] = vsubq_s32(a_right[6], a_right[9]);
+  b_right[10] = vsubq_s32(a_right[5], a_right[10]);
+  b_right[11] = vsubq_s32(a_right[4], a_right[11]);
+  b_right[12] = vsubq_s32(a_right[3], a_right[12]);
+  b_right[13] = vsubq_s32(a_right[2], a_right[13]);
+  b_right[14] = vsubq_s32(a_right[1], a_right[14]);
+  b_right[15] = vsubq_s32(a_right[0], a_right[15]);
+}
+
+static INLINE void highbd_partial_round_shift(int32x4_t *left /*[16]*/,
+                                              int32x4_t *right /* [16] */) {
+  const int32x4_t one = vdupq_n_s32(1);
+  left[0] = vshrq_n_s32(vaddq_s32(left[0], one), 2);
+  left[1] = vshrq_n_s32(vaddq_s32(left[1], one), 2);
+  left[2] = vshrq_n_s32(vaddq_s32(left[2], one), 2);
+  left[3] = vshrq_n_s32(vaddq_s32(left[3], one), 2);
+  left[4] = vshrq_n_s32(vaddq_s32(left[4], one), 2);
+  left[5] = vshrq_n_s32(vaddq_s32(left[5], one), 2);
+  left[6] = vshrq_n_s32(vaddq_s32(left[6], one), 2);
+  left[7] = vshrq_n_s32(vaddq_s32(left[7], one), 2);
+  left[8] = vshrq_n_s32(vaddq_s32(left[8], one), 2);
+  left[9] = vshrq_n_s32(vaddq_s32(left[9], one), 2);
+  left[10] = vshrq_n_s32(vaddq_s32(left[10], one), 2);
+  left[11] = vshrq_n_s32(vaddq_s32(left[11], one), 2);
+  left[12] = vshrq_n_s32(vaddq_s32(left[12], one), 2);
+  left[13] = vshrq_n_s32(vaddq_s32(left[13], one), 2);
+  left[14] = vshrq_n_s32(vaddq_s32(left[14], one), 2);
+  left[15] = vshrq_n_s32(vaddq_s32(left[15], one), 2);
+
+  right[0] = vshrq_n_s32(vaddq_s32(right[0], one), 2);
+  right[1] = vshrq_n_s32(vaddq_s32(right[1], one), 2);
+  right[2] = vshrq_n_s32(vaddq_s32(right[2], one), 2);
+  right[3] = vshrq_n_s32(vaddq_s32(right[3], one), 2);
+  right[4] = vshrq_n_s32(vaddq_s32(right[4], one), 2);
+  right[5] = vshrq_n_s32(vaddq_s32(right[5], one), 2);
+  right[6] = vshrq_n_s32(vaddq_s32(right[6], one), 2);
+  right[7] = vshrq_n_s32(vaddq_s32(right[7], one), 2);
+  right[8] = vshrq_n_s32(vaddq_s32(right[8], one), 2);
+  right[9] = vshrq_n_s32(vaddq_s32(right[9], one), 2);
+  right[10] = vshrq_n_s32(vaddq_s32(right[10], one), 2);
+  right[11] = vshrq_n_s32(vaddq_s32(right[11], one), 2);
+  right[12] = vshrq_n_s32(vaddq_s32(right[12], one), 2);
+  right[13] = vshrq_n_s32(vaddq_s32(right[13], one), 2);
+  right[14] = vshrq_n_s32(vaddq_s32(right[14], one), 2);
+  right[15] = vshrq_n_s32(vaddq_s32(right[15], one), 2);
+}
+
+// Store 16 32x4 vectors, assuming stride == 16.
+static INLINE void store16_s32(tran_low_t *a, const int32x4_t *b /*[32]*/) {
+  vst1q_s32(a, b[0]);
+  a += 16;
+  vst1q_s32(a, b[1]);
+  a += 16;
+  vst1q_s32(a, b[2]);
+  a += 16;
+  vst1q_s32(a, b[3]);
+  a += 16;
+  vst1q_s32(a, b[4]);
+  a += 16;
+  vst1q_s32(a, b[5]);
+  a += 16;
+  vst1q_s32(a, b[6]);
+  a += 16;
+  vst1q_s32(a, b[7]);
+  a += 16;
+  vst1q_s32(a, b[8]);
+  a += 16;
+  vst1q_s32(a, b[9]);
+  a += 16;
+  vst1q_s32(a, b[10]);
+  a += 16;
+  vst1q_s32(a, b[11]);
+  a += 16;
+  vst1q_s32(a, b[12]);
+  a += 16;
+  vst1q_s32(a, b[13]);
+  a += 16;
+  vst1q_s32(a, b[14]);
+  a += 16;
+  vst1q_s32(a, b[15]);
+}
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#endif  // VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_
diff --git a/vpx_dsp/arm/fdct32x32_neon.c b/vpx_dsp/arm/fdct32x32_neon.c
index e9cd34904b..a91730ce8b 100644
--- a/vpx_dsp/arm/fdct32x32_neon.c
+++ b/vpx_dsp/arm/fdct32x32_neon.c
@@ -15,6 +15,8 @@
 #include "vpx_dsp/txfm_common.h"
 #include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
+#include "vpx_dsp/arm/fdct32x32_neon.h"
 
 // Most gcc 4.9 distributions outside of Android do not generate correct code
 // for this function.
@@ -32,1289 +34,6 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
 
 #else
 
-#define LOAD_INCREMENT(src, stride, dest, index) \
-  do {                                           \
-    dest[index] = vld1q_s16(src);                \
-    src += stride;                               \
-  } while (0)
-
-#define ADD_S16(src, index0, index1, dest, index3)      \
-  do {                                                  \
-    dest[index3] = vaddq_s16(src[index0], src[index1]); \
-  } while (0)
-
-#define ADD_SHIFT_S16(src, index0, index1)                             \
-  do {                                                                 \
-    src[index1] = vshlq_n_s16(vsubq_s16(src[index0], src[index1]), 2); \
-  } while (0)
-
-// Load, cross, and multiply by 4. Load the first 8 and last 8, then the
-// middle
-// 16. Doing sets of 16 at a time. Maybe sets of 8 would be better?
-static INLINE void load(const int16_t *a, int stride, int16x8_t *b) {
-  const int16_t *a_end = a + 24 * stride;
-  int16x8_t c[8];
-
-  LOAD_INCREMENT(a, stride, b, 0);
-  LOAD_INCREMENT(a, stride, b, 1);
-  LOAD_INCREMENT(a, stride, b, 2);
-  LOAD_INCREMENT(a, stride, b, 3);
-  LOAD_INCREMENT(a, stride, b, 4);
-  LOAD_INCREMENT(a, stride, b, 5);
-  LOAD_INCREMENT(a, stride, b, 6);
-  LOAD_INCREMENT(a, stride, b, 7);
-
-  LOAD_INCREMENT(a_end, stride, b, 24);
-  LOAD_INCREMENT(a_end, stride, b, 25);
-  LOAD_INCREMENT(a_end, stride, b, 26);
-  LOAD_INCREMENT(a_end, stride, b, 27);
-  LOAD_INCREMENT(a_end, stride, b, 28);
-  LOAD_INCREMENT(a_end, stride, b, 29);
-  LOAD_INCREMENT(a_end, stride, b, 30);
-  LOAD_INCREMENT(a_end, stride, b, 31);
-
-  ADD_S16(b, 0, 31, c, 0);
-  ADD_S16(b, 1, 30, c, 1);
-  ADD_S16(b, 2, 29, c, 2);
-  ADD_S16(b, 3, 28, c, 3);
-  ADD_S16(b, 4, 27, c, 4);
-  ADD_S16(b, 5, 26, c, 5);
-  ADD_S16(b, 6, 25, c, 6);
-  ADD_S16(b, 7, 24, c, 7);
-
-  ADD_SHIFT_S16(b, 7, 24);
-  ADD_SHIFT_S16(b, 6, 25);
-  ADD_SHIFT_S16(b, 5, 26);
-  ADD_SHIFT_S16(b, 4, 27);
-  ADD_SHIFT_S16(b, 3, 28);
-  ADD_SHIFT_S16(b, 2, 29);
-  ADD_SHIFT_S16(b, 1, 30);
-  ADD_SHIFT_S16(b, 0, 31);
-
-  b[0] = vshlq_n_s16(c[0], 2);
-  b[1] = vshlq_n_s16(c[1], 2);
-  b[2] = vshlq_n_s16(c[2], 2);
-  b[3] = vshlq_n_s16(c[3], 2);
-  b[4] = vshlq_n_s16(c[4], 2);
-  b[5] = vshlq_n_s16(c[5], 2);
-  b[6] = vshlq_n_s16(c[6], 2);
-  b[7] = vshlq_n_s16(c[7], 2);
-
-  LOAD_INCREMENT(a, stride, b, 8);
-  LOAD_INCREMENT(a, stride, b, 9);
-  LOAD_INCREMENT(a, stride, b, 10);
-  LOAD_INCREMENT(a, stride, b, 11);
-  LOAD_INCREMENT(a, stride, b, 12);
-  LOAD_INCREMENT(a, stride, b, 13);
-  LOAD_INCREMENT(a, stride, b, 14);
-  LOAD_INCREMENT(a, stride, b, 15);
-  LOAD_INCREMENT(a, stride, b, 16);
-  LOAD_INCREMENT(a, stride, b, 17);
-  LOAD_INCREMENT(a, stride, b, 18);
-  LOAD_INCREMENT(a, stride, b, 19);
-  LOAD_INCREMENT(a, stride, b, 20);
-  LOAD_INCREMENT(a, stride, b, 21);
-  LOAD_INCREMENT(a, stride, b, 22);
-  LOAD_INCREMENT(a, stride, b, 23);
-
-  ADD_S16(b, 8, 23, c, 0);
-  ADD_S16(b, 9, 22, c, 1);
-  ADD_S16(b, 10, 21, c, 2);
-  ADD_S16(b, 11, 20, c, 3);
-  ADD_S16(b, 12, 19, c, 4);
-  ADD_S16(b, 13, 18, c, 5);
-  ADD_S16(b, 14, 17, c, 6);
-  ADD_S16(b, 15, 16, c, 7);
-
-  ADD_SHIFT_S16(b, 15, 16);
-  ADD_SHIFT_S16(b, 14, 17);
-  ADD_SHIFT_S16(b, 13, 18);
-  ADD_SHIFT_S16(b, 12, 19);
-  ADD_SHIFT_S16(b, 11, 20);
-  ADD_SHIFT_S16(b, 10, 21);
-  ADD_SHIFT_S16(b, 9, 22);
-  ADD_SHIFT_S16(b, 8, 23);
-
-  b[8] = vshlq_n_s16(c[0], 2);
-  b[9] = vshlq_n_s16(c[1], 2);
-  b[10] = vshlq_n_s16(c[2], 2);
-  b[11] = vshlq_n_s16(c[3], 2);
-  b[12] = vshlq_n_s16(c[4], 2);
-  b[13] = vshlq_n_s16(c[5], 2);
-  b[14] = vshlq_n_s16(c[6], 2);
-  b[15] = vshlq_n_s16(c[7], 2);
-}
-
-#undef LOAD_INCREMENT
-#undef ADD_S16
-#undef ADD_SHIFT_S16
-
-#define STORE_S16(src, index, dest)           \
-  do {                                        \
-    store_s16q_to_tran_low(dest, src[index]); \
-    dest += 8;                                \
-  } while (0);
-
-// Store 32 16x8 values, assuming stride == 32.
-// Slight twist: store horizontally in blocks of 8.
-static INLINE void store(tran_low_t *a, const int16x8_t *b) {
-  STORE_S16(b, 0, a);
-  STORE_S16(b, 8, a);
-  STORE_S16(b, 16, a);
-  STORE_S16(b, 24, a);
-  STORE_S16(b, 1, a);
-  STORE_S16(b, 9, a);
-  STORE_S16(b, 17, a);
-  STORE_S16(b, 25, a);
-  STORE_S16(b, 2, a);
-  STORE_S16(b, 10, a);
-  STORE_S16(b, 18, a);
-  STORE_S16(b, 26, a);
-  STORE_S16(b, 3, a);
-  STORE_S16(b, 11, a);
-  STORE_S16(b, 19, a);
-  STORE_S16(b, 27, a);
-  STORE_S16(b, 4, a);
-  STORE_S16(b, 12, a);
-  STORE_S16(b, 20, a);
-  STORE_S16(b, 28, a);
-  STORE_S16(b, 5, a);
-  STORE_S16(b, 13, a);
-  STORE_S16(b, 21, a);
-  STORE_S16(b, 29, a);
-  STORE_S16(b, 6, a);
-  STORE_S16(b, 14, a);
-  STORE_S16(b, 22, a);
-  STORE_S16(b, 30, a);
-  STORE_S16(b, 7, a);
-  STORE_S16(b, 15, a);
-  STORE_S16(b, 23, a);
-  STORE_S16(b, 31, a);
-}
-
-#undef STORE_S16
-
-// fdct_round_shift((a +/- b) * c)
-static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b,
-                                       const tran_high_t constant,
-                                       int16x8_t *add, int16x8_t *sub) {
-  const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant);
-  const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant);
-  const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant);
-  const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant);
-  const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant);
-  const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant);
-  const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
-  const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
-  const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
-  const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
-  *add = vcombine_s16(rounded0, rounded1);
-  *sub = vcombine_s16(rounded2, rounded3);
-}
-
-// fdct_round_shift(a * c0 +/- b * c1)
-static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
-                                       const tran_coef_t constant0,
-                                       const tran_coef_t constant1,
-                                       int16x8_t *add, int16x8_t *sub) {
-  const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant0);
-  const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant0);
-  const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), constant1);
-  const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), constant1);
-  const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), constant0);
-  const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), constant0);
-  const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant1);
-  const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant1);
-  const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
-  const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
-  const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
-  const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
-  *add = vcombine_s16(rounded0, rounded1);
-  *sub = vcombine_s16(rounded2, rounded3);
-}
-
-// Add 2 if positive, 1 if negative, and shift by 2.
-// In practice, subtract the sign bit, then shift with rounding.
-static INLINE int16x8_t sub_round_shift(const int16x8_t a) {
-  const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
-  const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
-  const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
-  return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2);
-}
-
-static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) {
-  int16x8_t a[32];
-  int16x8_t b[32];
-
-  // Stage 1: Done as part of the load.
-
-  // Stage 2.
-  // Mini cross. X the first 16 values and the middle 8 of the second half.
-  a[0] = vaddq_s16(in[0], in[15]);
-  a[1] = vaddq_s16(in[1], in[14]);
-  a[2] = vaddq_s16(in[2], in[13]);
-  a[3] = vaddq_s16(in[3], in[12]);
-  a[4] = vaddq_s16(in[4], in[11]);
-  a[5] = vaddq_s16(in[5], in[10]);
-  a[6] = vaddq_s16(in[6], in[9]);
-  a[7] = vaddq_s16(in[7], in[8]);
-
-  a[8] = vsubq_s16(in[7], in[8]);
-  a[9] = vsubq_s16(in[6], in[9]);
-  a[10] = vsubq_s16(in[5], in[10]);
-  a[11] = vsubq_s16(in[4], in[11]);
-  a[12] = vsubq_s16(in[3], in[12]);
-  a[13] = vsubq_s16(in[2], in[13]);
-  a[14] = vsubq_s16(in[1], in[14]);
-  a[15] = vsubq_s16(in[0], in[15]);
-
-  a[16] = in[16];
-  a[17] = in[17];
-  a[18] = in[18];
-  a[19] = in[19];
-
-  butterfly_one_coeff(in[27], in[20], cospi_16_64, &a[27], &a[20]);
-  butterfly_one_coeff(in[26], in[21], cospi_16_64, &a[26], &a[21]);
-  butterfly_one_coeff(in[25], in[22], cospi_16_64, &a[25], &a[22]);
-  butterfly_one_coeff(in[24], in[23], cospi_16_64, &a[24], &a[23]);
-
-  a[28] = in[28];
-  a[29] = in[29];
-  a[30] = in[30];
-  a[31] = in[31];
-
-  // Stage 3.
-  b[0] = vaddq_s16(a[0], a[7]);
-  b[1] = vaddq_s16(a[1], a[6]);
-  b[2] = vaddq_s16(a[2], a[5]);
-  b[3] = vaddq_s16(a[3], a[4]);
-
-  b[4] = vsubq_s16(a[3], a[4]);
-  b[5] = vsubq_s16(a[2], a[5]);
-  b[6] = vsubq_s16(a[1], a[6]);
-  b[7] = vsubq_s16(a[0], a[7]);
-
-  b[8] = a[8];
-  b[9] = a[9];
-
-  butterfly_one_coeff(a[13], a[10], cospi_16_64, &b[13], &b[10]);
-  butterfly_one_coeff(a[12], a[11], cospi_16_64, &b[12], &b[11]);
-
-  b[14] = a[14];
-  b[15] = a[15];
-
-  b[16] = vaddq_s16(in[16], a[23]);
-  b[17] = vaddq_s16(in[17], a[22]);
-  b[18] = vaddq_s16(in[18], a[21]);
-  b[19] = vaddq_s16(in[19], a[20]);
-
-  b[20] = vsubq_s16(in[19], a[20]);
-  b[21] = vsubq_s16(in[18], a[21]);
-  b[22] = vsubq_s16(in[17], a[22]);
-  b[23] = vsubq_s16(in[16], a[23]);
-
-  b[24] = vsubq_s16(in[31], a[24]);
-  b[25] = vsubq_s16(in[30], a[25]);
-  b[26] = vsubq_s16(in[29], a[26]);
-  b[27] = vsubq_s16(in[28], a[27]);
-
-  b[28] = vaddq_s16(in[28], a[27]);
-  b[29] = vaddq_s16(in[29], a[26]);
-  b[30] = vaddq_s16(in[30], a[25]);
-  b[31] = vaddq_s16(in[31], a[24]);
-
-  // Stage 4.
-  a[0] = vaddq_s16(b[0], b[3]);
-  a[1] = vaddq_s16(b[1], b[2]);
-  a[2] = vsubq_s16(b[1], b[2]);
-  a[3] = vsubq_s16(b[0], b[3]);
-
-  a[4] = b[4];
-
-  butterfly_one_coeff(b[6], b[5], cospi_16_64, &a[6], &a[5]);
-
-  a[7] = b[7];
-
-  a[8] = vaddq_s16(b[8], b[11]);
-  a[9] = vaddq_s16(b[9], b[10]);
-  a[10] = vsubq_s16(b[9], b[10]);
-  a[11] = vsubq_s16(b[8], b[11]);
-  a[12] = vsubq_s16(b[15], b[12]);
-  a[13] = vsubq_s16(b[14], b[13]);
-  a[14] = vaddq_s16(b[14], b[13]);
-  a[15] = vaddq_s16(b[15], b[12]);
-
-  a[16] = b[16];
-  a[17] = b[17];
-
-  butterfly_two_coeff(b[29], b[18], cospi_24_64, cospi_8_64, &a[29], &a[18]);
-  butterfly_two_coeff(b[28], b[19], cospi_24_64, cospi_8_64, &a[28], &a[19]);
-  butterfly_two_coeff(b[27], b[20], -cospi_8_64, cospi_24_64, &a[27], &a[20]);
-  butterfly_two_coeff(b[26], b[21], -cospi_8_64, cospi_24_64, &a[26], &a[21]);
-
-  a[22] = b[22];
-  a[23] = b[23];
-  a[24] = b[24];
-  a[25] = b[25];
-
-  a[30] = b[30];
-  a[31] = b[31];
-
-  // Stage 5.
-  butterfly_one_coeff(a[0], a[1], cospi_16_64, &b[0], &b[1]);
-  butterfly_two_coeff(a[3], a[2], cospi_24_64, cospi_8_64, &b[2], &b[3]);
-
-  b[4] = vaddq_s16(a[4], a[5]);
-  b[5] = vsubq_s16(a[4], a[5]);
-  b[6] = vsubq_s16(a[7], a[6]);
-  b[7] = vaddq_s16(a[7], a[6]);
-
-  b[8] = a[8];
-
-  butterfly_two_coeff(a[14], a[9], cospi_24_64, cospi_8_64, &b[14], &b[9]);
-  butterfly_two_coeff(a[13], a[10], -cospi_8_64, cospi_24_64, &b[13], &b[10]);
-
-  b[11] = a[11];
-  b[12] = a[12];
-
-  b[15] = a[15];
-
-  b[16] = vaddq_s16(a[19], a[16]);
-  b[17] = vaddq_s16(a[18], a[17]);
-  b[18] = vsubq_s16(a[17], a[18]);
-  b[19] = vsubq_s16(a[16], a[19]);
-  b[20] = vsubq_s16(a[23], a[20]);
-  b[21] = vsubq_s16(a[22], a[21]);
-  b[22] = vaddq_s16(a[21], a[22]);
-  b[23] = vaddq_s16(a[20], a[23]);
-  b[24] = vaddq_s16(a[27], a[24]);
-  b[25] = vaddq_s16(a[26], a[25]);
-  b[26] = vsubq_s16(a[25], a[26]);
-  b[27] = vsubq_s16(a[24], a[27]);
-  b[28] = vsubq_s16(a[31], a[28]);
-  b[29] = vsubq_s16(a[30], a[29]);
-  b[30] = vaddq_s16(a[29], a[30]);
-  b[31] = vaddq_s16(a[28], a[31]);
-
-  // Stage 6.
-  a[0] = b[0];
-  a[1] = b[1];
-  a[2] = b[2];
-  a[3] = b[3];
-
-  butterfly_two_coeff(b[7], b[4], cospi_28_64, cospi_4_64, &a[4], &a[7]);
-  butterfly_two_coeff(b[6], b[5], cospi_12_64, cospi_20_64, &a[5], &a[6]);
-
-  a[8] = vaddq_s16(b[8], b[9]);
-  a[9] = vsubq_s16(b[8], b[9]);
-  a[10] = vsubq_s16(b[11], b[10]);
-  a[11] = vaddq_s16(b[11], b[10]);
-  a[12] = vaddq_s16(b[12], b[13]);
-  a[13] = vsubq_s16(b[12], b[13]);
-  a[14] = vsubq_s16(b[15], b[14]);
-  a[15] = vaddq_s16(b[15], b[14]);
-
-  a[16] = b[16];
-  a[19] = b[19];
-  a[20] = b[20];
-  a[23] = b[23];
-  a[24] = b[24];
-  a[27] = b[27];
-  a[28] = b[28];
-  a[31] = b[31];
-
-  butterfly_two_coeff(b[30], b[17], cospi_28_64, cospi_4_64, &a[30], &a[17]);
-  butterfly_two_coeff(b[29], b[18], -cospi_4_64, cospi_28_64, &a[29], &a[18]);
-
-  butterfly_two_coeff(b[26], b[21], cospi_12_64, cospi_20_64, &a[26], &a[21]);
-  butterfly_two_coeff(b[25], b[22], -cospi_20_64, cospi_12_64, &a[25], &a[22]);
-
-  // Stage 7.
-  b[0] = a[0];
-  b[1] = a[1];
-  b[2] = a[2];
-  b[3] = a[3];
-  b[4] = a[4];
-  b[5] = a[5];
-  b[6] = a[6];
-  b[7] = a[7];
-
-  butterfly_two_coeff(a[15], a[8], cospi_30_64, cospi_2_64, &b[8], &b[15]);
-  butterfly_two_coeff(a[14], a[9], cospi_14_64, cospi_18_64, &b[9], &b[14]);
-  butterfly_two_coeff(a[13], a[10], cospi_22_64, cospi_10_64, &b[10], &b[13]);
-  butterfly_two_coeff(a[12], a[11], cospi_6_64, cospi_26_64, &b[11], &b[12]);
-
-  b[16] = vaddq_s16(a[16], a[17]);
-  b[17] = vsubq_s16(a[16], a[17]);
-  b[18] = vsubq_s16(a[19], a[18]);
-  b[19] = vaddq_s16(a[19], a[18]);
-  b[20] = vaddq_s16(a[20], a[21]);
-  b[21] = vsubq_s16(a[20], a[21]);
-  b[22] = vsubq_s16(a[23], a[22]);
-  b[23] = vaddq_s16(a[23], a[22]);
-  b[24] = vaddq_s16(a[24], a[25]);
-  b[25] = vsubq_s16(a[24], a[25]);
-  b[26] = vsubq_s16(a[27], a[26]);
-  b[27] = vaddq_s16(a[27], a[26]);
-  b[28] = vaddq_s16(a[28], a[29]);
-  b[29] = vsubq_s16(a[28], a[29]);
-  b[30] = vsubq_s16(a[31], a[30]);
-  b[31] = vaddq_s16(a[31], a[30]);
-
-  // Final stage.
-  // Also compute partial rounding shift:
-  // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
-  out[0] = sub_round_shift(b[0]);
-  out[16] = sub_round_shift(b[1]);
-  out[8] = sub_round_shift(b[2]);
-  out[24] = sub_round_shift(b[3]);
-  out[4] = sub_round_shift(b[4]);
-  out[20] = sub_round_shift(b[5]);
-  out[12] = sub_round_shift(b[6]);
-  out[28] = sub_round_shift(b[7]);
-  out[2] = sub_round_shift(b[8]);
-  out[18] = sub_round_shift(b[9]);
-  out[10] = sub_round_shift(b[10]);
-  out[26] = sub_round_shift(b[11]);
-  out[6] = sub_round_shift(b[12]);
-  out[22] = sub_round_shift(b[13]);
-  out[14] = sub_round_shift(b[14]);
-  out[30] = sub_round_shift(b[15]);
-
-  butterfly_two_coeff(b[31], b[16], cospi_31_64, cospi_1_64, &a[1], &a[31]);
-  out[1] = sub_round_shift(a[1]);
-  out[31] = sub_round_shift(a[31]);
-
-  butterfly_two_coeff(b[30], b[17], cospi_15_64, cospi_17_64, &a[17], &a[15]);
-  out[17] = sub_round_shift(a[17]);
-  out[15] = sub_round_shift(a[15]);
-
-  butterfly_two_coeff(b[29], b[18], cospi_23_64, cospi_9_64, &a[9], &a[23]);
-  out[9] = sub_round_shift(a[9]);
-  out[23] = sub_round_shift(a[23]);
-
-  butterfly_two_coeff(b[28], b[19], cospi_7_64, cospi_25_64, &a[25], &a[7]);
-  out[25] = sub_round_shift(a[25]);
-  out[7] = sub_round_shift(a[7]);
-
-  butterfly_two_coeff(b[27], b[20], cospi_27_64, cospi_5_64, &a[5], &a[27]);
-  out[5] = sub_round_shift(a[5]);
-  out[27] = sub_round_shift(a[27]);
-
-  butterfly_two_coeff(b[26], b[21], cospi_11_64, cospi_21_64, &a[21], &a[11]);
-  out[21] = sub_round_shift(a[21]);
-  out[11] = sub_round_shift(a[11]);
-
-  butterfly_two_coeff(b[25], b[22], cospi_19_64, cospi_13_64, &a[13], &a[19]);
-  out[13] = sub_round_shift(a[13]);
-  out[19] = sub_round_shift(a[19]);
-
-  butterfly_two_coeff(b[24], b[23], cospi_3_64, cospi_29_64, &a[29], &a[3]);
-  out[29] = sub_round_shift(a[29]);
-  out[3] = sub_round_shift(a[3]);
-}
-
-#define PASS_THROUGH(src, dst, element)    \
-  do {                                     \
-    dst##_lo[element] = src##_lo[element]; \
-    dst##_hi[element] = src##_hi[element]; \
-  } while (0)
-
-#define ADD_S16_S32(a, left_index, right_index, b, b_index)                   \
-  do {                                                                        \
-    b##_lo[b_index] =                                                         \
-        vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
-    b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]),                 \
-                                vget_high_s16(a[right_index]));               \
-  } while (0)
-
-#define SUB_S16_S32(a, left_index, right_index, b, b_index)                   \
-  do {                                                                        \
-    b##_lo[b_index] =                                                         \
-        vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
-    b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]),                 \
-                                vget_high_s16(a[right_index]));               \
-  } while (0)
-
-#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index)                     \
-  do {                                                                       \
-    c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index]));  \
-    c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \
-  } while (0)
-
-#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \
-  do {                                                                     \
-    temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index]));           \
-    temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index]));          \
-    c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]);   \
-    c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]);   \
-  } while (0)
-
-#define ADD_S32(a, left_index, right_index, b, b_index)                   \
-  do {                                                                    \
-    b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \
-    b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \
-  } while (0)
-
-#define SUB_S32(a, left_index, right_index, b, b_index)                   \
-  do {                                                                    \
-    b##_lo[b_index] = vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \
-    b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \
-  } while (0)
-
-// Like butterfly_one_coeff, but don't narrow results.
-static INLINE void butterfly_one_coeff_s16_s32(
-    const int16x8_t a, const int16x8_t b, const tran_high_t constant,
-    int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
-    int32x4_t *sub_hi) {
-  const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant);
-  const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant);
-  const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant);
-  const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant);
-  const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant);
-  const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant);
-  *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
-  *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
-  *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
-  *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
-}
-
-#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b,   \
-                              add_index, sub_index)                      \
-  do {                                                                   \
-    butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \
-                                &b##_lo[add_index], &b##_hi[add_index],  \
-                                &b##_lo[sub_index], &b##_hi[sub_index]); \
-  } while (0)
-
-// Like butterfly_one_coeff, but with s32.
-static INLINE void butterfly_one_coeff_s32(
-    const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
-    const int32x4_t b_hi, const int32_t constant, int32x4_t *add_lo,
-    int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
-  const int32x4_t a_lo_0 = vmulq_n_s32(a_lo, constant);
-  const int32x4_t a_hi_0 = vmulq_n_s32(a_hi, constant);
-  const int32x4_t sum0 = vmlaq_n_s32(a_lo_0, b_lo, constant);
-  const int32x4_t sum1 = vmlaq_n_s32(a_hi_0, b_hi, constant);
-  const int32x4_t diff0 = vmlsq_n_s32(a_lo_0, b_lo, constant);
-  const int32x4_t diff1 = vmlsq_n_s32(a_hi_0, b_hi, constant);
-  *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
-  *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
-  *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
-  *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
-}
-
-#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \
-                          sub_index)                                          \
-  do {                                                                        \
-    butterfly_one_coeff_s32(a##_lo[left_index], a##_hi[left_index],           \
-                            a##_lo[right_index], a##_hi[right_index],         \
-                            constant, &b##_lo[add_index], &b##_hi[add_index], \
-                            &b##_lo[sub_index], &b##_hi[sub_index]);          \
-  } while (0)
-
-// Like butterfly_two_coeff, but with s32.
-static INLINE void butterfly_two_coeff_s32(
-    const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
-    const int32x4_t b_hi, const int32_t constant0, const int32_t constant1,
-    int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
-    int32x4_t *sub_hi) {
-  const int32x4_t a0 = vmulq_n_s32(a_lo, constant0);
-  const int32x4_t a1 = vmulq_n_s32(a_hi, constant0);
-  const int32x4_t a2 = vmulq_n_s32(a_lo, constant1);
-  const int32x4_t a3 = vmulq_n_s32(a_hi, constant1);
-  const int32x4_t sum0 = vmlaq_n_s32(a2, b_lo, constant0);
-  const int32x4_t sum1 = vmlaq_n_s32(a3, b_hi, constant0);
-  const int32x4_t diff0 = vmlsq_n_s32(a0, b_lo, constant1);
-  const int32x4_t diff1 = vmlsq_n_s32(a1, b_hi, constant1);
-  *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
-  *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
-  *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
-  *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
-}
-
-#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant,           \
-                          right_constant, b, add_index, sub_index)             \
-  do {                                                                         \
-    butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index],            \
-                            a##_lo[right_index], a##_hi[right_index],          \
-                            left_constant, right_constant, &b##_lo[add_index], \
-                            &b##_hi[add_index], &b##_lo[sub_index],            \
-                            &b##_hi[sub_index]);                               \
-  } while (0)
-
-// Add 1 if positive, 2 if negative, and shift by 2.
-// In practice, add 1, then add the sign bit, then shift without rounding.
-static INLINE int16x8_t add_round_shift_s32(const int32x4_t a_lo,
-                                            const int32x4_t a_hi) {
-  const int32x4_t one = vdupq_n_s32(1);
-  const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo);
-  const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31);
-  const int32x4_t a_lo_sign_s32 = vreinterpretq_s32_u32(a_lo_sign_u32);
-  const int16x4_t b_lo =
-      vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_lo, a_lo_sign_s32), one), 2);
-  const uint32x4_t a_hi_u32 = vreinterpretq_u32_s32(a_hi);
-  const uint32x4_t a_hi_sign_u32 = vshrq_n_u32(a_hi_u32, 31);
-  const int32x4_t a_hi_sign_s32 = vreinterpretq_s32_u32(a_hi_sign_u32);
-  const int16x4_t b_hi =
-      vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_hi, a_hi_sign_s32), one), 2);
-  return vcombine_s16(b_lo, b_hi);
-}
-
-static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) {
-  int16x8_t a[32];
-  int16x8_t b[32];
-  int32x4_t c_lo[32];
-  int32x4_t c_hi[32];
-  int32x4_t d_lo[32];
-  int32x4_t d_hi[32];
-
-  // Stage 1. Done as part of the load for the first pass.
-  a[0] = vaddq_s16(in[0], in[31]);
-  a[1] = vaddq_s16(in[1], in[30]);
-  a[2] = vaddq_s16(in[2], in[29]);
-  a[3] = vaddq_s16(in[3], in[28]);
-  a[4] = vaddq_s16(in[4], in[27]);
-  a[5] = vaddq_s16(in[5], in[26]);
-  a[6] = vaddq_s16(in[6], in[25]);
-  a[7] = vaddq_s16(in[7], in[24]);
-  a[8] = vaddq_s16(in[8], in[23]);
-  a[9] = vaddq_s16(in[9], in[22]);
-  a[10] = vaddq_s16(in[10], in[21]);
-  a[11] = vaddq_s16(in[11], in[20]);
-  a[12] = vaddq_s16(in[12], in[19]);
-  a[13] = vaddq_s16(in[13], in[18]);
-  a[14] = vaddq_s16(in[14], in[17]);
-  a[15] = vaddq_s16(in[15], in[16]);
-  a[16] = vsubq_s16(in[15], in[16]);
-  a[17] = vsubq_s16(in[14], in[17]);
-  a[18] = vsubq_s16(in[13], in[18]);
-  a[19] = vsubq_s16(in[12], in[19]);
-  a[20] = vsubq_s16(in[11], in[20]);
-  a[21] = vsubq_s16(in[10], in[21]);
-  a[22] = vsubq_s16(in[9], in[22]);
-  a[23] = vsubq_s16(in[8], in[23]);
-  a[24] = vsubq_s16(in[7], in[24]);
-  a[25] = vsubq_s16(in[6], in[25]);
-  a[26] = vsubq_s16(in[5], in[26]);
-  a[27] = vsubq_s16(in[4], in[27]);
-  a[28] = vsubq_s16(in[3], in[28]);
-  a[29] = vsubq_s16(in[2], in[29]);
-  a[30] = vsubq_s16(in[1], in[30]);
-  a[31] = vsubq_s16(in[0], in[31]);
-
-  // Stage 2.
-  b[0] = vaddq_s16(a[0], a[15]);
-  b[1] = vaddq_s16(a[1], a[14]);
-  b[2] = vaddq_s16(a[2], a[13]);
-  b[3] = vaddq_s16(a[3], a[12]);
-  b[4] = vaddq_s16(a[4], a[11]);
-  b[5] = vaddq_s16(a[5], a[10]);
-  b[6] = vaddq_s16(a[6], a[9]);
-  b[7] = vaddq_s16(a[7], a[8]);
-
-  b[8] = vsubq_s16(a[7], a[8]);
-  b[9] = vsubq_s16(a[6], a[9]);
-  b[10] = vsubq_s16(a[5], a[10]);
-  b[11] = vsubq_s16(a[4], a[11]);
-  b[12] = vsubq_s16(a[3], a[12]);
-  b[13] = vsubq_s16(a[2], a[13]);
-  b[14] = vsubq_s16(a[1], a[14]);
-  b[15] = vsubq_s16(a[0], a[15]);
-
-  b[16] = a[16];
-  b[17] = a[17];
-  b[18] = a[18];
-  b[19] = a[19];
-
-  butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]);
-  butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]);
-  butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]);
-  butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]);
-
-  b[28] = a[28];
-  b[29] = a[29];
-  b[30] = a[30];
-  b[31] = a[31];
-
-  // Stage 3. With extreme values for input this calculation rolls over int16_t.
-  // The sources for b[0] get added multiple times and, through testing, have
-  // been shown to overflow starting here.
-  ADD_S16_S32(b, 0, 7, c, 0);
-  ADD_S16_S32(b, 1, 6, c, 1);
-  ADD_S16_S32(b, 2, 5, c, 2);
-  ADD_S16_S32(b, 3, 4, c, 3);
-  SUB_S16_S32(b, 3, 4, c, 4);
-  SUB_S16_S32(b, 2, 5, c, 5);
-  SUB_S16_S32(b, 1, 6, c, 6);
-  SUB_S16_S32(b, 0, 7, c, 7);
-
-  a[8] = b[8];
-  a[9] = b[9];
-
-  BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10);
-  BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11);
-
-  a[14] = b[14];
-  a[15] = b[15];
-
-  ADD_S16_S32(b, 16, 23, c, 16);
-  ADD_S16_S32(b, 17, 22, c, 17);
-  ADD_S16_S32(b, 18, 21, c, 18);
-  ADD_S16_S32(b, 19, 20, c, 19);
-  SUB_S16_S32(b, 19, 20, c, 20);
-  SUB_S16_S32(b, 18, 21, c, 21);
-  SUB_S16_S32(b, 17, 22, c, 22);
-  SUB_S16_S32(b, 16, 23, c, 23);
-  SUB_S16_S32(b, 31, 24, c, 24);
-  SUB_S16_S32(b, 30, 25, c, 25);
-  SUB_S16_S32(b, 29, 26, c, 26);
-  SUB_S16_S32(b, 28, 27, c, 27);
-  ADD_S16_S32(b, 28, 27, c, 28);
-  ADD_S16_S32(b, 29, 26, c, 29);
-  ADD_S16_S32(b, 30, 25, c, 30);
-  ADD_S16_S32(b, 31, 24, c, 31);
-
-  // Stage 4.
-  ADD_S32(c, 0, 3, d, 0);
-  ADD_S32(c, 1, 2, d, 1);
-  SUB_S32(c, 1, 2, d, 2);
-  SUB_S32(c, 0, 3, d, 3);
-
-  PASS_THROUGH(c, d, 4);
-
-  BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5);
-
-  PASS_THROUGH(c, d, 7);
-
-  ADDW_S16_S32(c, 11, a, 8, d, 8);
-  ADDW_S16_S32(c, 10, a, 9, d, 9);
-  SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10);
-  SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11);
-  SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12);
-  SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13);
-  ADDW_S16_S32(c, 13, b, 14, d, 14);
-  ADDW_S16_S32(c, 12, b, 15, d, 15);
-
-  PASS_THROUGH(c, d, 16);
-  PASS_THROUGH(c, d, 17);
-
-  BUTTERFLY_TWO_S32(c, 29, 18, cospi_24_64, cospi_8_64, d, 29, 18);
-  BUTTERFLY_TWO_S32(c, 28, 19, cospi_24_64, cospi_8_64, d, 28, 19);
-  BUTTERFLY_TWO_S32(c, 27, 20, -cospi_8_64, cospi_24_64, d, 27, 20);
-  BUTTERFLY_TWO_S32(c, 26, 21, -cospi_8_64, cospi_24_64, d, 26, 21);
-
-  PASS_THROUGH(c, d, 22);
-  PASS_THROUGH(c, d, 23);
-  PASS_THROUGH(c, d, 24);
-  PASS_THROUGH(c, d, 25);
-
-  PASS_THROUGH(c, d, 30);
-  PASS_THROUGH(c, d, 31);
-
-  // Stage 5.
-  BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1);
-  BUTTERFLY_TWO_S32(d, 3, 2, cospi_24_64, cospi_8_64, c, 2, 3);
-
-  ADD_S32(d, 4, 5, c, 4);
-  SUB_S32(d, 4, 5, c, 5);
-  SUB_S32(d, 7, 6, c, 6);
-  ADD_S32(d, 7, 6, c, 7);
-
-  PASS_THROUGH(d, c, 8);
-
-  BUTTERFLY_TWO_S32(d, 14, 9, cospi_24_64, cospi_8_64, c, 14, 9);
-  BUTTERFLY_TWO_S32(d, 13, 10, -cospi_8_64, cospi_24_64, c, 13, 10);
-
-  PASS_THROUGH(d, c, 11);
-  PASS_THROUGH(d, c, 12);
-  PASS_THROUGH(d, c, 15);
-
-  ADD_S32(d, 16, 19, c, 16);
-  ADD_S32(d, 17, 18, c, 17);
-  SUB_S32(d, 17, 18, c, 18);
-  SUB_S32(d, 16, 19, c, 19);
-  SUB_S32(d, 23, 20, c, 20);
-  SUB_S32(d, 22, 21, c, 21);
-  ADD_S32(d, 22, 21, c, 22);
-  ADD_S32(d, 23, 20, c, 23);
-  ADD_S32(d, 24, 27, c, 24);
-  ADD_S32(d, 25, 26, c, 25);
-  SUB_S32(d, 25, 26, c, 26);
-  SUB_S32(d, 24, 27, c, 27);
-  SUB_S32(d, 31, 28, c, 28);
-  SUB_S32(d, 30, 29, c, 29);
-  ADD_S32(d, 30, 29, c, 30);
-  ADD_S32(d, 31, 28, c, 31);
-
-  // Stage 6.
-  PASS_THROUGH(c, d, 0);
-  PASS_THROUGH(c, d, 1);
-  PASS_THROUGH(c, d, 2);
-  PASS_THROUGH(c, d, 3);
-
-  BUTTERFLY_TWO_S32(c, 7, 4, cospi_28_64, cospi_4_64, d, 4, 7);
-  BUTTERFLY_TWO_S32(c, 6, 5, cospi_12_64, cospi_20_64, d, 5, 6);
-
-  ADD_S32(c, 8, 9, d, 8);
-  SUB_S32(c, 8, 9, d, 9);
-  SUB_S32(c, 11, 10, d, 10);
-  ADD_S32(c, 11, 10, d, 11);
-  ADD_S32(c, 12, 13, d, 12);
-  SUB_S32(c, 12, 13, d, 13);
-  SUB_S32(c, 15, 14, d, 14);
-  ADD_S32(c, 15, 14, d, 15);
-
-  PASS_THROUGH(c, d, 16);
-  PASS_THROUGH(c, d, 19);
-  PASS_THROUGH(c, d, 20);
-  PASS_THROUGH(c, d, 23);
-  PASS_THROUGH(c, d, 24);
-  PASS_THROUGH(c, d, 27);
-  PASS_THROUGH(c, d, 28);
-  PASS_THROUGH(c, d, 31);
-
-  BUTTERFLY_TWO_S32(c, 30, 17, cospi_28_64, cospi_4_64, d, 30, 17);
-  BUTTERFLY_TWO_S32(c, 29, 18, -cospi_4_64, cospi_28_64, d, 29, 18);
-  BUTTERFLY_TWO_S32(c, 26, 21, cospi_12_64, cospi_20_64, d, 26, 21);
-  BUTTERFLY_TWO_S32(c, 25, 22, -cospi_20_64, cospi_12_64, d, 25, 22);
-
-  // Stage 7.
-  PASS_THROUGH(d, c, 0);
-  PASS_THROUGH(d, c, 1);
-  PASS_THROUGH(d, c, 2);
-  PASS_THROUGH(d, c, 3);
-  PASS_THROUGH(d, c, 4);
-  PASS_THROUGH(d, c, 5);
-  PASS_THROUGH(d, c, 6);
-  PASS_THROUGH(d, c, 7);
-
-  BUTTERFLY_TWO_S32(d, 15, 8, cospi_30_64, cospi_2_64, c, 8, 15);
-  BUTTERFLY_TWO_S32(d, 14, 9, cospi_14_64, cospi_18_64, c, 9, 14);
-  BUTTERFLY_TWO_S32(d, 13, 10, cospi_22_64, cospi_10_64, c, 10, 13);
-  BUTTERFLY_TWO_S32(d, 12, 11, cospi_6_64, cospi_26_64, c, 11, 12);
-
-  ADD_S32(d, 16, 17, c, 16);
-  SUB_S32(d, 16, 17, c, 17);
-  SUB_S32(d, 19, 18, c, 18);
-  ADD_S32(d, 19, 18, c, 19);
-  ADD_S32(d, 20, 21, c, 20);
-  SUB_S32(d, 20, 21, c, 21);
-  SUB_S32(d, 23, 22, c, 22);
-  ADD_S32(d, 23, 22, c, 23);
-  ADD_S32(d, 24, 25, c, 24);
-  SUB_S32(d, 24, 25, c, 25);
-  SUB_S32(d, 27, 26, c, 26);
-  ADD_S32(d, 27, 26, c, 27);
-  ADD_S32(d, 28, 29, c, 28);
-  SUB_S32(d, 28, 29, c, 29);
-  SUB_S32(d, 31, 30, c, 30);
-  ADD_S32(d, 31, 30, c, 31);
-
-  // Final stage.
-  // Roll rounding into this function so we can pass back int16x8.
-
-  out[0] = add_round_shift_s32(c_lo[0], c_hi[0]);
-  out[16] = add_round_shift_s32(c_lo[1], c_hi[1]);
-
-  out[8] = add_round_shift_s32(c_lo[2], c_hi[2]);
-  out[24] = add_round_shift_s32(c_lo[3], c_hi[3]);
-  out[4] = add_round_shift_s32(c_lo[4], c_hi[4]);
-  out[20] = add_round_shift_s32(c_lo[5], c_hi[5]);
-  out[12] = add_round_shift_s32(c_lo[6], c_hi[6]);
-
-  out[28] = add_round_shift_s32(c_lo[7], c_hi[7]);
-  out[2] = add_round_shift_s32(c_lo[8], c_hi[8]);
-  out[18] = add_round_shift_s32(c_lo[9], c_hi[9]);
-  out[10] = add_round_shift_s32(c_lo[10], c_hi[10]);
-
-  out[26] = add_round_shift_s32(c_lo[11], c_hi[11]);
-  out[6] = add_round_shift_s32(c_lo[12], c_hi[12]);
-  out[22] = add_round_shift_s32(c_lo[13], c_hi[13]);
-  out[14] = add_round_shift_s32(c_lo[14], c_hi[14]);
-  out[30] = add_round_shift_s32(c_lo[15], c_hi[15]);
-
-  BUTTERFLY_TWO_S32(c, 31, 16, cospi_31_64, cospi_1_64, d, 1, 31);
-  out[1] = add_round_shift_s32(d_lo[1], d_hi[1]);
-  out[31] = add_round_shift_s32(d_lo[31], d_hi[31]);
-
-  BUTTERFLY_TWO_S32(c, 30, 17, cospi_15_64, cospi_17_64, d, 17, 15);
-  out[17] = add_round_shift_s32(d_lo[17], d_hi[17]);
-  out[15] = add_round_shift_s32(d_lo[15], d_hi[15]);
-
-  BUTTERFLY_TWO_S32(c, 29, 18, cospi_23_64, cospi_9_64, d, 9, 23);
-  out[9] = add_round_shift_s32(d_lo[9], d_hi[9]);
-  out[23] = add_round_shift_s32(d_lo[23], d_hi[23]);
-
-  BUTTERFLY_TWO_S32(c, 28, 19, cospi_7_64, cospi_25_64, d, 25, 7);
-  out[25] = add_round_shift_s32(d_lo[25], d_hi[25]);
-  out[7] = add_round_shift_s32(d_lo[7], d_hi[7]);
-
-  BUTTERFLY_TWO_S32(c, 27, 20, cospi_27_64, cospi_5_64, d, 5, 27);
-  out[5] = add_round_shift_s32(d_lo[5], d_hi[5]);
-  out[27] = add_round_shift_s32(d_lo[27], d_hi[27]);
-
-  BUTTERFLY_TWO_S32(c, 26, 21, cospi_11_64, cospi_21_64, d, 21, 11);
-  out[21] = add_round_shift_s32(d_lo[21], d_hi[21]);
-  out[11] = add_round_shift_s32(d_lo[11], d_hi[11]);
-
-  BUTTERFLY_TWO_S32(c, 25, 22, cospi_19_64, cospi_13_64, d, 13, 19);
-  out[13] = add_round_shift_s32(d_lo[13], d_hi[13]);
-  out[19] = add_round_shift_s32(d_lo[19], d_hi[19]);
-
-  BUTTERFLY_TWO_S32(c, 24, 23, cospi_3_64, cospi_29_64, d, 29, 3);
-  out[29] = add_round_shift_s32(d_lo[29], d_hi[29]);
-  out[3] = add_round_shift_s32(d_lo[3], d_hi[3]);
-}
-
-// Add 1 if positive, 2 if negative, and shift by 2.
-// In practice, add 1, then add the sign bit, then shift without rounding.
-static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) {
-  const int16x8_t one = vdupq_n_s16(1);
-  const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
-  const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
-  const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
-  return vshrq_n_s16(vaddq_s16(vaddq_s16(a, a_sign_s16), one), 2);
-}
-
-static void dct_body_second_pass_rd(const int16x8_t *in, int16x8_t *out) {
-  int16x8_t a[32];
-  int16x8_t b[32];
-
-  // Stage 1. Done as part of the load for the first pass.
-  a[0] = vaddq_s16(in[0], in[31]);
-  a[1] = vaddq_s16(in[1], in[30]);
-  a[2] = vaddq_s16(in[2], in[29]);
-  a[3] = vaddq_s16(in[3], in[28]);
-  a[4] = vaddq_s16(in[4], in[27]);
-  a[5] = vaddq_s16(in[5], in[26]);
-  a[6] = vaddq_s16(in[6], in[25]);
-  a[7] = vaddq_s16(in[7], in[24]);
-  a[8] = vaddq_s16(in[8], in[23]);
-  a[9] = vaddq_s16(in[9], in[22]);
-  a[10] = vaddq_s16(in[10], in[21]);
-  a[11] = vaddq_s16(in[11], in[20]);
-  a[12] = vaddq_s16(in[12], in[19]);
-  a[13] = vaddq_s16(in[13], in[18]);
-  a[14] = vaddq_s16(in[14], in[17]);
-  a[15] = vaddq_s16(in[15], in[16]);
-  a[16] = vsubq_s16(in[15], in[16]);
-  a[17] = vsubq_s16(in[14], in[17]);
-  a[18] = vsubq_s16(in[13], in[18]);
-  a[19] = vsubq_s16(in[12], in[19]);
-  a[20] = vsubq_s16(in[11], in[20]);
-  a[21] = vsubq_s16(in[10], in[21]);
-  a[22] = vsubq_s16(in[9], in[22]);
-  a[23] = vsubq_s16(in[8], in[23]);
-  a[24] = vsubq_s16(in[7], in[24]);
-  a[25] = vsubq_s16(in[6], in[25]);
-  a[26] = vsubq_s16(in[5], in[26]);
-  a[27] = vsubq_s16(in[4], in[27]);
-  a[28] = vsubq_s16(in[3], in[28]);
-  a[29] = vsubq_s16(in[2], in[29]);
-  a[30] = vsubq_s16(in[1], in[30]);
-  a[31] = vsubq_s16(in[0], in[31]);
-
-  // Stage 2.
-  // For the "rd" version, all the values are rounded down after stage 2 to keep
-  // the values in 16 bits.
-  b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15]));
-  b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14]));
-  b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13]));
-  b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12]));
-  b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11]));
-  b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10]));
-  b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9]));
-  b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8]));
-
-  b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8]));
-  b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9]));
-  b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10]));
-  b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11]));
-  b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12]));
-  b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13]));
-  b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14]));
-  b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15]));
-
-  b[16] = add_round_shift_s16(a[16]);
-  b[17] = add_round_shift_s16(a[17]);
-  b[18] = add_round_shift_s16(a[18]);
-  b[19] = add_round_shift_s16(a[19]);
-
-  butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]);
-  butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]);
-  butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]);
-  butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]);
-  b[20] = add_round_shift_s16(b[20]);
-  b[21] = add_round_shift_s16(b[21]);
-  b[22] = add_round_shift_s16(b[22]);
-  b[23] = add_round_shift_s16(b[23]);
-  b[24] = add_round_shift_s16(b[24]);
-  b[25] = add_round_shift_s16(b[25]);
-  b[26] = add_round_shift_s16(b[26]);
-  b[27] = add_round_shift_s16(b[27]);
-
-  b[28] = add_round_shift_s16(a[28]);
-  b[29] = add_round_shift_s16(a[29]);
-  b[30] = add_round_shift_s16(a[30]);
-  b[31] = add_round_shift_s16(a[31]);
-
-  // Stage 3.
-  a[0] = vaddq_s16(b[0], b[7]);
-  a[1] = vaddq_s16(b[1], b[6]);
-  a[2] = vaddq_s16(b[2], b[5]);
-  a[3] = vaddq_s16(b[3], b[4]);
-
-  a[4] = vsubq_s16(b[3], b[4]);
-  a[5] = vsubq_s16(b[2], b[5]);
-  a[6] = vsubq_s16(b[1], b[6]);
-  a[7] = vsubq_s16(b[0], b[7]);
-
-  a[8] = b[8];
-  a[9] = b[9];
-
-  butterfly_one_coeff(b[13], b[10], cospi_16_64, &a[13], &a[10]);
-  butterfly_one_coeff(b[12], b[11], cospi_16_64, &a[12], &a[11]);
-
-  a[14] = b[14];
-  a[15] = b[15];
-
-  a[16] = vaddq_s16(b[16], b[23]);
-  a[17] = vaddq_s16(b[17], b[22]);
-  a[18] = vaddq_s16(b[18], b[21]);
-  a[19] = vaddq_s16(b[19], b[20]);
-
-  a[20] = vsubq_s16(b[19], b[20]);
-  a[21] = vsubq_s16(b[18], b[21]);
-  a[22] = vsubq_s16(b[17], b[22]);
-  a[23] = vsubq_s16(b[16], b[23]);
-
-  a[24] = vsubq_s16(b[31], b[24]);
-  a[25] = vsubq_s16(b[30], b[25]);
-  a[26] = vsubq_s16(b[29], b[26]);
-  a[27] = vsubq_s16(b[28], b[27]);
-
-  a[28] = vaddq_s16(b[28], b[27]);
-  a[29] = vaddq_s16(b[29], b[26]);
-  a[30] = vaddq_s16(b[30], b[25]);
-  a[31] = vaddq_s16(b[31], b[24]);
-
-  // Stage 4.
-  b[0] = vaddq_s16(a[0], a[3]);
-  b[1] = vaddq_s16(a[1], a[2]);
-  b[2] = vsubq_s16(a[1], a[2]);
-  b[3] = vsubq_s16(a[0], a[3]);
-
-  b[4] = a[4];
-
-  butterfly_one_coeff(a[6], a[5], cospi_16_64, &b[6], &b[5]);
-
-  b[7] = a[7];
-
-  b[8] = vaddq_s16(a[8], a[11]);
-  b[9] = vaddq_s16(a[9], a[10]);
-  b[10] = vsubq_s16(a[9], a[10]);
-  b[11] = vsubq_s16(a[8], a[11]);
-  b[12] = vsubq_s16(a[15], a[12]);
-  b[13] = vsubq_s16(a[14], a[13]);
-  b[14] = vaddq_s16(a[14], a[13]);
-  b[15] = vaddq_s16(a[15], a[12]);
-
-  b[16] = a[16];
-  b[17] = a[17];
-
-  butterfly_two_coeff(a[29], a[18], cospi_24_64, cospi_8_64, &b[29], &b[18]);
-  butterfly_two_coeff(a[28], a[19], cospi_24_64, cospi_8_64, &b[28], &b[19]);
-  butterfly_two_coeff(a[27], a[20], -cospi_8_64, cospi_24_64, &b[27], &b[20]);
-  butterfly_two_coeff(a[26], a[21], -cospi_8_64, cospi_24_64, &b[26], &b[21]);
-
-  b[22] = a[22];
-  b[23] = a[23];
-  b[24] = a[24];
-  b[25] = a[25];
-
-  b[30] = a[30];
-  b[31] = a[31];
-
-  // Stage 5.
-  butterfly_one_coeff(b[0], b[1], cospi_16_64, &a[0], &a[1]);
-  butterfly_two_coeff(b[3], b[2], cospi_24_64, cospi_8_64, &a[2], &a[3]);
-
-  a[4] = vaddq_s16(b[4], b[5]);
-  a[5] = vsubq_s16(b[4], b[5]);
-  a[6] = vsubq_s16(b[7], b[6]);
-  a[7] = vaddq_s16(b[7], b[6]);
-
-  a[8] = b[8];
-
-  butterfly_two_coeff(b[14], b[9], cospi_24_64, cospi_8_64, &a[14], &a[9]);
-  butterfly_two_coeff(b[13], b[10], -cospi_8_64, cospi_24_64, &a[13], &a[10]);
-
-  a[11] = b[11];
-  a[12] = b[12];
-
-  a[15] = b[15];
-
-  a[16] = vaddq_s16(b[19], b[16]);
-  a[17] = vaddq_s16(b[18], b[17]);
-  a[18] = vsubq_s16(b[17], b[18]);
-  a[19] = vsubq_s16(b[16], b[19]);
-  a[20] = vsubq_s16(b[23], b[20]);
-  a[21] = vsubq_s16(b[22], b[21]);
-  a[22] = vaddq_s16(b[21], b[22]);
-  a[23] = vaddq_s16(b[20], b[23]);
-  a[24] = vaddq_s16(b[27], b[24]);
-  a[25] = vaddq_s16(b[26], b[25]);
-  a[26] = vsubq_s16(b[25], b[26]);
-  a[27] = vsubq_s16(b[24], b[27]);
-  a[28] = vsubq_s16(b[31], b[28]);
-  a[29] = vsubq_s16(b[30], b[29]);
-  a[30] = vaddq_s16(b[29], b[30]);
-  a[31] = vaddq_s16(b[28], b[31]);
-
-  // Stage 6.
-  b[0] = a[0];
-  b[1] = a[1];
-  b[2] = a[2];
-  b[3] = a[3];
-
-  butterfly_two_coeff(a[7], a[4], cospi_28_64, cospi_4_64, &b[4], &b[7]);
-  butterfly_two_coeff(a[6], a[5], cospi_12_64, cospi_20_64, &b[5], &b[6]);
-
-  b[8] = vaddq_s16(a[8], a[9]);
-  b[9] = vsubq_s16(a[8], a[9]);
-  b[10] = vsubq_s16(a[11], a[10]);
-  b[11] = vaddq_s16(a[11], a[10]);
-  b[12] = vaddq_s16(a[12], a[13]);
-  b[13] = vsubq_s16(a[12], a[13]);
-  b[14] = vsubq_s16(a[15], a[14]);
-  b[15] = vaddq_s16(a[15], a[14]);
-
-  b[16] = a[16];
-  b[19] = a[19];
-  b[20] = a[20];
-  b[23] = a[23];
-  b[24] = a[24];
-  b[27] = a[27];
-  b[28] = a[28];
-  b[31] = a[31];
-
-  butterfly_two_coeff(a[30], a[17], cospi_28_64, cospi_4_64, &b[30], &b[17]);
-  butterfly_two_coeff(a[29], a[18], -cospi_4_64, cospi_28_64, &b[29], &b[18]);
-
-  butterfly_two_coeff(a[26], a[21], cospi_12_64, cospi_20_64, &b[26], &b[21]);
-  butterfly_two_coeff(a[25], a[22], -cospi_20_64, cospi_12_64, &b[25], &b[22]);
-
-  // Stage 7.
-  a[0] = b[0];
-  a[1] = b[1];
-  a[2] = b[2];
-  a[3] = b[3];
-  a[4] = b[4];
-  a[5] = b[5];
-  a[6] = b[6];
-  a[7] = b[7];
-
-  butterfly_two_coeff(b[15], b[8], cospi_30_64, cospi_2_64, &a[8], &a[15]);
-  butterfly_two_coeff(b[14], b[9], cospi_14_64, cospi_18_64, &a[9], &a[14]);
-  butterfly_two_coeff(b[13], b[10], cospi_22_64, cospi_10_64, &a[10], &a[13]);
-  butterfly_two_coeff(b[12], b[11], cospi_6_64, cospi_26_64, &a[11], &a[12]);
-
-  a[16] = vaddq_s16(b[16], b[17]);
-  a[17] = vsubq_s16(b[16], b[17]);
-  a[18] = vsubq_s16(b[19], b[18]);
-  a[19] = vaddq_s16(b[19], b[18]);
-  a[20] = vaddq_s16(b[20], b[21]);
-  a[21] = vsubq_s16(b[20], b[21]);
-  a[22] = vsubq_s16(b[23], b[22]);
-  a[23] = vaddq_s16(b[23], b[22]);
-  a[24] = vaddq_s16(b[24], b[25]);
-  a[25] = vsubq_s16(b[24], b[25]);
-  a[26] = vsubq_s16(b[27], b[26]);
-  a[27] = vaddq_s16(b[27], b[26]);
-  a[28] = vaddq_s16(b[28], b[29]);
-  a[29] = vsubq_s16(b[28], b[29]);
-  a[30] = vsubq_s16(b[31], b[30]);
-  a[31] = vaddq_s16(b[31], b[30]);
-
-  // Final stage.
-  out[0] = a[0];
-  out[16] = a[1];
-  out[8] = a[2];
-  out[24] = a[3];
-  out[4] = a[4];
-  out[20] = a[5];
-  out[12] = a[6];
-  out[28] = a[7];
-  out[2] = a[8];
-  out[18] = a[9];
-  out[10] = a[10];
-  out[26] = a[11];
-  out[6] = a[12];
-  out[22] = a[13];
-  out[14] = a[14];
-  out[30] = a[15];
-
-  butterfly_two_coeff(a[31], a[16], cospi_31_64, cospi_1_64, &out[1], &out[31]);
-  butterfly_two_coeff(a[30], a[17], cospi_15_64, cospi_17_64, &out[17],
-                      &out[15]);
-  butterfly_two_coeff(a[29], a[18], cospi_23_64, cospi_9_64, &out[9], &out[23]);
-  butterfly_two_coeff(a[28], a[19], cospi_7_64, cospi_25_64, &out[25], &out[7]);
-  butterfly_two_coeff(a[27], a[20], cospi_27_64, cospi_5_64, &out[5], &out[27]);
-  butterfly_two_coeff(a[26], a[21], cospi_11_64, cospi_21_64, &out[21],
-                      &out[11]);
-  butterfly_two_coeff(a[25], a[22], cospi_19_64, cospi_13_64, &out[13],
-                      &out[19]);
-  butterfly_two_coeff(a[24], a[23], cospi_3_64, cospi_29_64, &out[29], &out[3]);
-}
-
-#undef PASS_THROUGH
-#undef ADD_S16_S32
-#undef SUB_S16_S32
-#undef ADDW_S16_S32
-#undef SUBW_S16_S32
-#undef ADD_S32
-#undef SUB_S32
-#undef BUTTERFLY_ONE_S16_S32
-#undef BUTTERFLY_ONE_S32
-#undef BUTTERFLY_TWO_S32
-
-// Transpose 8x8 to a new location. Don't use transpose_neon.h because those
-// are all in-place.
-// TODO(johannkoenig): share with other fdcts.
-static INLINE void transpose_8x8(const int16x8_t *a, int16x8_t *b) {
-  // Swap 16 bit elements.
-  const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]);
-  const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]);
-  const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]);
-  const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]);
-
-  // Swap 32 bit elements.
-  const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
-                                   vreinterpretq_s32_s16(c1.val[0]));
-  const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
-                                   vreinterpretq_s32_s16(c1.val[1]));
-  const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]),
-                                   vreinterpretq_s32_s16(c3.val[0]));
-  const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]),
-                                   vreinterpretq_s32_s16(c3.val[1]));
-
-  // Swap 64 bit elements
-  const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]);
-  const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]);
-  const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]);
-  const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]);
-
-  b[0] = e0.val[0];
-  b[1] = e1.val[0];
-  b[2] = e2.val[0];
-  b[3] = e3.val[0];
-  b[4] = e0.val[1];
-  b[5] = e1.val[1];
-  b[6] = e2.val[1];
-  b[7] = e3.val[1];
-}
-
 void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
   int16x8_t temp0[32];
   int16x8_t temp1[32];
@@ -1324,23 +43,27 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
   int16x8_t temp5[32];
 
   // Process in 8x32 columns.
-  load(input, stride, temp0);
-  dct_body_first_pass(temp0, temp1);
+  load_cross(input, stride, temp0);
+  scale_input(temp0, temp5);
+  dct_body_first_pass(temp5, temp1);
 
-  load(input + 8, stride, temp0);
-  dct_body_first_pass(temp0, temp2);
+  load_cross(input + 8, stride, temp0);
+  scale_input(temp0, temp5);
+  dct_body_first_pass(temp5, temp2);
 
-  load(input + 16, stride, temp0);
-  dct_body_first_pass(temp0, temp3);
+  load_cross(input + 16, stride, temp0);
+  scale_input(temp0, temp5);
+  dct_body_first_pass(temp5, temp3);
 
-  load(input + 24, stride, temp0);
-  dct_body_first_pass(temp0, temp4);
+  load_cross(input + 24, stride, temp0);
+  scale_input(temp0, temp5);
+  dct_body_first_pass(temp5, temp4);
 
   // Generate the top row by munging the first set of 8 from each one together.
-  transpose_8x8(&temp1[0], &temp0[0]);
-  transpose_8x8(&temp2[0], &temp0[8]);
-  transpose_8x8(&temp3[0], &temp0[16]);
-  transpose_8x8(&temp4[0], &temp0[24]);
+  transpose_s16_8x8q(&temp1[0], &temp0[0]);
+  transpose_s16_8x8q(&temp2[0], &temp0[8]);
+  transpose_s16_8x8q(&temp3[0], &temp0[16]);
+  transpose_s16_8x8q(&temp4[0], &temp0[24]);
 
   dct_body_second_pass(temp0, temp5);
 
@@ -1355,10 +78,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
   store(output, temp5);
 
   // Second row of 8x32.
-  transpose_8x8(&temp1[8], &temp0[0]);
-  transpose_8x8(&temp2[8], &temp0[8]);
-  transpose_8x8(&temp3[8], &temp0[16]);
-  transpose_8x8(&temp4[8], &temp0[24]);
+  transpose_s16_8x8q(&temp1[8], &temp0[0]);
+  transpose_s16_8x8q(&temp2[8], &temp0[8]);
+  transpose_s16_8x8q(&temp3[8], &temp0[16]);
+  transpose_s16_8x8q(&temp4[8], &temp0[24]);
 
   dct_body_second_pass(temp0, temp5);
 
@@ -1373,10 +96,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
   store(output + 8 * 32, temp5);
 
   // Third row of 8x32
-  transpose_8x8(&temp1[16], &temp0[0]);
-  transpose_8x8(&temp2[16], &temp0[8]);
-  transpose_8x8(&temp3[16], &temp0[16]);
-  transpose_8x8(&temp4[16], &temp0[24]);
+  transpose_s16_8x8q(&temp1[16], &temp0[0]);
+  transpose_s16_8x8q(&temp2[16], &temp0[8]);
+  transpose_s16_8x8q(&temp3[16], &temp0[16]);
+  transpose_s16_8x8q(&temp4[16], &temp0[24]);
 
   dct_body_second_pass(temp0, temp5);
 
@@ -1391,10 +114,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
   store(output + 16 * 32, temp5);
 
   // Final row of 8x32.
-  transpose_8x8(&temp1[24], &temp0[0]);
-  transpose_8x8(&temp2[24], &temp0[8]);
-  transpose_8x8(&temp3[24], &temp0[16]);
-  transpose_8x8(&temp4[24], &temp0[24]);
+  transpose_s16_8x8q(&temp1[24], &temp0[0]);
+  transpose_s16_8x8q(&temp2[24], &temp0[8]);
+  transpose_s16_8x8q(&temp3[24], &temp0[16]);
+  transpose_s16_8x8q(&temp4[24], &temp0[24]);
 
   dct_body_second_pass(temp0, temp5);
 
@@ -1419,23 +142,27 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
   int16x8_t temp5[32];
 
   // Process in 8x32 columns.
-  load(input, stride, temp0);
-  dct_body_first_pass(temp0, temp1);
+  load_cross(input, stride, temp0);
+  scale_input(temp0, temp5);
+  dct_body_first_pass(temp5, temp1);
 
-  load(input + 8, stride, temp0);
-  dct_body_first_pass(temp0, temp2);
+  load_cross(input + 8, stride, temp0);
+  scale_input(temp0, temp5);
+  dct_body_first_pass(temp5, temp2);
 
-  load(input + 16, stride, temp0);
-  dct_body_first_pass(temp0, temp3);
+  load_cross(input + 16, stride, temp0);
+  scale_input(temp0, temp5);
+  dct_body_first_pass(temp5, temp3);
 
-  load(input + 24, stride, temp0);
-  dct_body_first_pass(temp0, temp4);
+  load_cross(input + 24, stride, temp0);
+  scale_input(temp0, temp5);
+  dct_body_first_pass(temp5, temp4);
 
   // Generate the top row by munging the first set of 8 from each one together.
-  transpose_8x8(&temp1[0], &temp0[0]);
-  transpose_8x8(&temp2[0], &temp0[8]);
-  transpose_8x8(&temp3[0], &temp0[16]);
-  transpose_8x8(&temp4[0], &temp0[24]);
+  transpose_s16_8x8q(&temp1[0], &temp0[0]);
+  transpose_s16_8x8q(&temp2[0], &temp0[8]);
+  transpose_s16_8x8q(&temp3[0], &temp0[16]);
+  transpose_s16_8x8q(&temp4[0], &temp0[24]);
 
   dct_body_second_pass_rd(temp0, temp5);
 
@@ -1450,10 +177,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
   store(output, temp5);
 
   // Second row of 8x32.
-  transpose_8x8(&temp1[8], &temp0[0]);
-  transpose_8x8(&temp2[8], &temp0[8]);
-  transpose_8x8(&temp3[8], &temp0[16]);
-  transpose_8x8(&temp4[8], &temp0[24]);
+  transpose_s16_8x8q(&temp1[8], &temp0[0]);
+  transpose_s16_8x8q(&temp2[8], &temp0[8]);
+  transpose_s16_8x8q(&temp3[8], &temp0[16]);
+  transpose_s16_8x8q(&temp4[8], &temp0[24]);
 
   dct_body_second_pass_rd(temp0, temp5);
 
@@ -1468,10 +195,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
   store(output + 8 * 32, temp5);
 
   // Third row of 8x32
-  transpose_8x8(&temp1[16], &temp0[0]);
-  transpose_8x8(&temp2[16], &temp0[8]);
-  transpose_8x8(&temp3[16], &temp0[16]);
-  transpose_8x8(&temp4[16], &temp0[24]);
+  transpose_s16_8x8q(&temp1[16], &temp0[0]);
+  transpose_s16_8x8q(&temp2[16], &temp0[8]);
+  transpose_s16_8x8q(&temp3[16], &temp0[16]);
+  transpose_s16_8x8q(&temp4[16], &temp0[24]);
 
   dct_body_second_pass_rd(temp0, temp5);
 
@@ -1486,10 +213,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
   store(output + 16 * 32, temp5);
 
   // Final row of 8x32.
-  transpose_8x8(&temp1[24], &temp0[0]);
-  transpose_8x8(&temp2[24], &temp0[8]);
-  transpose_8x8(&temp3[24], &temp0[16]);
-  transpose_8x8(&temp4[24], &temp0[24]);
+  transpose_s16_8x8q(&temp1[24], &temp0[0]);
+  transpose_s16_8x8q(&temp2[24], &temp0[8]);
+  transpose_s16_8x8q(&temp3[24], &temp0[16]);
+  transpose_s16_8x8q(&temp4[24], &temp0[24]);
 
   dct_body_second_pass_rd(temp0, temp5);
 
@@ -1503,5 +230,190 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
                     &temp5[29], &temp5[30], &temp5[31]);
   store(output + 24 * 32, temp5);
 }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_highbd_fdct32x32_neon(const int16_t *input, tran_low_t *output,
+                               int stride) {
+  int16x8_t temp0[32];
+  int32x4_t left1[32], left2[32], left3[32], left4[32], right1[32], right2[32],
+      right3[32], right4[32];
+  int32x4_t left5[32], right5[32], left6[32], right6[32], left7[32], right7[32],
+      left8[32], right8[32];
+  int32x4_t temp1[32], temp2[32];
+
+  // Process in 8x32 columns.
+  load_cross(input, stride, temp0);
+  highbd_scale_input(temp0, left1, right1);
+  highbd_dct8x32_body_first_pass(left1, right1);
+  highbd_partial_sub_round_shift(left1, right1);
+
+  load_cross(input + 8, stride, temp0);
+  highbd_scale_input(temp0, left2, right2);
+  highbd_dct8x32_body_first_pass(left2, right2);
+  highbd_partial_sub_round_shift(left2, right2);
+
+  load_cross(input + 16, stride, temp0);
+  highbd_scale_input(temp0, left3, right3);
+  highbd_dct8x32_body_first_pass(left3, right3);
+  highbd_partial_sub_round_shift(left3, right3);
+
+  load_cross(input + 24, stride, temp0);
+  highbd_scale_input(temp0, left4, right4);
+  highbd_dct8x32_body_first_pass(left4, right4);
+  highbd_partial_sub_round_shift(left4, right4);
+
+  // Generate the top row by munging the first set of 8 from each one together.
+  transpose_s32_8x8_2(left1, right1, temp1, temp2);
+  transpose_s32_8x8_2(left2, right2, temp1 + 8, temp2 + 8);
+  transpose_s32_8x8_2(left3, right3, temp1 + 16, temp2 + 16);
+  transpose_s32_8x8_2(left4, right4, temp1 + 24, temp2 + 24);
+
+  highbd_cross_input(temp1, temp2, left5, right5);
+  highbd_dct8x32_body_second_pass(left5, right5);
+  highbd_partial_add_round_shift(left5, right5);
+
+  // Second row of 8x32.
+  transpose_s32_8x8_2(left1 + 8, right1 + 8, temp1, temp2);
+  transpose_s32_8x8_2(left2 + 8, right2 + 8, temp1 + 8, temp2 + 8);
+  transpose_s32_8x8_2(left3 + 8, right3 + 8, temp1 + 16, temp2 + 16);
+  transpose_s32_8x8_2(left4 + 8, right4 + 8, temp1 + 24, temp2 + 24);
+
+  highbd_cross_input(temp1, temp2, left6, right6);
+  highbd_dct8x32_body_second_pass(left6, right6);
+  highbd_partial_add_round_shift(left6, right6);
+
+  // Third row of 8x32
+  transpose_s32_8x8_2(left1 + 16, right1 + 16, temp1, temp2);
+  transpose_s32_8x8_2(left2 + 16, right2 + 16, temp1 + 8, temp2 + 8);
+  transpose_s32_8x8_2(left3 + 16, right3 + 16, temp1 + 16, temp2 + 16);
+  transpose_s32_8x8_2(left4 + 16, right4 + 16, temp1 + 24, temp2 + 24);
+
+  highbd_cross_input(temp1, temp2, left7, right7);
+  highbd_dct8x32_body_second_pass(left7, right7);
+  highbd_partial_add_round_shift(left7, right7);
+
+  // Final row of 8x32.
+  transpose_s32_8x8_2(left1 + 24, right1 + 24, temp1, temp2);
+  transpose_s32_8x8_2(left2 + 24, right2 + 24, temp1 + 8, temp2 + 8);
+  transpose_s32_8x8_2(left3 + 24, right3 + 24, temp1 + 16, temp2 + 16);
+  transpose_s32_8x8_2(left4 + 24, right4 + 24, temp1 + 24, temp2 + 24);
+
+  highbd_cross_input(temp1, temp2, left8, right8);
+  highbd_dct8x32_body_second_pass(left8, right8);
+  highbd_partial_add_round_shift(left8, right8);
+
+  // Final transpose
+  transpose_s32_8x8_2(left5, right5, left1, right1);
+  transpose_s32_8x8_2(left5 + 8, right5 + 8, left2, right2);
+  transpose_s32_8x8_2(left5 + 16, right5 + 16, left3, right3);
+  transpose_s32_8x8_2(left5 + 24, right5 + 24, left4, right4);
+  transpose_s32_8x8_2(left6, right6, left1 + 8, right1 + 8);
+  transpose_s32_8x8_2(left6 + 8, right6 + 8, left2 + 8, right2 + 8);
+  transpose_s32_8x8_2(left6 + 16, right6 + 16, left3 + 8, right3 + 8);
+  transpose_s32_8x8_2(left6 + 24, right6 + 24, left4 + 8, right4 + 8);
+  transpose_s32_8x8_2(left7, right7, left1 + 16, right1 + 16);
+  transpose_s32_8x8_2(left7 + 8, right7 + 8, left2 + 16, right2 + 16);
+  transpose_s32_8x8_2(left7 + 16, right7 + 16, left3 + 16, right3 + 16);
+  transpose_s32_8x8_2(left7 + 24, right7 + 24, left4 + 16, right4 + 16);
+  transpose_s32_8x8_2(left8, right8, left1 + 24, right1 + 24);
+  transpose_s32_8x8_2(left8 + 8, right8 + 8, left2 + 24, right2 + 24);
+  transpose_s32_8x8_2(left8 + 16, right8 + 16, left3 + 24, right3 + 24);
+  transpose_s32_8x8_2(left8 + 24, right8 + 24, left4 + 24, right4 + 24);
+
+  store32x32_s32(output, left1, right1, left2, right2, left3, right3, left4,
+                 right4);
+}
+
+void vpx_highbd_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
+                                  int stride) {
+  int16x8_t temp0[32];
+  int32x4_t left1[32], left2[32], left3[32], left4[32], right1[32], right2[32],
+      right3[32], right4[32];
+  int32x4_t left5[32], right5[32], left6[32], right6[32], left7[32], right7[32],
+      left8[32], right8[32];
+  int32x4_t temp1[32], temp2[32];
+
+  // Process in 8x32 columns.
+  load_cross(input, stride, temp0);
+  highbd_scale_input(temp0, left1, right1);
+  highbd_dct8x32_body_first_pass(left1, right1);
+  highbd_partial_sub_round_shift(left1, right1);
+
+  load_cross(input + 8, stride, temp0);
+  highbd_scale_input(temp0, left2, right2);
+  highbd_dct8x32_body_first_pass(left2, right2);
+  highbd_partial_sub_round_shift(left2, right2);
+
+  load_cross(input + 16, stride, temp0);
+  highbd_scale_input(temp0, left3, right3);
+  highbd_dct8x32_body_first_pass(left3, right3);
+  highbd_partial_sub_round_shift(left3, right3);
+
+  load_cross(input + 24, stride, temp0);
+  highbd_scale_input(temp0, left4, right4);
+  highbd_dct8x32_body_first_pass(left4, right4);
+  highbd_partial_sub_round_shift(left4, right4);
+
+  // Generate the top row by munging the first set of 8 from each one together.
+  transpose_s32_8x8_2(left1, right1, temp1, temp2);
+  transpose_s32_8x8_2(left2, right2, temp1 + 8, temp2 + 8);
+  transpose_s32_8x8_2(left3, right3, temp1 + 16, temp2 + 16);
+  transpose_s32_8x8_2(left4, right4, temp1 + 24, temp2 + 24);
+
+  highbd_cross_input(temp1, temp2, left5, right5);
+  highbd_dct8x32_body_second_pass_rd(left5, right5);
+
+  // Second row of 8x32.
+  transpose_s32_8x8_2(left1 + 8, right1 + 8, temp1, temp2);
+  transpose_s32_8x8_2(left2 + 8, right2 + 8, temp1 + 8, temp2 + 8);
+  transpose_s32_8x8_2(left3 + 8, right3 + 8, temp1 + 16, temp2 + 16);
+  transpose_s32_8x8_2(left4 + 8, right4 + 8, temp1 + 24, temp2 + 24);
+
+  highbd_cross_input(temp1, temp2, left6, right6);
+  highbd_dct8x32_body_second_pass_rd(left6, right6);
+
+  // Third row of 8x32
+  transpose_s32_8x8_2(left1 + 16, right1 + 16, temp1, temp2);
+  transpose_s32_8x8_2(left2 + 16, right2 + 16, temp1 + 8, temp2 + 8);
+  transpose_s32_8x8_2(left3 + 16, right3 + 16, temp1 + 16, temp2 + 16);
+  transpose_s32_8x8_2(left4 + 16, right4 + 16, temp1 + 24, temp2 + 24);
+
+  highbd_cross_input(temp1, temp2, left7, right7);
+  highbd_dct8x32_body_second_pass_rd(left7, right7);
+
+  // Final row of 8x32.
+  transpose_s32_8x8_2(left1 + 24, right1 + 24, temp1, temp2);
+  transpose_s32_8x8_2(left2 + 24, right2 + 24, temp1 + 8, temp2 + 8);
+  transpose_s32_8x8_2(left3 + 24, right3 + 24, temp1 + 16, temp2 + 16);
+  transpose_s32_8x8_2(left4 + 24, right4 + 24, temp1 + 24, temp2 + 24);
+
+  highbd_cross_input(temp1, temp2, left8, right8);
+  highbd_dct8x32_body_second_pass_rd(left8, right8);
+
+  // Final transpose
+  transpose_s32_8x8_2(left5, right5, left1, right1);
+  transpose_s32_8x8_2(left5 + 8, right5 + 8, left2, right2);
+  transpose_s32_8x8_2(left5 + 16, right5 + 16, left3, right3);
+  transpose_s32_8x8_2(left5 + 24, right5 + 24, left4, right4);
+  transpose_s32_8x8_2(left6, right6, left1 + 8, right1 + 8);
+  transpose_s32_8x8_2(left6 + 8, right6 + 8, left2 + 8, right2 + 8);
+  transpose_s32_8x8_2(left6 + 16, right6 + 16, left3 + 8, right3 + 8);
+  transpose_s32_8x8_2(left6 + 24, right6 + 24, left4 + 8, right4 + 8);
+  transpose_s32_8x8_2(left7, right7, left1 + 16, right1 + 16);
+  transpose_s32_8x8_2(left7 + 8, right7 + 8, left2 + 16, right2 + 16);
+  transpose_s32_8x8_2(left7 + 16, right7 + 16, left3 + 16, right3 + 16);
+  transpose_s32_8x8_2(left7 + 24, right7 + 24, left4 + 16, right4 + 16);
+  transpose_s32_8x8_2(left8, right8, left1 + 24, right1 + 24);
+  transpose_s32_8x8_2(left8 + 8, right8 + 8, left2 + 24, right2 + 24);
+  transpose_s32_8x8_2(left8 + 16, right8 + 16, left3 + 24, right3 + 24);
+  transpose_s32_8x8_2(left8 + 24, right8 + 24, left4 + 24, right4 + 24);
+
+  store32x32_s32(output, left1, right1, left2, right2, left3, right3, left4,
+                 right4);
+}
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 #endif  // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) &&
         // __GNUC__ == 4 && __GNUC_MINOR__ <= 9
diff --git a/vpx_dsp/arm/fdct32x32_neon.h b/vpx_dsp/arm/fdct32x32_neon.h
new file mode 100644
index 0000000000..3b9e64c6df
--- /dev/null
+++ b/vpx_dsp/arm/fdct32x32_neon.h
@@ -0,0 +1,2919 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
+
+// Load & cross the first 8 and last 8, then the middle
+static INLINE void load_cross(const int16_t *a, int stride, int16x8_t *b) {
+  b[0] = vaddq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride));
+  b[1] = vaddq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride));
+  b[2] = vaddq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride));
+  b[3] = vaddq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride));
+  b[4] = vaddq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride));
+  b[5] = vaddq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride));
+  b[6] = vaddq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride));
+  b[7] = vaddq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride));
+
+  b[24] = vsubq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride));
+  b[25] = vsubq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride));
+  b[26] = vsubq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride));
+  b[27] = vsubq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride));
+  b[28] = vsubq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride));
+  b[29] = vsubq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride));
+  b[30] = vsubq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride));
+  b[31] = vsubq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride));
+
+  b[8] = vaddq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride));
+  b[9] = vaddq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride));
+  b[10] = vaddq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride));
+  b[11] = vaddq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride));
+  b[12] = vaddq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride));
+  b[13] = vaddq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride));
+  b[14] = vaddq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride));
+  b[15] = vaddq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride));
+
+  b[16] = vsubq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride));
+  b[17] = vsubq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride));
+  b[18] = vsubq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride));
+  b[19] = vsubq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride));
+  b[20] = vsubq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride));
+  b[21] = vsubq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride));
+  b[22] = vsubq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride));
+  b[23] = vsubq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride));
+}
+
+#define STORE_S16(src, index, dest)           \
+  do {                                        \
+    store_s16q_to_tran_low(dest, src[index]); \
+    dest += 8;                                \
+  } while (0)
+
+// Store 32 16x8 values, assuming stride == 32.
+// Slight twist: store horizontally in blocks of 8.
+static INLINE void store(tran_low_t *a, const int16x8_t *b) {
+  STORE_S16(b, 0, a);
+  STORE_S16(b, 8, a);
+  STORE_S16(b, 16, a);
+  STORE_S16(b, 24, a);
+  STORE_S16(b, 1, a);
+  STORE_S16(b, 9, a);
+  STORE_S16(b, 17, a);
+  STORE_S16(b, 25, a);
+  STORE_S16(b, 2, a);
+  STORE_S16(b, 10, a);
+  STORE_S16(b, 18, a);
+  STORE_S16(b, 26, a);
+  STORE_S16(b, 3, a);
+  STORE_S16(b, 11, a);
+  STORE_S16(b, 19, a);
+  STORE_S16(b, 27, a);
+  STORE_S16(b, 4, a);
+  STORE_S16(b, 12, a);
+  STORE_S16(b, 20, a);
+  STORE_S16(b, 28, a);
+  STORE_S16(b, 5, a);
+  STORE_S16(b, 13, a);
+  STORE_S16(b, 21, a);
+  STORE_S16(b, 29, a);
+  STORE_S16(b, 6, a);
+  STORE_S16(b, 14, a);
+  STORE_S16(b, 22, a);
+  STORE_S16(b, 30, a);
+  STORE_S16(b, 7, a);
+  STORE_S16(b, 15, a);
+  STORE_S16(b, 23, a);
+  STORE_S16(b, 31, a);
+}
+
+#undef STORE_S16
+
+static INLINE void scale_input(const int16x8_t *in /*32*/,
+                               int16x8_t *out /*32*/) {
+  out[0] = vshlq_n_s16(in[0], 2);
+  out[1] = vshlq_n_s16(in[1], 2);
+  out[2] = vshlq_n_s16(in[2], 2);
+  out[3] = vshlq_n_s16(in[3], 2);
+  out[4] = vshlq_n_s16(in[4], 2);
+  out[5] = vshlq_n_s16(in[5], 2);
+  out[6] = vshlq_n_s16(in[6], 2);
+  out[7] = vshlq_n_s16(in[7], 2);
+
+  out[8] = vshlq_n_s16(in[8], 2);
+  out[9] = vshlq_n_s16(in[9], 2);
+  out[10] = vshlq_n_s16(in[10], 2);
+  out[11] = vshlq_n_s16(in[11], 2);
+  out[12] = vshlq_n_s16(in[12], 2);
+  out[13] = vshlq_n_s16(in[13], 2);
+  out[14] = vshlq_n_s16(in[14], 2);
+  out[15] = vshlq_n_s16(in[15], 2);
+
+  out[16] = vshlq_n_s16(in[16], 2);
+  out[17] = vshlq_n_s16(in[17], 2);
+  out[18] = vshlq_n_s16(in[18], 2);
+  out[19] = vshlq_n_s16(in[19], 2);
+  out[20] = vshlq_n_s16(in[20], 2);
+  out[21] = vshlq_n_s16(in[21], 2);
+  out[22] = vshlq_n_s16(in[22], 2);
+  out[23] = vshlq_n_s16(in[23], 2);
+
+  out[24] = vshlq_n_s16(in[24], 2);
+  out[25] = vshlq_n_s16(in[25], 2);
+  out[26] = vshlq_n_s16(in[26], 2);
+  out[27] = vshlq_n_s16(in[27], 2);
+  out[28] = vshlq_n_s16(in[28], 2);
+  out[29] = vshlq_n_s16(in[29], 2);
+  out[30] = vshlq_n_s16(in[30], 2);
+  out[31] = vshlq_n_s16(in[31], 2);
+}
+
+static INLINE void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) {
+  int16x8_t a[32];
+  int16x8_t b[32];
+
+  // Stage 1: Done as part of the load.
+
+  // Stage 2.
+  // Mini cross. X the first 16 values and the middle 8 of the second half.
+  a[0] = vaddq_s16(in[0], in[15]);
+  a[1] = vaddq_s16(in[1], in[14]);
+  a[2] = vaddq_s16(in[2], in[13]);
+  a[3] = vaddq_s16(in[3], in[12]);
+  a[4] = vaddq_s16(in[4], in[11]);
+  a[5] = vaddq_s16(in[5], in[10]);
+  a[6] = vaddq_s16(in[6], in[9]);
+  a[7] = vaddq_s16(in[7], in[8]);
+
+  a[8] = vsubq_s16(in[7], in[8]);
+  a[9] = vsubq_s16(in[6], in[9]);
+  a[10] = vsubq_s16(in[5], in[10]);
+  a[11] = vsubq_s16(in[4], in[11]);
+  a[12] = vsubq_s16(in[3], in[12]);
+  a[13] = vsubq_s16(in[2], in[13]);
+  a[14] = vsubq_s16(in[1], in[14]);
+  a[15] = vsubq_s16(in[0], in[15]);
+
+  a[16] = in[16];
+  a[17] = in[17];
+  a[18] = in[18];
+  a[19] = in[19];
+
+  butterfly_one_coeff_s16_s32_narrow(in[27], in[20], cospi_16_64, &a[27],
+                                     &a[20]);
+  butterfly_one_coeff_s16_s32_narrow(in[26], in[21], cospi_16_64, &a[26],
+                                     &a[21]);
+  butterfly_one_coeff_s16_s32_narrow(in[25], in[22], cospi_16_64, &a[25],
+                                     &a[22]);
+  butterfly_one_coeff_s16_s32_narrow(in[24], in[23], cospi_16_64, &a[24],
+                                     &a[23]);
+
+  a[28] = in[28];
+  a[29] = in[29];
+  a[30] = in[30];
+  a[31] = in[31];
+
+  // Stage 3.
+  b[0] = vaddq_s16(a[0], a[7]);
+  b[1] = vaddq_s16(a[1], a[6]);
+  b[2] = vaddq_s16(a[2], a[5]);
+  b[3] = vaddq_s16(a[3], a[4]);
+
+  b[4] = vsubq_s16(a[3], a[4]);
+  b[5] = vsubq_s16(a[2], a[5]);
+  b[6] = vsubq_s16(a[1], a[6]);
+  b[7] = vsubq_s16(a[0], a[7]);
+
+  b[8] = a[8];
+  b[9] = a[9];
+
+  butterfly_one_coeff_s16_s32_narrow(a[13], a[10], cospi_16_64, &b[13], &b[10]);
+  butterfly_one_coeff_s16_s32_narrow(a[12], a[11], cospi_16_64, &b[12], &b[11]);
+
+  b[14] = a[14];
+  b[15] = a[15];
+
+  b[16] = vaddq_s16(in[16], a[23]);
+  b[17] = vaddq_s16(in[17], a[22]);
+  b[18] = vaddq_s16(in[18], a[21]);
+  b[19] = vaddq_s16(in[19], a[20]);
+
+  b[20] = vsubq_s16(in[19], a[20]);
+  b[21] = vsubq_s16(in[18], a[21]);
+  b[22] = vsubq_s16(in[17], a[22]);
+  b[23] = vsubq_s16(in[16], a[23]);
+
+  b[24] = vsubq_s16(in[31], a[24]);
+  b[25] = vsubq_s16(in[30], a[25]);
+  b[26] = vsubq_s16(in[29], a[26]);
+  b[27] = vsubq_s16(in[28], a[27]);
+
+  b[28] = vaddq_s16(in[28], a[27]);
+  b[29] = vaddq_s16(in[29], a[26]);
+  b[30] = vaddq_s16(in[30], a[25]);
+  b[31] = vaddq_s16(in[31], a[24]);
+
+  // Stage 4.
+  a[0] = vaddq_s16(b[0], b[3]);
+  a[1] = vaddq_s16(b[1], b[2]);
+  a[2] = vsubq_s16(b[1], b[2]);
+  a[3] = vsubq_s16(b[0], b[3]);
+
+  a[4] = b[4];
+
+  butterfly_one_coeff_s16_s32_narrow(b[6], b[5], cospi_16_64, &a[6], &a[5]);
+
+  a[7] = b[7];
+
+  a[8] = vaddq_s16(b[8], b[11]);
+  a[9] = vaddq_s16(b[9], b[10]);
+  a[10] = vsubq_s16(b[9], b[10]);
+  a[11] = vsubq_s16(b[8], b[11]);
+  a[12] = vsubq_s16(b[15], b[12]);
+  a[13] = vsubq_s16(b[14], b[13]);
+  a[14] = vaddq_s16(b[14], b[13]);
+  a[15] = vaddq_s16(b[15], b[12]);
+
+  a[16] = b[16];
+  a[17] = b[17];
+
+  butterfly_two_coeff(b[29], b[18], cospi_8_64, cospi_24_64, &a[29], &a[18]);
+  butterfly_two_coeff(b[28], b[19], cospi_8_64, cospi_24_64, &a[28], &a[19]);
+  butterfly_two_coeff(b[27], b[20], cospi_24_64, -cospi_8_64, &a[27], &a[20]);
+  butterfly_two_coeff(b[26], b[21], cospi_24_64, -cospi_8_64, &a[26], &a[21]);
+
+  a[22] = b[22];
+  a[23] = b[23];
+  a[24] = b[24];
+  a[25] = b[25];
+
+  a[30] = b[30];
+  a[31] = b[31];
+
+  // Stage 5.
+  butterfly_one_coeff_s16_fast(a[0], a[1], cospi_16_64, &b[0], &b[1]);
+  butterfly_two_coeff(a[3], a[2], cospi_8_64, cospi_24_64, &b[2], &b[3]);
+
+  b[4] = vaddq_s16(a[4], a[5]);
+  b[5] = vsubq_s16(a[4], a[5]);
+  b[6] = vsubq_s16(a[7], a[6]);
+  b[7] = vaddq_s16(a[7], a[6]);
+
+  b[8] = a[8];
+
+  butterfly_two_coeff(a[14], a[9], cospi_8_64, cospi_24_64, &b[14], &b[9]);
+  butterfly_two_coeff(a[13], a[10], cospi_24_64, -cospi_8_64, &b[13], &b[10]);
+
+  b[11] = a[11];
+  b[12] = a[12];
+
+  b[15] = a[15];
+
+  b[16] = vaddq_s16(a[19], a[16]);
+  b[17] = vaddq_s16(a[18], a[17]);
+  b[18] = vsubq_s16(a[17], a[18]);
+  b[19] = vsubq_s16(a[16], a[19]);
+  b[20] = vsubq_s16(a[23], a[20]);
+  b[21] = vsubq_s16(a[22], a[21]);
+  b[22] = vaddq_s16(a[21], a[22]);
+  b[23] = vaddq_s16(a[20], a[23]);
+  b[24] = vaddq_s16(a[27], a[24]);
+  b[25] = vaddq_s16(a[26], a[25]);
+  b[26] = vsubq_s16(a[25], a[26]);
+  b[27] = vsubq_s16(a[24], a[27]);
+  b[28] = vsubq_s16(a[31], a[28]);
+  b[29] = vsubq_s16(a[30], a[29]);
+  b[30] = vaddq_s16(a[29], a[30]);
+  b[31] = vaddq_s16(a[28], a[31]);
+
+  // Stage 6.
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+
+  butterfly_two_coeff(b[7], b[4], cospi_4_64, cospi_28_64, &a[4], &a[7]);
+  butterfly_two_coeff(b[6], b[5], cospi_20_64, cospi_12_64, &a[5], &a[6]);
+
+  a[8] = vaddq_s16(b[8], b[9]);
+  a[9] = vsubq_s16(b[8], b[9]);
+  a[10] = vsubq_s16(b[11], b[10]);
+  a[11] = vaddq_s16(b[11], b[10]);
+  a[12] = vaddq_s16(b[12], b[13]);
+  a[13] = vsubq_s16(b[12], b[13]);
+  a[14] = vsubq_s16(b[15], b[14]);
+  a[15] = vaddq_s16(b[15], b[14]);
+
+  a[16] = b[16];
+  a[19] = b[19];
+  a[20] = b[20];
+  a[23] = b[23];
+  a[24] = b[24];
+  a[27] = b[27];
+  a[28] = b[28];
+  a[31] = b[31];
+
+  butterfly_two_coeff(b[30], b[17], cospi_4_64, cospi_28_64, &a[30], &a[17]);
+  butterfly_two_coeff(b[29], b[18], cospi_28_64, -cospi_4_64, &a[29], &a[18]);
+
+  butterfly_two_coeff(b[26], b[21], cospi_20_64, cospi_12_64, &a[26], &a[21]);
+  butterfly_two_coeff(b[25], b[22], cospi_12_64, -cospi_20_64, &a[25], &a[22]);
+
+  // Stage 7.
+  b[0] = a[0];
+  b[1] = a[1];
+  b[2] = a[2];
+  b[3] = a[3];
+  b[4] = a[4];
+  b[5] = a[5];
+  b[6] = a[6];
+  b[7] = a[7];
+
+  butterfly_two_coeff(a[15], a[8], cospi_2_64, cospi_30_64, &b[8], &b[15]);
+  butterfly_two_coeff(a[14], a[9], cospi_18_64, cospi_14_64, &b[9], &b[14]);
+  butterfly_two_coeff(a[13], a[10], cospi_10_64, cospi_22_64, &b[10], &b[13]);
+  butterfly_two_coeff(a[12], a[11], cospi_26_64, cospi_6_64, &b[11], &b[12]);
+
+  b[16] = vaddq_s16(a[16], a[17]);
+  b[17] = vsubq_s16(a[16], a[17]);
+  b[18] = vsubq_s16(a[19], a[18]);
+  b[19] = vaddq_s16(a[19], a[18]);
+  b[20] = vaddq_s16(a[20], a[21]);
+  b[21] = vsubq_s16(a[20], a[21]);
+  b[22] = vsubq_s16(a[23], a[22]);
+  b[23] = vaddq_s16(a[23], a[22]);
+  b[24] = vaddq_s16(a[24], a[25]);
+  b[25] = vsubq_s16(a[24], a[25]);
+  b[26] = vsubq_s16(a[27], a[26]);
+  b[27] = vaddq_s16(a[27], a[26]);
+  b[28] = vaddq_s16(a[28], a[29]);
+  b[29] = vsubq_s16(a[28], a[29]);
+  b[30] = vsubq_s16(a[31], a[30]);
+  b[31] = vaddq_s16(a[31], a[30]);
+
+  // Final stage.
+  // Also compute partial rounding shift:
+  // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+  out[0] = sub_round_shift_s16(b[0]);
+  out[16] = sub_round_shift_s16(b[1]);
+  out[8] = sub_round_shift_s16(b[2]);
+  out[24] = sub_round_shift_s16(b[3]);
+  out[4] = sub_round_shift_s16(b[4]);
+  out[20] = sub_round_shift_s16(b[5]);
+  out[12] = sub_round_shift_s16(b[6]);
+  out[28] = sub_round_shift_s16(b[7]);
+  out[2] = sub_round_shift_s16(b[8]);
+  out[18] = sub_round_shift_s16(b[9]);
+  out[10] = sub_round_shift_s16(b[10]);
+  out[26] = sub_round_shift_s16(b[11]);
+  out[6] = sub_round_shift_s16(b[12]);
+  out[22] = sub_round_shift_s16(b[13]);
+  out[14] = sub_round_shift_s16(b[14]);
+  out[30] = sub_round_shift_s16(b[15]);
+
+  butterfly_two_coeff(b[31], b[16], cospi_1_64, cospi_31_64, &a[1], &a[31]);
+  out[1] = sub_round_shift_s16(a[1]);
+  out[31] = sub_round_shift_s16(a[31]);
+
+  butterfly_two_coeff(b[30], b[17], cospi_17_64, cospi_15_64, &a[17], &a[15]);
+  out[17] = sub_round_shift_s16(a[17]);
+  out[15] = sub_round_shift_s16(a[15]);
+
+  butterfly_two_coeff(b[29], b[18], cospi_9_64, cospi_23_64, &a[9], &a[23]);
+  out[9] = sub_round_shift_s16(a[9]);
+  out[23] = sub_round_shift_s16(a[23]);
+
+  butterfly_two_coeff(b[28], b[19], cospi_25_64, cospi_7_64, &a[25], &a[7]);
+  out[25] = sub_round_shift_s16(a[25]);
+  out[7] = sub_round_shift_s16(a[7]);
+
+  butterfly_two_coeff(b[27], b[20], cospi_5_64, cospi_27_64, &a[5], &a[27]);
+  out[5] = sub_round_shift_s16(a[5]);
+  out[27] = sub_round_shift_s16(a[27]);
+
+  butterfly_two_coeff(b[26], b[21], cospi_21_64, cospi_11_64, &a[21], &a[11]);
+  out[21] = sub_round_shift_s16(a[21]);
+  out[11] = sub_round_shift_s16(a[11]);
+
+  butterfly_two_coeff(b[25], b[22], cospi_13_64, cospi_19_64, &a[13], &a[19]);
+  out[13] = sub_round_shift_s16(a[13]);
+  out[19] = sub_round_shift_s16(a[19]);
+
+  butterfly_two_coeff(b[24], b[23], cospi_29_64, cospi_3_64, &a[29], &a[3]);
+  out[29] = sub_round_shift_s16(a[29]);
+  out[3] = sub_round_shift_s16(a[3]);
+}
+
+#define PASS_THROUGH(src, dst, element)    \
+  do {                                     \
+    dst##_lo[element] = src##_lo[element]; \
+    dst##_hi[element] = src##_hi[element]; \
+  } while (0)
+
+#define ADD_S16_S32(a, left_index, right_index, b, b_index)                   \
+  do {                                                                        \
+    b##_lo[b_index] =                                                         \
+        vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
+    b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]),                 \
+                                vget_high_s16(a[right_index]));               \
+  } while (0)
+
+#define SUB_S16_S32(a, left_index, right_index, b, b_index)                   \
+  do {                                                                        \
+    b##_lo[b_index] =                                                         \
+        vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
+    b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]),                 \
+                                vget_high_s16(a[right_index]));               \
+  } while (0)
+
+#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index)                     \
+  do {                                                                       \
+    c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index]));  \
+    c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \
+  } while (0)
+
+#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \
+  do {                                                                     \
+    temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index]));           \
+    temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index]));          \
+    c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]);   \
+    c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]);   \
+  } while (0)
+
+#define ADD_S32(a, left_index, right_index, b, b_index)                   \
+  do {                                                                    \
+    b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \
+    b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \
+  } while (0)
+
+#define SUB_S32(a, left_index, right_index, b, b_index)                   \
+  do {                                                                    \
+    b##_lo[b_index] = vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \
+    b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \
+  } while (0)
+
+#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b,   \
+                              add_index, sub_index)                      \
+  do {                                                                   \
+    butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \
+                                &b##_lo[add_index], &b##_hi[add_index],  \
+                                &b##_lo[sub_index], &b##_hi[sub_index]); \
+  } while (0)
+
+#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index,  \
+                          sub_index)                                           \
+  do {                                                                         \
+    butterfly_one_coeff_s32_fast(                                              \
+        a##_lo[left_index], a##_hi[left_index], a##_lo[right_index],           \
+        a##_hi[right_index], constant, &b##_lo[add_index], &b##_hi[add_index], \
+        &b##_lo[sub_index], &b##_hi[sub_index]);                               \
+  } while (0)
+
+#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant,           \
+                          right_constant, b, add_index, sub_index)             \
+  do {                                                                         \
+    butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index],            \
+                            a##_lo[right_index], a##_hi[right_index],          \
+                            left_constant, right_constant, &b##_lo[add_index], \
+                            &b##_hi[add_index], &b##_lo[sub_index],            \
+                            &b##_hi[sub_index]);                               \
+  } while (0)
+
+static INLINE void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) {
+  int16x8_t a[32];
+  int16x8_t b[32];
+  int32x4_t c_lo[32];
+  int32x4_t c_hi[32];
+  int32x4_t d_lo[32];
+  int32x4_t d_hi[32];
+
+  // Stage 1. Done as part of the load for the first pass.
+  a[0] = vaddq_s16(in[0], in[31]);
+  a[1] = vaddq_s16(in[1], in[30]);
+  a[2] = vaddq_s16(in[2], in[29]);
+  a[3] = vaddq_s16(in[3], in[28]);
+  a[4] = vaddq_s16(in[4], in[27]);
+  a[5] = vaddq_s16(in[5], in[26]);
+  a[6] = vaddq_s16(in[6], in[25]);
+  a[7] = vaddq_s16(in[7], in[24]);
+  a[8] = vaddq_s16(in[8], in[23]);
+  a[9] = vaddq_s16(in[9], in[22]);
+  a[10] = vaddq_s16(in[10], in[21]);
+  a[11] = vaddq_s16(in[11], in[20]);
+  a[12] = vaddq_s16(in[12], in[19]);
+  a[13] = vaddq_s16(in[13], in[18]);
+  a[14] = vaddq_s16(in[14], in[17]);
+  a[15] = vaddq_s16(in[15], in[16]);
+  a[16] = vsubq_s16(in[15], in[16]);
+  a[17] = vsubq_s16(in[14], in[17]);
+  a[18] = vsubq_s16(in[13], in[18]);
+  a[19] = vsubq_s16(in[12], in[19]);
+  a[20] = vsubq_s16(in[11], in[20]);
+  a[21] = vsubq_s16(in[10], in[21]);
+  a[22] = vsubq_s16(in[9], in[22]);
+  a[23] = vsubq_s16(in[8], in[23]);
+  a[24] = vsubq_s16(in[7], in[24]);
+  a[25] = vsubq_s16(in[6], in[25]);
+  a[26] = vsubq_s16(in[5], in[26]);
+  a[27] = vsubq_s16(in[4], in[27]);
+  a[28] = vsubq_s16(in[3], in[28]);
+  a[29] = vsubq_s16(in[2], in[29]);
+  a[30] = vsubq_s16(in[1], in[30]);
+  a[31] = vsubq_s16(in[0], in[31]);
+
+  // Stage 2.
+  b[0] = vaddq_s16(a[0], a[15]);
+  b[1] = vaddq_s16(a[1], a[14]);
+  b[2] = vaddq_s16(a[2], a[13]);
+  b[3] = vaddq_s16(a[3], a[12]);
+  b[4] = vaddq_s16(a[4], a[11]);
+  b[5] = vaddq_s16(a[5], a[10]);
+  b[6] = vaddq_s16(a[6], a[9]);
+  b[7] = vaddq_s16(a[7], a[8]);
+
+  b[8] = vsubq_s16(a[7], a[8]);
+  b[9] = vsubq_s16(a[6], a[9]);
+  b[10] = vsubq_s16(a[5], a[10]);
+  b[11] = vsubq_s16(a[4], a[11]);
+  b[12] = vsubq_s16(a[3], a[12]);
+  b[13] = vsubq_s16(a[2], a[13]);
+  b[14] = vsubq_s16(a[1], a[14]);
+  b[15] = vsubq_s16(a[0], a[15]);
+
+  b[16] = a[16];
+  b[17] = a[17];
+  b[18] = a[18];
+  b[19] = a[19];
+
+  butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]);
+  butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]);
+  butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]);
+  butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]);
+
+  b[28] = a[28];
+  b[29] = a[29];
+  b[30] = a[30];
+  b[31] = a[31];
+
+  // Stage 3. With extreme values for input this calculation rolls over int16_t.
+  // The sources for b[0] get added multiple times and, through testing, have
+  // been shown to overflow starting here.
+  ADD_S16_S32(b, 0, 7, c, 0);
+  ADD_S16_S32(b, 1, 6, c, 1);
+  ADD_S16_S32(b, 2, 5, c, 2);
+  ADD_S16_S32(b, 3, 4, c, 3);
+  SUB_S16_S32(b, 3, 4, c, 4);
+  SUB_S16_S32(b, 2, 5, c, 5);
+  SUB_S16_S32(b, 1, 6, c, 6);
+  SUB_S16_S32(b, 0, 7, c, 7);
+
+  a[8] = b[8];
+  a[9] = b[9];
+
+  BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10);
+  BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11);
+
+  a[14] = b[14];
+  a[15] = b[15];
+
+  ADD_S16_S32(b, 16, 23, c, 16);
+  ADD_S16_S32(b, 17, 22, c, 17);
+  ADD_S16_S32(b, 18, 21, c, 18);
+  ADD_S16_S32(b, 19, 20, c, 19);
+  SUB_S16_S32(b, 19, 20, c, 20);
+  SUB_S16_S32(b, 18, 21, c, 21);
+  SUB_S16_S32(b, 17, 22, c, 22);
+  SUB_S16_S32(b, 16, 23, c, 23);
+  SUB_S16_S32(b, 31, 24, c, 24);
+  SUB_S16_S32(b, 30, 25, c, 25);
+  SUB_S16_S32(b, 29, 26, c, 26);
+  SUB_S16_S32(b, 28, 27, c, 27);
+  ADD_S16_S32(b, 28, 27, c, 28);
+  ADD_S16_S32(b, 29, 26, c, 29);
+  ADD_S16_S32(b, 30, 25, c, 30);
+  ADD_S16_S32(b, 31, 24, c, 31);
+
+  // Stage 4.
+  ADD_S32(c, 0, 3, d, 0);
+  ADD_S32(c, 1, 2, d, 1);
+  SUB_S32(c, 1, 2, d, 2);
+  SUB_S32(c, 0, 3, d, 3);
+
+  PASS_THROUGH(c, d, 4);
+
+  BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5);
+
+  PASS_THROUGH(c, d, 7);
+
+  ADDW_S16_S32(c, 11, a, 8, d, 8);
+  ADDW_S16_S32(c, 10, a, 9, d, 9);
+  SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10);
+  SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11);
+  SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12);
+  SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13);
+  ADDW_S16_S32(c, 13, b, 14, d, 14);
+  ADDW_S16_S32(c, 12, b, 15, d, 15);
+
+  PASS_THROUGH(c, d, 16);
+  PASS_THROUGH(c, d, 17);
+
+  BUTTERFLY_TWO_S32(c, 29, 18, cospi_8_64, cospi_24_64, d, 29, 18);
+  BUTTERFLY_TWO_S32(c, 28, 19, cospi_8_64, cospi_24_64, d, 28, 19);
+  BUTTERFLY_TWO_S32(c, 27, 20, cospi_24_64, -cospi_8_64, d, 27, 20);
+  BUTTERFLY_TWO_S32(c, 26, 21, cospi_24_64, -cospi_8_64, d, 26, 21);
+
+  PASS_THROUGH(c, d, 22);
+  PASS_THROUGH(c, d, 23);
+  PASS_THROUGH(c, d, 24);
+  PASS_THROUGH(c, d, 25);
+
+  PASS_THROUGH(c, d, 30);
+  PASS_THROUGH(c, d, 31);
+
+  // Stage 5.
+  BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1);
+  BUTTERFLY_TWO_S32(d, 3, 2, cospi_8_64, cospi_24_64, c, 2, 3);
+
+  ADD_S32(d, 4, 5, c, 4);
+  SUB_S32(d, 4, 5, c, 5);
+  SUB_S32(d, 7, 6, c, 6);
+  ADD_S32(d, 7, 6, c, 7);
+
+  PASS_THROUGH(d, c, 8);
+
+  BUTTERFLY_TWO_S32(d, 14, 9, cospi_8_64, cospi_24_64, c, 14, 9);
+  BUTTERFLY_TWO_S32(d, 13, 10, cospi_24_64, -cospi_8_64, c, 13, 10);
+
+  PASS_THROUGH(d, c, 11);
+  PASS_THROUGH(d, c, 12);
+  PASS_THROUGH(d, c, 15);
+
+  ADD_S32(d, 16, 19, c, 16);
+  ADD_S32(d, 17, 18, c, 17);
+  SUB_S32(d, 17, 18, c, 18);
+  SUB_S32(d, 16, 19, c, 19);
+  SUB_S32(d, 23, 20, c, 20);
+  SUB_S32(d, 22, 21, c, 21);
+  ADD_S32(d, 22, 21, c, 22);
+  ADD_S32(d, 23, 20, c, 23);
+  ADD_S32(d, 24, 27, c, 24);
+  ADD_S32(d, 25, 26, c, 25);
+  SUB_S32(d, 25, 26, c, 26);
+  SUB_S32(d, 24, 27, c, 27);
+  SUB_S32(d, 31, 28, c, 28);
+  SUB_S32(d, 30, 29, c, 29);
+  ADD_S32(d, 30, 29, c, 30);
+  ADD_S32(d, 31, 28, c, 31);
+
+  // Stage 6.
+  PASS_THROUGH(c, d, 0);
+  PASS_THROUGH(c, d, 1);
+  PASS_THROUGH(c, d, 2);
+  PASS_THROUGH(c, d, 3);
+
+  BUTTERFLY_TWO_S32(c, 7, 4, cospi_4_64, cospi_28_64, d, 4, 7);
+  BUTTERFLY_TWO_S32(c, 6, 5, cospi_20_64, cospi_12_64, d, 5, 6);
+
+  ADD_S32(c, 8, 9, d, 8);
+  SUB_S32(c, 8, 9, d, 9);
+  SUB_S32(c, 11, 10, d, 10);
+  ADD_S32(c, 11, 10, d, 11);
+  ADD_S32(c, 12, 13, d, 12);
+  SUB_S32(c, 12, 13, d, 13);
+  SUB_S32(c, 15, 14, d, 14);
+  ADD_S32(c, 15, 14, d, 15);
+
+  PASS_THROUGH(c, d, 16);
+  PASS_THROUGH(c, d, 19);
+  PASS_THROUGH(c, d, 20);
+  PASS_THROUGH(c, d, 23);
+  PASS_THROUGH(c, d, 24);
+  PASS_THROUGH(c, d, 27);
+  PASS_THROUGH(c, d, 28);
+  PASS_THROUGH(c, d, 31);
+
+  BUTTERFLY_TWO_S32(c, 30, 17, cospi_4_64, cospi_28_64, d, 30, 17);
+  BUTTERFLY_TWO_S32(c, 29, 18, cospi_28_64, -cospi_4_64, d, 29, 18);
+  BUTTERFLY_TWO_S32(c, 26, 21, cospi_20_64, cospi_12_64, d, 26, 21);
+  BUTTERFLY_TWO_S32(c, 25, 22, cospi_12_64, -cospi_20_64, d, 25, 22);
+
+  // Stage 7.
+  PASS_THROUGH(d, c, 0);
+  PASS_THROUGH(d, c, 1);
+  PASS_THROUGH(d, c, 2);
+  PASS_THROUGH(d, c, 3);
+  PASS_THROUGH(d, c, 4);
+  PASS_THROUGH(d, c, 5);
+  PASS_THROUGH(d, c, 6);
+  PASS_THROUGH(d, c, 7);
+
+  BUTTERFLY_TWO_S32(d, 15, 8, cospi_2_64, cospi_30_64, c, 8, 15);
+  BUTTERFLY_TWO_S32(d, 14, 9, cospi_18_64, cospi_14_64, c, 9, 14);
+  BUTTERFLY_TWO_S32(d, 13, 10, cospi_10_64, cospi_22_64, c, 10, 13);
+  BUTTERFLY_TWO_S32(d, 12, 11, cospi_26_64, cospi_6_64, c, 11, 12);
+
+  ADD_S32(d, 16, 17, c, 16);
+  SUB_S32(d, 16, 17, c, 17);
+  SUB_S32(d, 19, 18, c, 18);
+  ADD_S32(d, 19, 18, c, 19);
+  ADD_S32(d, 20, 21, c, 20);
+  SUB_S32(d, 20, 21, c, 21);
+  SUB_S32(d, 23, 22, c, 22);
+  ADD_S32(d, 23, 22, c, 23);
+  ADD_S32(d, 24, 25, c, 24);
+  SUB_S32(d, 24, 25, c, 25);
+  SUB_S32(d, 27, 26, c, 26);
+  ADD_S32(d, 27, 26, c, 27);
+  ADD_S32(d, 28, 29, c, 28);
+  SUB_S32(d, 28, 29, c, 29);
+  SUB_S32(d, 31, 30, c, 30);
+  ADD_S32(d, 31, 30, c, 31);
+
+  // Final stage.
+  // Roll rounding into this function so we can pass back int16x8.
+
+  out[0] = add_round_shift_s32_narrow(c_lo[0], c_hi[0]);
+  out[16] = add_round_shift_s32_narrow(c_lo[1], c_hi[1]);
+
+  out[8] = add_round_shift_s32_narrow(c_lo[2], c_hi[2]);
+  out[24] = add_round_shift_s32_narrow(c_lo[3], c_hi[3]);
+  out[4] = add_round_shift_s32_narrow(c_lo[4], c_hi[4]);
+  out[20] = add_round_shift_s32_narrow(c_lo[5], c_hi[5]);
+  out[12] = add_round_shift_s32_narrow(c_lo[6], c_hi[6]);
+
+  out[28] = add_round_shift_s32_narrow(c_lo[7], c_hi[7]);
+  out[2] = add_round_shift_s32_narrow(c_lo[8], c_hi[8]);
+  out[18] = add_round_shift_s32_narrow(c_lo[9], c_hi[9]);
+  out[10] = add_round_shift_s32_narrow(c_lo[10], c_hi[10]);
+
+  out[26] = add_round_shift_s32_narrow(c_lo[11], c_hi[11]);
+  out[6] = add_round_shift_s32_narrow(c_lo[12], c_hi[12]);
+  out[22] = add_round_shift_s32_narrow(c_lo[13], c_hi[13]);
+  out[14] = add_round_shift_s32_narrow(c_lo[14], c_hi[14]);
+  out[30] = add_round_shift_s32_narrow(c_lo[15], c_hi[15]);
+
+  BUTTERFLY_TWO_S32(c, 31, 16, cospi_1_64, cospi_31_64, d, 1, 31);
+  out[1] = add_round_shift_s32_narrow(d_lo[1], d_hi[1]);
+  out[31] = add_round_shift_s32_narrow(d_lo[31], d_hi[31]);
+
+  BUTTERFLY_TWO_S32(c, 30, 17, cospi_17_64, cospi_15_64, d, 17, 15);
+  out[17] = add_round_shift_s32_narrow(d_lo[17], d_hi[17]);
+  out[15] = add_round_shift_s32_narrow(d_lo[15], d_hi[15]);
+
+  BUTTERFLY_TWO_S32(c, 29, 18, cospi_9_64, cospi_23_64, d, 9, 23);
+  out[9] = add_round_shift_s32_narrow(d_lo[9], d_hi[9]);
+  out[23] = add_round_shift_s32_narrow(d_lo[23], d_hi[23]);
+
+  BUTTERFLY_TWO_S32(c, 28, 19, cospi_25_64, cospi_7_64, d, 25, 7);
+  out[25] = add_round_shift_s32_narrow(d_lo[25], d_hi[25]);
+  out[7] = add_round_shift_s32_narrow(d_lo[7], d_hi[7]);
+
+  BUTTERFLY_TWO_S32(c, 27, 20, cospi_5_64, cospi_27_64, d, 5, 27);
+  out[5] = add_round_shift_s32_narrow(d_lo[5], d_hi[5]);
+  out[27] = add_round_shift_s32_narrow(d_lo[27], d_hi[27]);
+
+  BUTTERFLY_TWO_S32(c, 26, 21, cospi_21_64, cospi_11_64, d, 21, 11);
+  out[21] = add_round_shift_s32_narrow(d_lo[21], d_hi[21]);
+  out[11] = add_round_shift_s32_narrow(d_lo[11], d_hi[11]);
+
+  BUTTERFLY_TWO_S32(c, 25, 22, cospi_13_64, cospi_19_64, d, 13, 19);
+  out[13] = add_round_shift_s32_narrow(d_lo[13], d_hi[13]);
+  out[19] = add_round_shift_s32_narrow(d_lo[19], d_hi[19]);
+
+  BUTTERFLY_TWO_S32(c, 24, 23, cospi_29_64, cospi_3_64, d, 29, 3);
+  out[29] = add_round_shift_s32_narrow(d_lo[29], d_hi[29]);
+  out[3] = add_round_shift_s32_narrow(d_lo[3], d_hi[3]);
+}
+
+static INLINE void dct_body_second_pass_rd(const int16x8_t *in,
+                                           int16x8_t *out) {
+  int16x8_t a[32];
+  int16x8_t b[32];
+
+  // Stage 1. Done as part of the load for the first pass.
+  a[0] = vaddq_s16(in[0], in[31]);
+  a[1] = vaddq_s16(in[1], in[30]);
+  a[2] = vaddq_s16(in[2], in[29]);
+  a[3] = vaddq_s16(in[3], in[28]);
+  a[4] = vaddq_s16(in[4], in[27]);
+  a[5] = vaddq_s16(in[5], in[26]);
+  a[6] = vaddq_s16(in[6], in[25]);
+  a[7] = vaddq_s16(in[7], in[24]);
+  a[8] = vaddq_s16(in[8], in[23]);
+  a[9] = vaddq_s16(in[9], in[22]);
+  a[10] = vaddq_s16(in[10], in[21]);
+  a[11] = vaddq_s16(in[11], in[20]);
+  a[12] = vaddq_s16(in[12], in[19]);
+  a[13] = vaddq_s16(in[13], in[18]);
+  a[14] = vaddq_s16(in[14], in[17]);
+  a[15] = vaddq_s16(in[15], in[16]);
+  a[16] = vsubq_s16(in[15], in[16]);
+  a[17] = vsubq_s16(in[14], in[17]);
+  a[18] = vsubq_s16(in[13], in[18]);
+  a[19] = vsubq_s16(in[12], in[19]);
+  a[20] = vsubq_s16(in[11], in[20]);
+  a[21] = vsubq_s16(in[10], in[21]);
+  a[22] = vsubq_s16(in[9], in[22]);
+  a[23] = vsubq_s16(in[8], in[23]);
+  a[24] = vsubq_s16(in[7], in[24]);
+  a[25] = vsubq_s16(in[6], in[25]);
+  a[26] = vsubq_s16(in[5], in[26]);
+  a[27] = vsubq_s16(in[4], in[27]);
+  a[28] = vsubq_s16(in[3], in[28]);
+  a[29] = vsubq_s16(in[2], in[29]);
+  a[30] = vsubq_s16(in[1], in[30]);
+  a[31] = vsubq_s16(in[0], in[31]);
+
+  // Stage 2.
+  // For the "rd" version, all the values are rounded down after stage 2 to keep
+  // the values in 16 bits.
+  b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15]));
+  b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14]));
+  b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13]));
+  b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12]));
+  b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11]));
+  b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10]));
+  b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9]));
+  b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8]));
+
+  b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8]));
+  b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9]));
+  b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10]));
+  b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11]));
+  b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12]));
+  b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13]));
+  b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14]));
+  b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15]));
+
+  b[16] = add_round_shift_s16(a[16]);
+  b[17] = add_round_shift_s16(a[17]);
+  b[18] = add_round_shift_s16(a[18]);
+  b[19] = add_round_shift_s16(a[19]);
+
+  butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]);
+  butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]);
+  butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]);
+  butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]);
+  b[20] = add_round_shift_s16(b[20]);
+  b[21] = add_round_shift_s16(b[21]);
+  b[22] = add_round_shift_s16(b[22]);
+  b[23] = add_round_shift_s16(b[23]);
+  b[24] = add_round_shift_s16(b[24]);
+  b[25] = add_round_shift_s16(b[25]);
+  b[26] = add_round_shift_s16(b[26]);
+  b[27] = add_round_shift_s16(b[27]);
+
+  b[28] = add_round_shift_s16(a[28]);
+  b[29] = add_round_shift_s16(a[29]);
+  b[30] = add_round_shift_s16(a[30]);
+  b[31] = add_round_shift_s16(a[31]);
+
+  // Stage 3.
+  a[0] = vaddq_s16(b[0], b[7]);
+  a[1] = vaddq_s16(b[1], b[6]);
+  a[2] = vaddq_s16(b[2], b[5]);
+  a[3] = vaddq_s16(b[3], b[4]);
+
+  a[4] = vsubq_s16(b[3], b[4]);
+  a[5] = vsubq_s16(b[2], b[5]);
+  a[6] = vsubq_s16(b[1], b[6]);
+  a[7] = vsubq_s16(b[0], b[7]);
+
+  a[8] = b[8];
+  a[9] = b[9];
+
+  butterfly_one_coeff_s16_s32_narrow(b[13], b[10], cospi_16_64, &a[13], &a[10]);
+  butterfly_one_coeff_s16_s32_narrow(b[12], b[11], cospi_16_64, &a[12], &a[11]);
+
+  a[14] = b[14];
+  a[15] = b[15];
+
+  a[16] = vaddq_s16(b[16], b[23]);
+  a[17] = vaddq_s16(b[17], b[22]);
+  a[18] = vaddq_s16(b[18], b[21]);
+  a[19] = vaddq_s16(b[19], b[20]);
+
+  a[20] = vsubq_s16(b[19], b[20]);
+  a[21] = vsubq_s16(b[18], b[21]);
+  a[22] = vsubq_s16(b[17], b[22]);
+  a[23] = vsubq_s16(b[16], b[23]);
+
+  a[24] = vsubq_s16(b[31], b[24]);
+  a[25] = vsubq_s16(b[30], b[25]);
+  a[26] = vsubq_s16(b[29], b[26]);
+  a[27] = vsubq_s16(b[28], b[27]);
+
+  a[28] = vaddq_s16(b[28], b[27]);
+  a[29] = vaddq_s16(b[29], b[26]);
+  a[30] = vaddq_s16(b[30], b[25]);
+  a[31] = vaddq_s16(b[31], b[24]);
+
+  // Stage 4.
+  b[0] = vaddq_s16(a[0], a[3]);
+  b[1] = vaddq_s16(a[1], a[2]);
+  b[2] = vsubq_s16(a[1], a[2]);
+  b[3] = vsubq_s16(a[0], a[3]);
+
+  b[4] = a[4];
+
+  butterfly_one_coeff_s16_s32_narrow(a[6], a[5], cospi_16_64, &b[6], &b[5]);
+
+  b[7] = a[7];
+
+  b[8] = vaddq_s16(a[8], a[11]);
+  b[9] = vaddq_s16(a[9], a[10]);
+  b[10] = vsubq_s16(a[9], a[10]);
+  b[11] = vsubq_s16(a[8], a[11]);
+  b[12] = vsubq_s16(a[15], a[12]);
+  b[13] = vsubq_s16(a[14], a[13]);
+  b[14] = vaddq_s16(a[14], a[13]);
+  b[15] = vaddq_s16(a[15], a[12]);
+
+  b[16] = a[16];
+  b[17] = a[17];
+
+  butterfly_two_coeff(a[29], a[18], cospi_8_64, cospi_24_64, &b[29], &b[18]);
+  butterfly_two_coeff(a[28], a[19], cospi_8_64, cospi_24_64, &b[28], &b[19]);
+  butterfly_two_coeff(a[27], a[20], cospi_24_64, -cospi_8_64, &b[27], &b[20]);
+  butterfly_two_coeff(a[26], a[21], cospi_24_64, -cospi_8_64, &b[26], &b[21]);
+
+  b[22] = a[22];
+  b[23] = a[23];
+  b[24] = a[24];
+  b[25] = a[25];
+
+  b[30] = a[30];
+  b[31] = a[31];
+
+  // Stage 5.
+  butterfly_one_coeff_s16_s32_narrow(b[0], b[1], cospi_16_64, &a[0], &a[1]);
+  butterfly_two_coeff(b[3], b[2], cospi_8_64, cospi_24_64, &a[2], &a[3]);
+
+  a[4] = vaddq_s16(b[4], b[5]);
+  a[5] = vsubq_s16(b[4], b[5]);
+  a[6] = vsubq_s16(b[7], b[6]);
+  a[7] = vaddq_s16(b[7], b[6]);
+
+  a[8] = b[8];
+
+  butterfly_two_coeff(b[14], b[9], cospi_8_64, cospi_24_64, &a[14], &a[9]);
+  butterfly_two_coeff(b[13], b[10], cospi_24_64, -cospi_8_64, &a[13], &a[10]);
+
+  a[11] = b[11];
+  a[12] = b[12];
+
+  a[15] = b[15];
+
+  a[16] = vaddq_s16(b[19], b[16]);
+  a[17] = vaddq_s16(b[18], b[17]);
+  a[18] = vsubq_s16(b[17], b[18]);
+  a[19] = vsubq_s16(b[16], b[19]);
+  a[20] = vsubq_s16(b[23], b[20]);
+  a[21] = vsubq_s16(b[22], b[21]);
+  a[22] = vaddq_s16(b[21], b[22]);
+  a[23] = vaddq_s16(b[20], b[23]);
+  a[24] = vaddq_s16(b[27], b[24]);
+  a[25] = vaddq_s16(b[26], b[25]);
+  a[26] = vsubq_s16(b[25], b[26]);
+  a[27] = vsubq_s16(b[24], b[27]);
+  a[28] = vsubq_s16(b[31], b[28]);
+  a[29] = vsubq_s16(b[30], b[29]);
+  a[30] = vaddq_s16(b[29], b[30]);
+  a[31] = vaddq_s16(b[28], b[31]);
+
+  // Stage 6.
+  b[0] = a[0];
+  b[1] = a[1];
+  b[2] = a[2];
+  b[3] = a[3];
+
+  butterfly_two_coeff(a[7], a[4], cospi_4_64, cospi_28_64, &b[4], &b[7]);
+  butterfly_two_coeff(a[6], a[5], cospi_20_64, cospi_12_64, &b[5], &b[6]);
+
+  b[8] = vaddq_s16(a[8], a[9]);
+  b[9] = vsubq_s16(a[8], a[9]);
+  b[10] = vsubq_s16(a[11], a[10]);
+  b[11] = vaddq_s16(a[11], a[10]);
+  b[12] = vaddq_s16(a[12], a[13]);
+  b[13] = vsubq_s16(a[12], a[13]);
+  b[14] = vsubq_s16(a[15], a[14]);
+  b[15] = vaddq_s16(a[15], a[14]);
+
+  b[16] = a[16];
+  b[19] = a[19];
+  b[20] = a[20];
+  b[23] = a[23];
+  b[24] = a[24];
+  b[27] = a[27];
+  b[28] = a[28];
+  b[31] = a[31];
+
+  butterfly_two_coeff(a[30], a[17], cospi_4_64, cospi_28_64, &b[30], &b[17]);
+  butterfly_two_coeff(a[29], a[18], cospi_28_64, -cospi_4_64, &b[29], &b[18]);
+
+  butterfly_two_coeff(a[26], a[21], cospi_20_64, cospi_12_64, &b[26], &b[21]);
+  butterfly_two_coeff(a[25], a[22], cospi_12_64, -cospi_20_64, &b[25], &b[22]);
+
+  // Stage 7.
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+  a[4] = b[4];
+  a[5] = b[5];
+  a[6] = b[6];
+  a[7] = b[7];
+
+  butterfly_two_coeff(b[15], b[8], cospi_2_64, cospi_30_64, &a[8], &a[15]);
+  butterfly_two_coeff(b[14], b[9], cospi_18_64, cospi_14_64, &a[9], &a[14]);
+  butterfly_two_coeff(b[13], b[10], cospi_10_64, cospi_22_64, &a[10], &a[13]);
+  butterfly_two_coeff(b[12], b[11], cospi_26_64, cospi_6_64, &a[11], &a[12]);
+
+  a[16] = vaddq_s16(b[16], b[17]);
+  a[17] = vsubq_s16(b[16], b[17]);
+  a[18] = vsubq_s16(b[19], b[18]);
+  a[19] = vaddq_s16(b[19], b[18]);
+  a[20] = vaddq_s16(b[20], b[21]);
+  a[21] = vsubq_s16(b[20], b[21]);
+  a[22] = vsubq_s16(b[23], b[22]);
+  a[23] = vaddq_s16(b[23], b[22]);
+  a[24] = vaddq_s16(b[24], b[25]);
+  a[25] = vsubq_s16(b[24], b[25]);
+  a[26] = vsubq_s16(b[27], b[26]);
+  a[27] = vaddq_s16(b[27], b[26]);
+  a[28] = vaddq_s16(b[28], b[29]);
+  a[29] = vsubq_s16(b[28], b[29]);
+  a[30] = vsubq_s16(b[31], b[30]);
+  a[31] = vaddq_s16(b[31], b[30]);
+
+  // Final stage.
+  out[0] = a[0];
+  out[16] = a[1];
+  out[8] = a[2];
+  out[24] = a[3];
+  out[4] = a[4];
+  out[20] = a[5];
+  out[12] = a[6];
+  out[28] = a[7];
+  out[2] = a[8];
+  out[18] = a[9];
+  out[10] = a[10];
+  out[26] = a[11];
+  out[6] = a[12];
+  out[22] = a[13];
+  out[14] = a[14];
+  out[30] = a[15];
+
+  butterfly_two_coeff(a[31], a[16], cospi_1_64, cospi_31_64, &out[1], &out[31]);
+  butterfly_two_coeff(a[30], a[17], cospi_17_64, cospi_15_64, &out[17],
+                      &out[15]);
+  butterfly_two_coeff(a[29], a[18], cospi_9_64, cospi_23_64, &out[9], &out[23]);
+  butterfly_two_coeff(a[28], a[19], cospi_25_64, cospi_7_64, &out[25], &out[7]);
+  butterfly_two_coeff(a[27], a[20], cospi_5_64, cospi_27_64, &out[5], &out[27]);
+  butterfly_two_coeff(a[26], a[21], cospi_21_64, cospi_11_64, &out[21],
+                      &out[11]);
+  butterfly_two_coeff(a[25], a[22], cospi_13_64, cospi_19_64, &out[13],
+                      &out[19]);
+  butterfly_two_coeff(a[24], a[23], cospi_29_64, cospi_3_64, &out[29], &out[3]);
+}
+
+#undef PASS_THROUGH
+#undef ADD_S16_S32
+#undef SUB_S16_S32
+#undef ADDW_S16_S32
+#undef SUBW_S16_S32
+#undef ADD_S32
+#undef SUB_S32
+#undef BUTTERFLY_ONE_S16_S32
+#undef BUTTERFLY_ONE_S32
+#undef BUTTERFLY_TWO_S32
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+// Store 32 32x4 vectors, assuming stride == 32.
+static INLINE void store32x32_s32(
+    tran_low_t *a, const int32x4_t *l1 /*[16]*/, const int32x4_t *r1 /*[16]*/,
+    const int32x4_t *l2 /*[16]*/, const int32x4_t *r2 /*[16]*/,
+    const int32x4_t *l3 /*[16]*/, const int32x4_t *r3 /*[16]*/,
+    const int32x4_t *l4 /*[16]*/, const int32x4_t *r4 /*[16]*/) {
+  int i;
+  for (i = 0; i < 32; i++) {
+    vst1q_s32(a, l1[i]);
+    vst1q_s32(a + 4, r1[i]);
+    vst1q_s32(a + 8, l2[i]);
+    vst1q_s32(a + 12, r2[i]);
+    vst1q_s32(a + 16, l3[i]);
+    vst1q_s32(a + 20, r3[i]);
+    vst1q_s32(a + 24, l4[i]);
+    vst1q_s32(a + 28, r4[i]);
+    a += 32;
+  }
+}
+
+static INLINE void highbd_scale_input(const int16x8_t *a /*[32]*/,
+                                      int32x4_t *left /*[32]*/,
+                                      int32x4_t *right /* [32] */) {
+  left[0] = vshll_n_s16(vget_low_s16(a[0]), 2);
+  left[1] = vshll_n_s16(vget_low_s16(a[1]), 2);
+  left[2] = vshll_n_s16(vget_low_s16(a[2]), 2);
+  left[3] = vshll_n_s16(vget_low_s16(a[3]), 2);
+  left[4] = vshll_n_s16(vget_low_s16(a[4]), 2);
+  left[5] = vshll_n_s16(vget_low_s16(a[5]), 2);
+  left[6] = vshll_n_s16(vget_low_s16(a[6]), 2);
+  left[7] = vshll_n_s16(vget_low_s16(a[7]), 2);
+  left[8] = vshll_n_s16(vget_low_s16(a[8]), 2);
+  left[9] = vshll_n_s16(vget_low_s16(a[9]), 2);
+  left[10] = vshll_n_s16(vget_low_s16(a[10]), 2);
+  left[11] = vshll_n_s16(vget_low_s16(a[11]), 2);
+  left[12] = vshll_n_s16(vget_low_s16(a[12]), 2);
+  left[13] = vshll_n_s16(vget_low_s16(a[13]), 2);
+  left[14] = vshll_n_s16(vget_low_s16(a[14]), 2);
+  left[15] = vshll_n_s16(vget_low_s16(a[15]), 2);
+  left[16] = vshll_n_s16(vget_low_s16(a[16]), 2);
+  left[17] = vshll_n_s16(vget_low_s16(a[17]), 2);
+  left[18] = vshll_n_s16(vget_low_s16(a[18]), 2);
+  left[19] = vshll_n_s16(vget_low_s16(a[19]), 2);
+  left[20] = vshll_n_s16(vget_low_s16(a[20]), 2);
+  left[21] = vshll_n_s16(vget_low_s16(a[21]), 2);
+  left[22] = vshll_n_s16(vget_low_s16(a[22]), 2);
+  left[23] = vshll_n_s16(vget_low_s16(a[23]), 2);
+  left[24] = vshll_n_s16(vget_low_s16(a[24]), 2);
+  left[25] = vshll_n_s16(vget_low_s16(a[25]), 2);
+  left[26] = vshll_n_s16(vget_low_s16(a[26]), 2);
+  left[27] = vshll_n_s16(vget_low_s16(a[27]), 2);
+  left[28] = vshll_n_s16(vget_low_s16(a[28]), 2);
+  left[29] = vshll_n_s16(vget_low_s16(a[29]), 2);
+  left[30] = vshll_n_s16(vget_low_s16(a[30]), 2);
+  left[31] = vshll_n_s16(vget_low_s16(a[31]), 2);
+
+  right[0] = vshll_n_s16(vget_high_s16(a[0]), 2);
+  right[1] = vshll_n_s16(vget_high_s16(a[1]), 2);
+  right[2] = vshll_n_s16(vget_high_s16(a[2]), 2);
+  right[3] = vshll_n_s16(vget_high_s16(a[3]), 2);
+  right[4] = vshll_n_s16(vget_high_s16(a[4]), 2);
+  right[5] = vshll_n_s16(vget_high_s16(a[5]), 2);
+  right[6] = vshll_n_s16(vget_high_s16(a[6]), 2);
+  right[7] = vshll_n_s16(vget_high_s16(a[7]), 2);
+  right[8] = vshll_n_s16(vget_high_s16(a[8]), 2);
+  right[9] = vshll_n_s16(vget_high_s16(a[9]), 2);
+  right[10] = vshll_n_s16(vget_high_s16(a[10]), 2);
+  right[11] = vshll_n_s16(vget_high_s16(a[11]), 2);
+  right[12] = vshll_n_s16(vget_high_s16(a[12]), 2);
+  right[13] = vshll_n_s16(vget_high_s16(a[13]), 2);
+  right[14] = vshll_n_s16(vget_high_s16(a[14]), 2);
+  right[15] = vshll_n_s16(vget_high_s16(a[15]), 2);
+  right[16] = vshll_n_s16(vget_high_s16(a[16]), 2);
+  right[17] = vshll_n_s16(vget_high_s16(a[17]), 2);
+  right[18] = vshll_n_s16(vget_high_s16(a[18]), 2);
+  right[19] = vshll_n_s16(vget_high_s16(a[19]), 2);
+  right[20] = vshll_n_s16(vget_high_s16(a[20]), 2);
+  right[21] = vshll_n_s16(vget_high_s16(a[21]), 2);
+  right[22] = vshll_n_s16(vget_high_s16(a[22]), 2);
+  right[23] = vshll_n_s16(vget_high_s16(a[23]), 2);
+  right[24] = vshll_n_s16(vget_high_s16(a[24]), 2);
+  right[25] = vshll_n_s16(vget_high_s16(a[25]), 2);
+  right[26] = vshll_n_s16(vget_high_s16(a[26]), 2);
+  right[27] = vshll_n_s16(vget_high_s16(a[27]), 2);
+  right[28] = vshll_n_s16(vget_high_s16(a[28]), 2);
+  right[29] = vshll_n_s16(vget_high_s16(a[29]), 2);
+  right[30] = vshll_n_s16(vget_high_s16(a[30]), 2);
+  right[31] = vshll_n_s16(vget_high_s16(a[31]), 2);
+}
+
+static INLINE void highbd_cross_input(const int32x4_t *a_left /*[32]*/,
+                                      int32x4_t *a_right /*[32]*/,
+                                      int32x4_t *b_left /*[32]*/,
+                                      int32x4_t *b_right /*[32]*/) {
+  // Stage 1. Done as part of the load for the first pass.
+  b_left[0] = vaddq_s32(a_left[0], a_left[31]);
+  b_left[1] = vaddq_s32(a_left[1], a_left[30]);
+  b_left[2] = vaddq_s32(a_left[2], a_left[29]);
+  b_left[3] = vaddq_s32(a_left[3], a_left[28]);
+  b_left[4] = vaddq_s32(a_left[4], a_left[27]);
+  b_left[5] = vaddq_s32(a_left[5], a_left[26]);
+  b_left[6] = vaddq_s32(a_left[6], a_left[25]);
+  b_left[7] = vaddq_s32(a_left[7], a_left[24]);
+  b_left[8] = vaddq_s32(a_left[8], a_left[23]);
+  b_left[9] = vaddq_s32(a_left[9], a_left[22]);
+  b_left[10] = vaddq_s32(a_left[10], a_left[21]);
+  b_left[11] = vaddq_s32(a_left[11], a_left[20]);
+  b_left[12] = vaddq_s32(a_left[12], a_left[19]);
+  b_left[13] = vaddq_s32(a_left[13], a_left[18]);
+  b_left[14] = vaddq_s32(a_left[14], a_left[17]);
+  b_left[15] = vaddq_s32(a_left[15], a_left[16]);
+
+  b_right[0] = vaddq_s32(a_right[0], a_right[31]);
+  b_right[1] = vaddq_s32(a_right[1], a_right[30]);
+  b_right[2] = vaddq_s32(a_right[2], a_right[29]);
+  b_right[3] = vaddq_s32(a_right[3], a_right[28]);
+  b_right[4] = vaddq_s32(a_right[4], a_right[27]);
+  b_right[5] = vaddq_s32(a_right[5], a_right[26]);
+  b_right[6] = vaddq_s32(a_right[6], a_right[25]);
+  b_right[7] = vaddq_s32(a_right[7], a_right[24]);
+  b_right[8] = vaddq_s32(a_right[8], a_right[23]);
+  b_right[9] = vaddq_s32(a_right[9], a_right[22]);
+  b_right[10] = vaddq_s32(a_right[10], a_right[21]);
+  b_right[11] = vaddq_s32(a_right[11], a_right[20]);
+  b_right[12] = vaddq_s32(a_right[12], a_right[19]);
+  b_right[13] = vaddq_s32(a_right[13], a_right[18]);
+  b_right[14] = vaddq_s32(a_right[14], a_right[17]);
+  b_right[15] = vaddq_s32(a_right[15], a_right[16]);
+
+  b_left[16] = vsubq_s32(a_left[15], a_left[16]);
+  b_left[17] = vsubq_s32(a_left[14], a_left[17]);
+  b_left[18] = vsubq_s32(a_left[13], a_left[18]);
+  b_left[19] = vsubq_s32(a_left[12], a_left[19]);
+  b_left[20] = vsubq_s32(a_left[11], a_left[20]);
+  b_left[21] = vsubq_s32(a_left[10], a_left[21]);
+  b_left[22] = vsubq_s32(a_left[9], a_left[22]);
+  b_left[23] = vsubq_s32(a_left[8], a_left[23]);
+  b_left[24] = vsubq_s32(a_left[7], a_left[24]);
+  b_left[25] = vsubq_s32(a_left[6], a_left[25]);
+  b_left[26] = vsubq_s32(a_left[5], a_left[26]);
+  b_left[27] = vsubq_s32(a_left[4], a_left[27]);
+  b_left[28] = vsubq_s32(a_left[3], a_left[28]);
+  b_left[29] = vsubq_s32(a_left[2], a_left[29]);
+  b_left[30] = vsubq_s32(a_left[1], a_left[30]);
+  b_left[31] = vsubq_s32(a_left[0], a_left[31]);
+
+  b_right[16] = vsubq_s32(a_right[15], a_right[16]);
+  b_right[17] = vsubq_s32(a_right[14], a_right[17]);
+  b_right[18] = vsubq_s32(a_right[13], a_right[18]);
+  b_right[19] = vsubq_s32(a_right[12], a_right[19]);
+  b_right[20] = vsubq_s32(a_right[11], a_right[20]);
+  b_right[21] = vsubq_s32(a_right[10], a_right[21]);
+  b_right[22] = vsubq_s32(a_right[9], a_right[22]);
+  b_right[23] = vsubq_s32(a_right[8], a_right[23]);
+  b_right[24] = vsubq_s32(a_right[7], a_right[24]);
+  b_right[25] = vsubq_s32(a_right[6], a_right[25]);
+  b_right[26] = vsubq_s32(a_right[5], a_right[26]);
+  b_right[27] = vsubq_s32(a_right[4], a_right[27]);
+  b_right[28] = vsubq_s32(a_right[3], a_right[28]);
+  b_right[29] = vsubq_s32(a_right[2], a_right[29]);
+  b_right[30] = vsubq_s32(a_right[1], a_right[30]);
+  b_right[31] = vsubq_s32(a_right[0], a_right[31]);
+}
+
+static INLINE void highbd_partial_add_round_shift(int32x4_t *left /*[32]*/,
+                                                  int32x4_t *right /* [32] */) {
+  // Also compute partial rounding shift:
+  // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+
+  left[0] = add_round_shift_s32(left[0]);
+  left[1] = add_round_shift_s32(left[1]);
+  left[2] = add_round_shift_s32(left[2]);
+  left[3] = add_round_shift_s32(left[3]);
+  left[4] = add_round_shift_s32(left[4]);
+  left[5] = add_round_shift_s32(left[5]);
+  left[6] = add_round_shift_s32(left[6]);
+  left[7] = add_round_shift_s32(left[7]);
+  left[8] = add_round_shift_s32(left[8]);
+  left[9] = add_round_shift_s32(left[9]);
+  left[10] = add_round_shift_s32(left[10]);
+  left[11] = add_round_shift_s32(left[11]);
+  left[12] = add_round_shift_s32(left[12]);
+  left[13] = add_round_shift_s32(left[13]);
+  left[14] = add_round_shift_s32(left[14]);
+  left[15] = add_round_shift_s32(left[15]);
+  left[16] = add_round_shift_s32(left[16]);
+  left[17] = add_round_shift_s32(left[17]);
+  left[18] = add_round_shift_s32(left[18]);
+  left[19] = add_round_shift_s32(left[19]);
+  left[20] = add_round_shift_s32(left[20]);
+  left[21] = add_round_shift_s32(left[21]);
+  left[22] = add_round_shift_s32(left[22]);
+  left[23] = add_round_shift_s32(left[23]);
+  left[24] = add_round_shift_s32(left[24]);
+  left[25] = add_round_shift_s32(left[25]);
+  left[26] = add_round_shift_s32(left[26]);
+  left[27] = add_round_shift_s32(left[27]);
+  left[28] = add_round_shift_s32(left[28]);
+  left[29] = add_round_shift_s32(left[29]);
+  left[30] = add_round_shift_s32(left[30]);
+  left[31] = add_round_shift_s32(left[31]);
+
+  right[0] = add_round_shift_s32(right[0]);
+  right[1] = add_round_shift_s32(right[1]);
+  right[2] = add_round_shift_s32(right[2]);
+  right[3] = add_round_shift_s32(right[3]);
+  right[4] = add_round_shift_s32(right[4]);
+  right[5] = add_round_shift_s32(right[5]);
+  right[6] = add_round_shift_s32(right[6]);
+  right[7] = add_round_shift_s32(right[7]);
+  right[8] = add_round_shift_s32(right[8]);
+  right[9] = add_round_shift_s32(right[9]);
+  right[10] = add_round_shift_s32(right[10]);
+  right[11] = add_round_shift_s32(right[11]);
+  right[12] = add_round_shift_s32(right[12]);
+  right[13] = add_round_shift_s32(right[13]);
+  right[14] = add_round_shift_s32(right[14]);
+  right[15] = add_round_shift_s32(right[15]);
+  right[16] = add_round_shift_s32(right[16]);
+  right[17] = add_round_shift_s32(right[17]);
+  right[18] = add_round_shift_s32(right[18]);
+  right[19] = add_round_shift_s32(right[19]);
+  right[20] = add_round_shift_s32(right[20]);
+  right[21] = add_round_shift_s32(right[21]);
+  right[22] = add_round_shift_s32(right[22]);
+  right[23] = add_round_shift_s32(right[23]);
+  right[24] = add_round_shift_s32(right[24]);
+  right[25] = add_round_shift_s32(right[25]);
+  right[26] = add_round_shift_s32(right[26]);
+  right[27] = add_round_shift_s32(right[27]);
+  right[28] = add_round_shift_s32(right[28]);
+  right[29] = add_round_shift_s32(right[29]);
+  right[30] = add_round_shift_s32(right[30]);
+  right[31] = add_round_shift_s32(right[31]);
+}
+
+static INLINE void highbd_partial_sub_round_shift(int32x4_t *left /*[32]*/,
+                                                  int32x4_t *right /* [32] */) {
+  // Also compute partial rounding shift:
+  // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+
+  left[0] = sub_round_shift_s32(left[0]);
+  left[1] = sub_round_shift_s32(left[1]);
+  left[2] = sub_round_shift_s32(left[2]);
+  left[3] = sub_round_shift_s32(left[3]);
+  left[4] = sub_round_shift_s32(left[4]);
+  left[5] = sub_round_shift_s32(left[5]);
+  left[6] = sub_round_shift_s32(left[6]);
+  left[7] = sub_round_shift_s32(left[7]);
+  left[8] = sub_round_shift_s32(left[8]);
+  left[9] = sub_round_shift_s32(left[9]);
+  left[10] = sub_round_shift_s32(left[10]);
+  left[11] = sub_round_shift_s32(left[11]);
+  left[12] = sub_round_shift_s32(left[12]);
+  left[13] = sub_round_shift_s32(left[13]);
+  left[14] = sub_round_shift_s32(left[14]);
+  left[15] = sub_round_shift_s32(left[15]);
+  left[16] = sub_round_shift_s32(left[16]);
+  left[17] = sub_round_shift_s32(left[17]);
+  left[18] = sub_round_shift_s32(left[18]);
+  left[19] = sub_round_shift_s32(left[19]);
+  left[20] = sub_round_shift_s32(left[20]);
+  left[21] = sub_round_shift_s32(left[21]);
+  left[22] = sub_round_shift_s32(left[22]);
+  left[23] = sub_round_shift_s32(left[23]);
+  left[24] = sub_round_shift_s32(left[24]);
+  left[25] = sub_round_shift_s32(left[25]);
+  left[26] = sub_round_shift_s32(left[26]);
+  left[27] = sub_round_shift_s32(left[27]);
+  left[28] = sub_round_shift_s32(left[28]);
+  left[29] = sub_round_shift_s32(left[29]);
+  left[30] = sub_round_shift_s32(left[30]);
+  left[31] = sub_round_shift_s32(left[31]);
+
+  right[0] = sub_round_shift_s32(right[0]);
+  right[1] = sub_round_shift_s32(right[1]);
+  right[2] = sub_round_shift_s32(right[2]);
+  right[3] = sub_round_shift_s32(right[3]);
+  right[4] = sub_round_shift_s32(right[4]);
+  right[5] = sub_round_shift_s32(right[5]);
+  right[6] = sub_round_shift_s32(right[6]);
+  right[7] = sub_round_shift_s32(right[7]);
+  right[8] = sub_round_shift_s32(right[8]);
+  right[9] = sub_round_shift_s32(right[9]);
+  right[10] = sub_round_shift_s32(right[10]);
+  right[11] = sub_round_shift_s32(right[11]);
+  right[12] = sub_round_shift_s32(right[12]);
+  right[13] = sub_round_shift_s32(right[13]);
+  right[14] = sub_round_shift_s32(right[14]);
+  right[15] = sub_round_shift_s32(right[15]);
+  right[16] = sub_round_shift_s32(right[16]);
+  right[17] = sub_round_shift_s32(right[17]);
+  right[18] = sub_round_shift_s32(right[18]);
+  right[19] = sub_round_shift_s32(right[19]);
+  right[20] = sub_round_shift_s32(right[20]);
+  right[21] = sub_round_shift_s32(right[21]);
+  right[22] = sub_round_shift_s32(right[22]);
+  right[23] = sub_round_shift_s32(right[23]);
+  right[24] = sub_round_shift_s32(right[24]);
+  right[25] = sub_round_shift_s32(right[25]);
+  right[26] = sub_round_shift_s32(right[26]);
+  right[27] = sub_round_shift_s32(right[27]);
+  right[28] = sub_round_shift_s32(right[28]);
+  right[29] = sub_round_shift_s32(right[29]);
+  right[30] = sub_round_shift_s32(right[30]);
+  right[31] = sub_round_shift_s32(right[31]);
+}
+
+static INLINE void highbd_dct8x32_body_first_pass(int32x4_t *left /*32*/,
+                                                  int32x4_t *right /*32*/) {
+  int32x4_t al[32], ar[32];
+  int32x4_t bl[32], br[32];
+
+  // Stage 1: Done as part of the load.
+
+  // Stage 2.
+  // Mini cross. X the first 16 values and the middle 8 of the second half.
+  al[0] = vaddq_s32(left[0], left[15]);
+  ar[0] = vaddq_s32(right[0], right[15]);
+  al[1] = vaddq_s32(left[1], left[14]);
+  ar[1] = vaddq_s32(right[1], right[14]);
+  al[2] = vaddq_s32(left[2], left[13]);
+  ar[2] = vaddq_s32(right[2], right[13]);
+  al[3] = vaddq_s32(left[3], left[12]);
+  ar[3] = vaddq_s32(right[3], right[12]);
+  al[4] = vaddq_s32(left[4], left[11]);
+  ar[4] = vaddq_s32(right[4], right[11]);
+  al[5] = vaddq_s32(left[5], left[10]);
+  ar[5] = vaddq_s32(right[5], right[10]);
+  al[6] = vaddq_s32(left[6], left[9]);
+  ar[6] = vaddq_s32(right[6], right[9]);
+  al[7] = vaddq_s32(left[7], left[8]);
+  ar[7] = vaddq_s32(right[7], right[8]);
+
+  al[8] = vsubq_s32(left[7], left[8]);
+  ar[8] = vsubq_s32(right[7], right[8]);
+  al[9] = vsubq_s32(left[6], left[9]);
+  ar[9] = vsubq_s32(right[6], right[9]);
+  al[10] = vsubq_s32(left[5], left[10]);
+  ar[10] = vsubq_s32(right[5], right[10]);
+  al[11] = vsubq_s32(left[4], left[11]);
+  ar[11] = vsubq_s32(right[4], right[11]);
+  al[12] = vsubq_s32(left[3], left[12]);
+  ar[12] = vsubq_s32(right[3], right[12]);
+  al[13] = vsubq_s32(left[2], left[13]);
+  ar[13] = vsubq_s32(right[2], right[13]);
+  al[14] = vsubq_s32(left[1], left[14]);
+  ar[14] = vsubq_s32(right[1], right[14]);
+  al[15] = vsubq_s32(left[0], left[15]);
+  ar[15] = vsubq_s32(right[0], right[15]);
+
+  al[16] = left[16];
+  ar[16] = right[16];
+  al[17] = left[17];
+  ar[17] = right[17];
+  al[18] = left[18];
+  ar[18] = right[18];
+  al[19] = left[19];
+  ar[19] = right[19];
+
+  butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20],
+                               cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]);
+  butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21],
+                               cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]);
+  butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22],
+                               cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]);
+  butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23],
+                               cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]);
+
+  al[28] = left[28];
+  ar[28] = right[28];
+  al[29] = left[29];
+  ar[29] = right[29];
+  al[30] = left[30];
+  ar[30] = right[30];
+  al[31] = left[31];
+  ar[31] = right[31];
+
+  // Stage 3.
+  bl[0] = vaddq_s32(al[0], al[7]);
+  br[0] = vaddq_s32(ar[0], ar[7]);
+  bl[1] = vaddq_s32(al[1], al[6]);
+  br[1] = vaddq_s32(ar[1], ar[6]);
+  bl[2] = vaddq_s32(al[2], al[5]);
+  br[2] = vaddq_s32(ar[2], ar[5]);
+  bl[3] = vaddq_s32(al[3], al[4]);
+  br[3] = vaddq_s32(ar[3], ar[4]);
+
+  bl[4] = vsubq_s32(al[3], al[4]);
+  br[4] = vsubq_s32(ar[3], ar[4]);
+  bl[5] = vsubq_s32(al[2], al[5]);
+  br[5] = vsubq_s32(ar[2], ar[5]);
+  bl[6] = vsubq_s32(al[1], al[6]);
+  br[6] = vsubq_s32(ar[1], ar[6]);
+  bl[7] = vsubq_s32(al[0], al[7]);
+  br[7] = vsubq_s32(ar[0], ar[7]);
+
+  bl[8] = al[8];
+  br[8] = ar[8];
+  bl[9] = al[9];
+  br[9] = ar[9];
+
+  butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64,
+                               &bl[13], &br[13], &bl[10], &br[10]);
+  butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64,
+                               &bl[12], &br[12], &bl[11], &br[11]);
+
+  bl[14] = al[14];
+  br[14] = ar[14];
+  bl[15] = al[15];
+  br[15] = ar[15];
+
+  bl[16] = vaddq_s32(left[16], al[23]);
+  br[16] = vaddq_s32(right[16], ar[23]);
+  bl[17] = vaddq_s32(left[17], al[22]);
+  br[17] = vaddq_s32(right[17], ar[22]);
+  bl[18] = vaddq_s32(left[18], al[21]);
+  br[18] = vaddq_s32(right[18], ar[21]);
+  bl[19] = vaddq_s32(left[19], al[20]);
+  br[19] = vaddq_s32(right[19], ar[20]);
+
+  bl[20] = vsubq_s32(left[19], al[20]);
+  br[20] = vsubq_s32(right[19], ar[20]);
+  bl[21] = vsubq_s32(left[18], al[21]);
+  br[21] = vsubq_s32(right[18], ar[21]);
+  bl[22] = vsubq_s32(left[17], al[22]);
+  br[22] = vsubq_s32(right[17], ar[22]);
+  bl[23] = vsubq_s32(left[16], al[23]);
+  br[23] = vsubq_s32(right[16], ar[23]);
+
+  bl[24] = vsubq_s32(left[31], al[24]);
+  br[24] = vsubq_s32(right[31], ar[24]);
+  bl[25] = vsubq_s32(left[30], al[25]);
+  br[25] = vsubq_s32(right[30], ar[25]);
+  bl[26] = vsubq_s32(left[29], al[26]);
+  br[26] = vsubq_s32(right[29], ar[26]);
+  bl[27] = vsubq_s32(left[28], al[27]);
+  br[27] = vsubq_s32(right[28], ar[27]);
+
+  bl[28] = vaddq_s32(left[28], al[27]);
+  br[28] = vaddq_s32(right[28], ar[27]);
+  bl[29] = vaddq_s32(left[29], al[26]);
+  br[29] = vaddq_s32(right[29], ar[26]);
+  bl[30] = vaddq_s32(left[30], al[25]);
+  br[30] = vaddq_s32(right[30], ar[25]);
+  bl[31] = vaddq_s32(left[31], al[24]);
+  br[31] = vaddq_s32(right[31], ar[24]);
+
+  // Stage 4.
+  al[0] = vaddq_s32(bl[0], bl[3]);
+  ar[0] = vaddq_s32(br[0], br[3]);
+  al[1] = vaddq_s32(bl[1], bl[2]);
+  ar[1] = vaddq_s32(br[1], br[2]);
+  al[2] = vsubq_s32(bl[1], bl[2]);
+  ar[2] = vsubq_s32(br[1], br[2]);
+  al[3] = vsubq_s32(bl[0], bl[3]);
+  ar[3] = vsubq_s32(br[0], br[3]);
+
+  al[4] = bl[4];
+  ar[4] = br[4];
+
+  butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6],
+                               &ar[6], &al[5], &ar[5]);
+
+  al[7] = bl[7];
+  ar[7] = br[7];
+
+  al[8] = vaddq_s32(bl[8], bl[11]);
+  ar[8] = vaddq_s32(br[8], br[11]);
+  al[9] = vaddq_s32(bl[9], bl[10]);
+  ar[9] = vaddq_s32(br[9], br[10]);
+  al[10] = vsubq_s32(bl[9], bl[10]);
+  ar[10] = vsubq_s32(br[9], br[10]);
+  al[11] = vsubq_s32(bl[8], bl[11]);
+  ar[11] = vsubq_s32(br[8], br[11]);
+  al[12] = vsubq_s32(bl[15], bl[12]);
+  ar[12] = vsubq_s32(br[15], br[12]);
+  al[13] = vsubq_s32(bl[14], bl[13]);
+  ar[13] = vsubq_s32(br[14], br[13]);
+  al[14] = vaddq_s32(bl[14], bl[13]);
+  ar[14] = vaddq_s32(br[14], br[13]);
+  al[15] = vaddq_s32(bl[15], bl[12]);
+  ar[15] = vaddq_s32(br[15], br[12]);
+
+  al[16] = bl[16];
+  ar[16] = br[16];
+  al[17] = bl[17];
+  ar[17] = br[17];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_8_64,
+                                     cospi_24_64, &al[29], &ar[29], &al[18],
+                                     &ar[18]);
+  butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], cospi_8_64,
+                                     cospi_24_64, &al[28], &ar[28], &al[19],
+                                     &ar[19]);
+  butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20],
+                                     cospi_24_64, -cospi_8_64, &al[27], &ar[27],
+                                     &al[20], &ar[20]);
+  butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+                                     cospi_24_64, -cospi_8_64, &al[26], &ar[26],
+                                     &al[21], &ar[21]);
+
+  al[22] = bl[22];
+  ar[22] = br[22];
+  al[23] = bl[23];
+  ar[23] = br[23];
+  al[24] = bl[24];
+  ar[24] = br[24];
+  al[25] = bl[25];
+  ar[25] = br[25];
+
+  al[30] = bl[30];
+  ar[30] = br[30];
+  al[31] = bl[31];
+  ar[31] = br[31];
+
+  // Stage 5.
+  butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0],
+                               &br[0], &bl[1], &br[1]);
+  butterfly_two_coeff_s32_s64_narrow(al[3], ar[3], al[2], ar[2], cospi_8_64,
+                                     cospi_24_64, &bl[2], &br[2], &bl[3],
+                                     &br[3]);
+
+  bl[4] = vaddq_s32(al[4], al[5]);
+  br[4] = vaddq_s32(ar[4], ar[5]);
+  bl[5] = vsubq_s32(al[4], al[5]);
+  br[5] = vsubq_s32(ar[4], ar[5]);
+  bl[6] = vsubq_s32(al[7], al[6]);
+  br[6] = vsubq_s32(ar[7], ar[6]);
+  bl[7] = vaddq_s32(al[7], al[6]);
+  br[7] = vaddq_s32(ar[7], ar[6]);
+
+  bl[8] = al[8];
+  br[8] = ar[8];
+
+  butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_8_64,
+                                     cospi_24_64, &bl[14], &br[14], &bl[9],
+                                     &br[9]);
+  butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
+                                     cospi_24_64, -cospi_8_64, &bl[13], &br[13],
+                                     &bl[10], &br[10]);
+
+  bl[11] = al[11];
+  br[11] = ar[11];
+  bl[12] = al[12];
+  br[12] = ar[12];
+
+  bl[15] = al[15];
+  br[15] = ar[15];
+
+  bl[16] = vaddq_s32(al[19], al[16]);
+  br[16] = vaddq_s32(ar[19], ar[16]);
+  bl[17] = vaddq_s32(al[18], al[17]);
+  br[17] = vaddq_s32(ar[18], ar[17]);
+  bl[18] = vsubq_s32(al[17], al[18]);
+  br[18] = vsubq_s32(ar[17], ar[18]);
+  bl[19] = vsubq_s32(al[16], al[19]);
+  br[19] = vsubq_s32(ar[16], ar[19]);
+  bl[20] = vsubq_s32(al[23], al[20]);
+  br[20] = vsubq_s32(ar[23], ar[20]);
+  bl[21] = vsubq_s32(al[22], al[21]);
+  br[21] = vsubq_s32(ar[22], ar[21]);
+  bl[22] = vaddq_s32(al[21], al[22]);
+  br[22] = vaddq_s32(ar[21], ar[22]);
+  bl[23] = vaddq_s32(al[20], al[23]);
+  br[23] = vaddq_s32(ar[20], ar[23]);
+  bl[24] = vaddq_s32(al[27], al[24]);
+  br[24] = vaddq_s32(ar[27], ar[24]);
+  bl[25] = vaddq_s32(al[26], al[25]);
+  br[25] = vaddq_s32(ar[26], ar[25]);
+  bl[26] = vsubq_s32(al[25], al[26]);
+  br[26] = vsubq_s32(ar[25], ar[26]);
+  bl[27] = vsubq_s32(al[24], al[27]);
+  br[27] = vsubq_s32(ar[24], ar[27]);
+  bl[28] = vsubq_s32(al[31], al[28]);
+  br[28] = vsubq_s32(ar[31], ar[28]);
+  bl[29] = vsubq_s32(al[30], al[29]);
+  br[29] = vsubq_s32(ar[30], ar[29]);
+  bl[30] = vaddq_s32(al[29], al[30]);
+  br[30] = vaddq_s32(ar[29], ar[30]);
+  bl[31] = vaddq_s32(al[28], al[31]);
+  br[31] = vaddq_s32(ar[28], ar[31]);
+
+  // Stage 6.
+  al[0] = bl[0];
+  ar[0] = br[0];
+  al[1] = bl[1];
+  ar[1] = br[1];
+  al[2] = bl[2];
+  ar[2] = br[2];
+  al[3] = bl[3];
+  ar[3] = br[3];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[7], br[7], bl[4], br[4], cospi_4_64,
+                                     cospi_28_64, &al[4], &ar[4], &al[7],
+                                     &ar[7]);
+  butterfly_two_coeff_s32_s64_narrow(bl[6], br[6], bl[5], br[5], cospi_20_64,
+                                     cospi_12_64, &al[5], &ar[5], &al[6],
+                                     &ar[6]);
+
+  al[8] = vaddq_s32(bl[8], bl[9]);
+  ar[8] = vaddq_s32(br[8], br[9]);
+  al[9] = vsubq_s32(bl[8], bl[9]);
+  ar[9] = vsubq_s32(br[8], br[9]);
+  al[10] = vsubq_s32(bl[11], bl[10]);
+  ar[10] = vsubq_s32(br[11], br[10]);
+  al[11] = vaddq_s32(bl[11], bl[10]);
+  ar[11] = vaddq_s32(br[11], br[10]);
+  al[12] = vaddq_s32(bl[12], bl[13]);
+  ar[12] = vaddq_s32(br[12], br[13]);
+  al[13] = vsubq_s32(bl[12], bl[13]);
+  ar[13] = vsubq_s32(br[12], br[13]);
+  al[14] = vsubq_s32(bl[15], bl[14]);
+  ar[14] = vsubq_s32(br[15], br[14]);
+  al[15] = vaddq_s32(bl[15], bl[14]);
+  ar[15] = vaddq_s32(br[15], br[14]);
+
+  al[16] = bl[16];
+  ar[16] = br[16];
+  al[19] = bl[19];
+  ar[19] = br[19];
+  al[20] = bl[20];
+  ar[20] = br[20];
+  al[23] = bl[23];
+  ar[23] = br[23];
+  al[24] = bl[24];
+  ar[24] = br[24];
+  al[27] = bl[27];
+  ar[27] = br[27];
+  al[28] = bl[28];
+  ar[28] = br[28];
+  al[31] = bl[31];
+  ar[31] = br[31];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], cospi_4_64,
+                                     cospi_28_64, &al[30], &ar[30], &al[17],
+                                     &ar[17]);
+  butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18],
+                                     cospi_28_64, -cospi_4_64, &al[29], &ar[29],
+                                     &al[18], &ar[18]);
+  butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+                                     cospi_20_64, cospi_12_64, &al[26], &ar[26],
+                                     &al[21], &ar[21]);
+  butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
+                                     cospi_12_64, -cospi_20_64, &al[25],
+                                     &ar[25], &al[22], &ar[22]);
+
+  // Stage 7.
+  bl[0] = al[0];
+  br[0] = ar[0];
+  bl[1] = al[1];
+  br[1] = ar[1];
+  bl[2] = al[2];
+  br[2] = ar[2];
+  bl[3] = al[3];
+  br[3] = ar[3];
+  bl[4] = al[4];
+  br[4] = ar[4];
+  bl[5] = al[5];
+  br[5] = ar[5];
+  bl[6] = al[6];
+  br[6] = ar[6];
+  bl[7] = al[7];
+  br[7] = ar[7];
+
+  butterfly_two_coeff_s32_s64_narrow(al[15], ar[15], al[8], ar[8], cospi_2_64,
+                                     cospi_30_64, &bl[8], &br[8], &bl[15],
+                                     &br[15]);
+  butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_18_64,
+                                     cospi_14_64, &bl[9], &br[9], &bl[14],
+                                     &br[14]);
+  butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
+                                     cospi_10_64, cospi_22_64, &bl[10], &br[10],
+                                     &bl[13], &br[13]);
+  butterfly_two_coeff_s32_s64_narrow(al[12], ar[12], al[11], ar[11],
+                                     cospi_26_64, cospi_6_64, &bl[11], &br[11],
+                                     &bl[12], &br[12]);
+
+  bl[16] = vaddq_s32(al[16], al[17]);
+  br[16] = vaddq_s32(ar[16], ar[17]);
+  bl[17] = vsubq_s32(al[16], al[17]);
+  br[17] = vsubq_s32(ar[16], ar[17]);
+  bl[18] = vsubq_s32(al[19], al[18]);
+  br[18] = vsubq_s32(ar[19], ar[18]);
+  bl[19] = vaddq_s32(al[19], al[18]);
+  br[19] = vaddq_s32(ar[19], ar[18]);
+  bl[20] = vaddq_s32(al[20], al[21]);
+  br[20] = vaddq_s32(ar[20], ar[21]);
+  bl[21] = vsubq_s32(al[20], al[21]);
+  br[21] = vsubq_s32(ar[20], ar[21]);
+  bl[22] = vsubq_s32(al[23], al[22]);
+  br[22] = vsubq_s32(ar[23], ar[22]);
+  bl[23] = vaddq_s32(al[23], al[22]);
+  br[23] = vaddq_s32(ar[23], ar[22]);
+  bl[24] = vaddq_s32(al[24], al[25]);
+  br[24] = vaddq_s32(ar[24], ar[25]);
+  bl[25] = vsubq_s32(al[24], al[25]);
+  br[25] = vsubq_s32(ar[24], ar[25]);
+  bl[26] = vsubq_s32(al[27], al[26]);
+  br[26] = vsubq_s32(ar[27], ar[26]);
+  bl[27] = vaddq_s32(al[27], al[26]);
+  br[27] = vaddq_s32(ar[27], ar[26]);
+  bl[28] = vaddq_s32(al[28], al[29]);
+  br[28] = vaddq_s32(ar[28], ar[29]);
+  bl[29] = vsubq_s32(al[28], al[29]);
+  br[29] = vsubq_s32(ar[28], ar[29]);
+  bl[30] = vsubq_s32(al[31], al[30]);
+  br[30] = vsubq_s32(ar[31], ar[30]);
+  bl[31] = vaddq_s32(al[31], al[30]);
+  br[31] = vaddq_s32(ar[31], ar[30]);
+
+  // Final stage.
+
+  left[0] = bl[0];
+  right[0] = br[0];
+  left[16] = bl[1];
+  right[16] = br[1];
+  left[8] = bl[2];
+  right[8] = br[2];
+  left[24] = bl[3];
+  right[24] = br[3];
+  left[4] = bl[4];
+  right[4] = br[4];
+  left[20] = bl[5];
+  right[20] = br[5];
+  left[12] = bl[6];
+  right[12] = br[6];
+  left[28] = bl[7];
+  right[28] = br[7];
+  left[2] = bl[8];
+  right[2] = br[8];
+  left[18] = bl[9];
+  right[18] = br[9];
+  left[10] = bl[10];
+  right[10] = br[10];
+  left[26] = bl[11];
+  right[26] = br[11];
+  left[6] = bl[12];
+  right[6] = br[12];
+  left[22] = bl[13];
+  right[22] = br[13];
+  left[14] = bl[14];
+  right[14] = br[14];
+  left[30] = bl[15];
+  right[30] = br[15];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[31], br[31], bl[16], br[16], cospi_1_64,
+                                     cospi_31_64, &al[1], &ar[1], &al[31],
+                                     &ar[31]);
+  left[1] = al[1];
+  right[1] = ar[1];
+  left[31] = al[31];
+  right[31] = ar[31];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17],
+                                     cospi_17_64, cospi_15_64, &al[17], &ar[17],
+                                     &al[15], &ar[15]);
+  left[17] = al[17];
+  right[17] = ar[17];
+  left[15] = al[15];
+  right[15] = ar[15];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_9_64,
+                                     cospi_23_64, &al[9], &ar[9], &al[23],
+                                     &ar[23]);
+  left[9] = al[9];
+  right[9] = ar[9];
+  left[23] = al[23];
+  right[23] = ar[23];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19],
+                                     cospi_25_64, cospi_7_64, &al[25], &ar[25],
+                                     &al[7], &ar[7]);
+  left[25] = al[25];
+  right[25] = ar[25];
+  left[7] = al[7];
+  right[7] = ar[7];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], cospi_5_64,
+                                     cospi_27_64, &al[5], &ar[5], &al[27],
+                                     &ar[27]);
+  left[5] = al[5];
+  right[5] = ar[5];
+  left[27] = al[27];
+  right[27] = ar[27];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+                                     cospi_21_64, cospi_11_64, &al[21], &ar[21],
+                                     &al[11], &ar[11]);
+  left[21] = al[21];
+  right[21] = ar[21];
+  left[11] = al[11];
+  right[11] = ar[11];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
+                                     cospi_13_64, cospi_19_64, &al[13], &ar[13],
+                                     &al[19], &ar[19]);
+  left[13] = al[13];
+  right[13] = ar[13];
+  left[19] = al[19];
+  right[19] = ar[19];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[24], br[24], bl[23], br[23],
+                                     cospi_29_64, cospi_3_64, &al[29], &ar[29],
+                                     &al[3], &ar[3]);
+  left[29] = al[29];
+  right[29] = ar[29];
+  left[3] = al[3];
+  right[3] = ar[3];
+}
+
+static INLINE void highbd_dct8x32_body_second_pass(int32x4_t *left /*32*/,
+                                                   int32x4_t *right /*32*/) {
+  int32x4_t al[32], ar[32];
+  int32x4_t bl[32], br[32];
+
+  // Stage 1: Done as part of the load.
+
+  // Stage 2.
+  // Mini cross. X the first 16 values and the middle 8 of the second half.
+  al[0] = vaddq_s32(left[0], left[15]);
+  ar[0] = vaddq_s32(right[0], right[15]);
+  al[1] = vaddq_s32(left[1], left[14]);
+  ar[1] = vaddq_s32(right[1], right[14]);
+  al[2] = vaddq_s32(left[2], left[13]);
+  ar[2] = vaddq_s32(right[2], right[13]);
+  al[3] = vaddq_s32(left[3], left[12]);
+  ar[3] = vaddq_s32(right[3], right[12]);
+  al[4] = vaddq_s32(left[4], left[11]);
+  ar[4] = vaddq_s32(right[4], right[11]);
+  al[5] = vaddq_s32(left[5], left[10]);
+  ar[5] = vaddq_s32(right[5], right[10]);
+  al[6] = vaddq_s32(left[6], left[9]);
+  ar[6] = vaddq_s32(right[6], right[9]);
+  al[7] = vaddq_s32(left[7], left[8]);
+  ar[7] = vaddq_s32(right[7], right[8]);
+
+  al[8] = vsubq_s32(left[7], left[8]);
+  ar[8] = vsubq_s32(right[7], right[8]);
+  al[9] = vsubq_s32(left[6], left[9]);
+  ar[9] = vsubq_s32(right[6], right[9]);
+  al[10] = vsubq_s32(left[5], left[10]);
+  ar[10] = vsubq_s32(right[5], right[10]);
+  al[11] = vsubq_s32(left[4], left[11]);
+  ar[11] = vsubq_s32(right[4], right[11]);
+  al[12] = vsubq_s32(left[3], left[12]);
+  ar[12] = vsubq_s32(right[3], right[12]);
+  al[13] = vsubq_s32(left[2], left[13]);
+  ar[13] = vsubq_s32(right[2], right[13]);
+  al[14] = vsubq_s32(left[1], left[14]);
+  ar[14] = vsubq_s32(right[1], right[14]);
+  al[15] = vsubq_s32(left[0], left[15]);
+  ar[15] = vsubq_s32(right[0], right[15]);
+
+  al[16] = left[16];
+  ar[16] = right[16];
+  al[17] = left[17];
+  ar[17] = right[17];
+  al[18] = left[18];
+  ar[18] = right[18];
+  al[19] = left[19];
+  ar[19] = right[19];
+
+  butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20],
+                               cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]);
+  butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21],
+                               cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]);
+  butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22],
+                               cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]);
+  butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23],
+                               cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]);
+
+  al[28] = left[28];
+  ar[28] = right[28];
+  al[29] = left[29];
+  ar[29] = right[29];
+  al[30] = left[30];
+  ar[30] = right[30];
+  al[31] = left[31];
+  ar[31] = right[31];
+
+  // Stage 3.
+  bl[0] = vaddq_s32(al[0], al[7]);
+  br[0] = vaddq_s32(ar[0], ar[7]);
+  bl[1] = vaddq_s32(al[1], al[6]);
+  br[1] = vaddq_s32(ar[1], ar[6]);
+  bl[2] = vaddq_s32(al[2], al[5]);
+  br[2] = vaddq_s32(ar[2], ar[5]);
+  bl[3] = vaddq_s32(al[3], al[4]);
+  br[3] = vaddq_s32(ar[3], ar[4]);
+
+  bl[4] = vsubq_s32(al[3], al[4]);
+  br[4] = vsubq_s32(ar[3], ar[4]);
+  bl[5] = vsubq_s32(al[2], al[5]);
+  br[5] = vsubq_s32(ar[2], ar[5]);
+  bl[6] = vsubq_s32(al[1], al[6]);
+  br[6] = vsubq_s32(ar[1], ar[6]);
+  bl[7] = vsubq_s32(al[0], al[7]);
+  br[7] = vsubq_s32(ar[0], ar[7]);
+
+  bl[8] = al[8];
+  br[8] = ar[8];
+  bl[9] = al[9];
+  br[9] = ar[9];
+
+  butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64,
+                               &bl[13], &br[13], &bl[10], &br[10]);
+  butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64,
+                               &bl[12], &br[12], &bl[11], &br[11]);
+
+  bl[14] = al[14];
+  br[14] = ar[14];
+  bl[15] = al[15];
+  br[15] = ar[15];
+
+  bl[16] = vaddq_s32(left[16], al[23]);
+  br[16] = vaddq_s32(right[16], ar[23]);
+  bl[17] = vaddq_s32(left[17], al[22]);
+  br[17] = vaddq_s32(right[17], ar[22]);
+  bl[18] = vaddq_s32(left[18], al[21]);
+  br[18] = vaddq_s32(right[18], ar[21]);
+  bl[19] = vaddq_s32(left[19], al[20]);
+  br[19] = vaddq_s32(right[19], ar[20]);
+
+  bl[20] = vsubq_s32(left[19], al[20]);
+  br[20] = vsubq_s32(right[19], ar[20]);
+  bl[21] = vsubq_s32(left[18], al[21]);
+  br[21] = vsubq_s32(right[18], ar[21]);
+  bl[22] = vsubq_s32(left[17], al[22]);
+  br[22] = vsubq_s32(right[17], ar[22]);
+  bl[23] = vsubq_s32(left[16], al[23]);
+  br[23] = vsubq_s32(right[16], ar[23]);
+
+  bl[24] = vsubq_s32(left[31], al[24]);
+  br[24] = vsubq_s32(right[31], ar[24]);
+  bl[25] = vsubq_s32(left[30], al[25]);
+  br[25] = vsubq_s32(right[30], ar[25]);
+  bl[26] = vsubq_s32(left[29], al[26]);
+  br[26] = vsubq_s32(right[29], ar[26]);
+  bl[27] = vsubq_s32(left[28], al[27]);
+  br[27] = vsubq_s32(right[28], ar[27]);
+
+  bl[28] = vaddq_s32(left[28], al[27]);
+  br[28] = vaddq_s32(right[28], ar[27]);
+  bl[29] = vaddq_s32(left[29], al[26]);
+  br[29] = vaddq_s32(right[29], ar[26]);
+  bl[30] = vaddq_s32(left[30], al[25]);
+  br[30] = vaddq_s32(right[30], ar[25]);
+  bl[31] = vaddq_s32(left[31], al[24]);
+  br[31] = vaddq_s32(right[31], ar[24]);
+
+  // Stage 4.
+  al[0] = vaddq_s32(bl[0], bl[3]);
+  ar[0] = vaddq_s32(br[0], br[3]);
+  al[1] = vaddq_s32(bl[1], bl[2]);
+  ar[1] = vaddq_s32(br[1], br[2]);
+  al[2] = vsubq_s32(bl[1], bl[2]);
+  ar[2] = vsubq_s32(br[1], br[2]);
+  al[3] = vsubq_s32(bl[0], bl[3]);
+  ar[3] = vsubq_s32(br[0], br[3]);
+
+  al[4] = bl[4];
+  ar[4] = br[4];
+
+  butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6],
+                               &ar[6], &al[5], &ar[5]);
+
+  al[7] = bl[7];
+  ar[7] = br[7];
+
+  al[8] = vaddq_s32(bl[8], bl[11]);
+  ar[8] = vaddq_s32(br[8], br[11]);
+  al[9] = vaddq_s32(bl[9], bl[10]);
+  ar[9] = vaddq_s32(br[9], br[10]);
+  al[10] = vsubq_s32(bl[9], bl[10]);
+  ar[10] = vsubq_s32(br[9], br[10]);
+  al[11] = vsubq_s32(bl[8], bl[11]);
+  ar[11] = vsubq_s32(br[8], br[11]);
+  al[12] = vsubq_s32(bl[15], bl[12]);
+  ar[12] = vsubq_s32(br[15], br[12]);
+  al[13] = vsubq_s32(bl[14], bl[13]);
+  ar[13] = vsubq_s32(br[14], br[13]);
+  al[14] = vaddq_s32(bl[14], bl[13]);
+  ar[14] = vaddq_s32(br[14], br[13]);
+  al[15] = vaddq_s32(bl[15], bl[12]);
+  ar[15] = vaddq_s32(br[15], br[12]);
+
+  al[16] = bl[16];
+  ar[16] = br[16];
+  al[17] = bl[17];
+  ar[17] = br[17];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_8_64,
+                                     cospi_24_64, &al[29], &ar[29], &al[18],
+                                     &ar[18]);
+  butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], cospi_8_64,
+                                     cospi_24_64, &al[28], &ar[28], &al[19],
+                                     &ar[19]);
+  butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20],
+                                     cospi_24_64, -cospi_8_64, &al[27], &ar[27],
+                                     &al[20], &ar[20]);
+  butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+                                     cospi_24_64, -cospi_8_64, &al[26], &ar[26],
+                                     &al[21], &ar[21]);
+
+  al[22] = bl[22];
+  ar[22] = br[22];
+  al[23] = bl[23];
+  ar[23] = br[23];
+  al[24] = bl[24];
+  ar[24] = br[24];
+  al[25] = bl[25];
+  ar[25] = br[25];
+
+  al[30] = bl[30];
+  ar[30] = br[30];
+  al[31] = bl[31];
+  ar[31] = br[31];
+
+  // Stage 5.
+  butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0],
+                               &br[0], &bl[1], &br[1]);
+  butterfly_two_coeff_s32_s64_narrow(al[3], ar[3], al[2], ar[2], cospi_8_64,
+                                     cospi_24_64, &bl[2], &br[2], &bl[3],
+                                     &br[3]);
+
+  bl[4] = vaddq_s32(al[4], al[5]);
+  br[4] = vaddq_s32(ar[4], ar[5]);
+  bl[5] = vsubq_s32(al[4], al[5]);
+  br[5] = vsubq_s32(ar[4], ar[5]);
+  bl[6] = vsubq_s32(al[7], al[6]);
+  br[6] = vsubq_s32(ar[7], ar[6]);
+  bl[7] = vaddq_s32(al[7], al[6]);
+  br[7] = vaddq_s32(ar[7], ar[6]);
+
+  bl[8] = al[8];
+  br[8] = ar[8];
+
+  butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_8_64,
+                                     cospi_24_64, &bl[14], &br[14], &bl[9],
+                                     &br[9]);
+  butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
+                                     cospi_24_64, -cospi_8_64, &bl[13], &br[13],
+                                     &bl[10], &br[10]);
+
+  bl[11] = al[11];
+  br[11] = ar[11];
+  bl[12] = al[12];
+  br[12] = ar[12];
+
+  bl[15] = al[15];
+  br[15] = ar[15];
+
+  bl[16] = vaddq_s32(al[19], al[16]);
+  br[16] = vaddq_s32(ar[19], ar[16]);
+  bl[17] = vaddq_s32(al[18], al[17]);
+  br[17] = vaddq_s32(ar[18], ar[17]);
+  bl[18] = vsubq_s32(al[17], al[18]);
+  br[18] = vsubq_s32(ar[17], ar[18]);
+  bl[19] = vsubq_s32(al[16], al[19]);
+  br[19] = vsubq_s32(ar[16], ar[19]);
+  bl[20] = vsubq_s32(al[23], al[20]);
+  br[20] = vsubq_s32(ar[23], ar[20]);
+  bl[21] = vsubq_s32(al[22], al[21]);
+  br[21] = vsubq_s32(ar[22], ar[21]);
+  bl[22] = vaddq_s32(al[21], al[22]);
+  br[22] = vaddq_s32(ar[21], ar[22]);
+  bl[23] = vaddq_s32(al[20], al[23]);
+  br[23] = vaddq_s32(ar[20], ar[23]);
+  bl[24] = vaddq_s32(al[27], al[24]);
+  br[24] = vaddq_s32(ar[27], ar[24]);
+  bl[25] = vaddq_s32(al[26], al[25]);
+  br[25] = vaddq_s32(ar[26], ar[25]);
+  bl[26] = vsubq_s32(al[25], al[26]);
+  br[26] = vsubq_s32(ar[25], ar[26]);
+  bl[27] = vsubq_s32(al[24], al[27]);
+  br[27] = vsubq_s32(ar[24], ar[27]);
+  bl[28] = vsubq_s32(al[31], al[28]);
+  br[28] = vsubq_s32(ar[31], ar[28]);
+  bl[29] = vsubq_s32(al[30], al[29]);
+  br[29] = vsubq_s32(ar[30], ar[29]);
+  bl[30] = vaddq_s32(al[29], al[30]);
+  br[30] = vaddq_s32(ar[29], ar[30]);
+  bl[31] = vaddq_s32(al[28], al[31]);
+  br[31] = vaddq_s32(ar[28], ar[31]);
+
+  // Stage 6.
+  al[0] = bl[0];
+  ar[0] = br[0];
+  al[1] = bl[1];
+  ar[1] = br[1];
+  al[2] = bl[2];
+  ar[2] = br[2];
+  al[3] = bl[3];
+  ar[3] = br[3];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[7], br[7], bl[4], br[4], cospi_4_64,
+                                     cospi_28_64, &al[4], &ar[4], &al[7],
+                                     &ar[7]);
+  butterfly_two_coeff_s32_s64_narrow(bl[6], br[6], bl[5], br[5], cospi_20_64,
+                                     cospi_12_64, &al[5], &ar[5], &al[6],
+                                     &ar[6]);
+
+  al[8] = vaddq_s32(bl[8], bl[9]);
+  ar[8] = vaddq_s32(br[8], br[9]);
+  al[9] = vsubq_s32(bl[8], bl[9]);
+  ar[9] = vsubq_s32(br[8], br[9]);
+  al[10] = vsubq_s32(bl[11], bl[10]);
+  ar[10] = vsubq_s32(br[11], br[10]);
+  al[11] = vaddq_s32(bl[11], bl[10]);
+  ar[11] = vaddq_s32(br[11], br[10]);
+  al[12] = vaddq_s32(bl[12], bl[13]);
+  ar[12] = vaddq_s32(br[12], br[13]);
+  al[13] = vsubq_s32(bl[12], bl[13]);
+  ar[13] = vsubq_s32(br[12], br[13]);
+  al[14] = vsubq_s32(bl[15], bl[14]);
+  ar[14] = vsubq_s32(br[15], br[14]);
+  al[15] = vaddq_s32(bl[15], bl[14]);
+  ar[15] = vaddq_s32(br[15], br[14]);
+
+  al[16] = bl[16];
+  ar[16] = br[16];
+  al[19] = bl[19];
+  ar[19] = br[19];
+  al[20] = bl[20];
+  ar[20] = br[20];
+  al[23] = bl[23];
+  ar[23] = br[23];
+  al[24] = bl[24];
+  ar[24] = br[24];
+  al[27] = bl[27];
+  ar[27] = br[27];
+  al[28] = bl[28];
+  ar[28] = br[28];
+  al[31] = bl[31];
+  ar[31] = br[31];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], cospi_4_64,
+                                     cospi_28_64, &al[30], &ar[30], &al[17],
+                                     &ar[17]);
+  butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18],
+                                     cospi_28_64, -cospi_4_64, &al[29], &ar[29],
+                                     &al[18], &ar[18]);
+  butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+                                     cospi_20_64, cospi_12_64, &al[26], &ar[26],
+                                     &al[21], &ar[21]);
+  butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
+                                     cospi_12_64, -cospi_20_64, &al[25],
+                                     &ar[25], &al[22], &ar[22]);
+
+  // Stage 7.
+  bl[0] = al[0];
+  br[0] = ar[0];
+  bl[1] = al[1];
+  br[1] = ar[1];
+  bl[2] = al[2];
+  br[2] = ar[2];
+  bl[3] = al[3];
+  br[3] = ar[3];
+  bl[4] = al[4];
+  br[4] = ar[4];
+  bl[5] = al[5];
+  br[5] = ar[5];
+  bl[6] = al[6];
+  br[6] = ar[6];
+  bl[7] = al[7];
+  br[7] = ar[7];
+
+  butterfly_two_coeff_s32_s64_narrow(al[15], ar[15], al[8], ar[8], cospi_2_64,
+                                     cospi_30_64, &bl[8], &br[8], &bl[15],
+                                     &br[15]);
+  butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_18_64,
+                                     cospi_14_64, &bl[9], &br[9], &bl[14],
+                                     &br[14]);
+  butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
+                                     cospi_10_64, cospi_22_64, &bl[10], &br[10],
+                                     &bl[13], &br[13]);
+  butterfly_two_coeff_s32_s64_narrow(al[12], ar[12], al[11], ar[11],
+                                     cospi_26_64, cospi_6_64, &bl[11], &br[11],
+                                     &bl[12], &br[12]);
+
+  bl[16] = vaddq_s32(al[16], al[17]);
+  br[16] = vaddq_s32(ar[16], ar[17]);
+  bl[17] = vsubq_s32(al[16], al[17]);
+  br[17] = vsubq_s32(ar[16], ar[17]);
+  bl[18] = vsubq_s32(al[19], al[18]);
+  br[18] = vsubq_s32(ar[19], ar[18]);
+  bl[19] = vaddq_s32(al[19], al[18]);
+  br[19] = vaddq_s32(ar[19], ar[18]);
+  bl[20] = vaddq_s32(al[20], al[21]);
+  br[20] = vaddq_s32(ar[20], ar[21]);
+  bl[21] = vsubq_s32(al[20], al[21]);
+  br[21] = vsubq_s32(ar[20], ar[21]);
+  bl[22] = vsubq_s32(al[23], al[22]);
+  br[22] = vsubq_s32(ar[23], ar[22]);
+  bl[23] = vaddq_s32(al[23], al[22]);
+  br[23] = vaddq_s32(ar[23], ar[22]);
+  bl[24] = vaddq_s32(al[24], al[25]);
+  br[24] = vaddq_s32(ar[24], ar[25]);
+  bl[25] = vsubq_s32(al[24], al[25]);
+  br[25] = vsubq_s32(ar[24], ar[25]);
+  bl[26] = vsubq_s32(al[27], al[26]);
+  br[26] = vsubq_s32(ar[27], ar[26]);
+  bl[27] = vaddq_s32(al[27], al[26]);
+  br[27] = vaddq_s32(ar[27], ar[26]);
+  bl[28] = vaddq_s32(al[28], al[29]);
+  br[28] = vaddq_s32(ar[28], ar[29]);
+  bl[29] = vsubq_s32(al[28], al[29]);
+  br[29] = vsubq_s32(ar[28], ar[29]);
+  bl[30] = vsubq_s32(al[31], al[30]);
+  br[30] = vsubq_s32(ar[31], ar[30]);
+  bl[31] = vaddq_s32(al[31], al[30]);
+  br[31] = vaddq_s32(ar[31], ar[30]);
+
+  // Final stage.
+
+  left[0] = bl[0];
+  right[0] = br[0];
+  left[16] = bl[1];
+  right[16] = br[1];
+  left[8] = bl[2];
+  right[8] = br[2];
+  left[24] = bl[3];
+  right[24] = br[3];
+  left[4] = bl[4];
+  right[4] = br[4];
+  left[20] = bl[5];
+  right[20] = br[5];
+  left[12] = bl[6];
+  right[12] = br[6];
+  left[28] = bl[7];
+  right[28] = br[7];
+  left[2] = bl[8];
+  right[2] = br[8];
+  left[18] = bl[9];
+  right[18] = br[9];
+  left[10] = bl[10];
+  right[10] = br[10];
+  left[26] = bl[11];
+  right[26] = br[11];
+  left[6] = bl[12];
+  right[6] = br[12];
+  left[22] = bl[13];
+  right[22] = br[13];
+  left[14] = bl[14];
+  right[14] = br[14];
+  left[30] = bl[15];
+  right[30] = br[15];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[31], br[31], bl[16], br[16], cospi_1_64,
+                                     cospi_31_64, &al[1], &ar[1], &al[31],
+                                     &ar[31]);
+  left[1] = al[1];
+  right[1] = ar[1];
+  left[31] = al[31];
+  right[31] = ar[31];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17],
+                                     cospi_17_64, cospi_15_64, &al[17], &ar[17],
+                                     &al[15], &ar[15]);
+  left[17] = al[17];
+  right[17] = ar[17];
+  left[15] = al[15];
+  right[15] = ar[15];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_9_64,
+                                     cospi_23_64, &al[9], &ar[9], &al[23],
+                                     &ar[23]);
+  left[9] = al[9];
+  right[9] = ar[9];
+  left[23] = al[23];
+  right[23] = ar[23];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19],
+                                     cospi_25_64, cospi_7_64, &al[25], &ar[25],
+                                     &al[7], &ar[7]);
+  left[25] = al[25];
+  right[25] = ar[25];
+  left[7] = al[7];
+  right[7] = ar[7];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], cospi_5_64,
+                                     cospi_27_64, &al[5], &ar[5], &al[27],
+                                     &ar[27]);
+  left[5] = al[5];
+  right[5] = ar[5];
+  left[27] = al[27];
+  right[27] = ar[27];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+                                     cospi_21_64, cospi_11_64, &al[21], &ar[21],
+                                     &al[11], &ar[11]);
+  left[21] = al[21];
+  right[21] = ar[21];
+  left[11] = al[11];
+  right[11] = ar[11];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
+                                     cospi_13_64, cospi_19_64, &al[13], &ar[13],
+                                     &al[19], &ar[19]);
+  left[13] = al[13];
+  right[13] = ar[13];
+  left[19] = al[19];
+  right[19] = ar[19];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[24], br[24], bl[23], br[23],
+                                     cospi_29_64, cospi_3_64, &al[29], &ar[29],
+                                     &al[3], &ar[3]);
+  left[29] = al[29];
+  right[29] = ar[29];
+  left[3] = al[3];
+  right[3] = ar[3];
+}
+
+static INLINE void highbd_dct8x32_body_second_pass_rd(int32x4_t *left /*32*/,
+                                                      int32x4_t *right /*32*/) {
+  int32x4_t al[32], ar[32];
+  int32x4_t bl[32], br[32];
+
+  // Stage 1: Done as part of the load.
+
+  // Stage 2.
+  // For the "rd" version, all the values are rounded down after stage 2 to keep
+  // the values in 16 bits.
+  al[0] = add_round_shift_s32(vaddq_s32(left[0], left[15]));
+  ar[0] = add_round_shift_s32(vaddq_s32(right[0], right[15]));
+  al[1] = add_round_shift_s32(vaddq_s32(left[1], left[14]));
+  ar[1] = add_round_shift_s32(vaddq_s32(right[1], right[14]));
+  al[2] = add_round_shift_s32(vaddq_s32(left[2], left[13]));
+  ar[2] = add_round_shift_s32(vaddq_s32(right[2], right[13]));
+  al[3] = add_round_shift_s32(vaddq_s32(left[3], left[12]));
+  ar[3] = add_round_shift_s32(vaddq_s32(right[3], right[12]));
+  al[4] = add_round_shift_s32(vaddq_s32(left[4], left[11]));
+  ar[4] = add_round_shift_s32(vaddq_s32(right[4], right[11]));
+  al[5] = add_round_shift_s32(vaddq_s32(left[5], left[10]));
+  ar[5] = add_round_shift_s32(vaddq_s32(right[5], right[10]));
+  al[6] = add_round_shift_s32(vaddq_s32(left[6], left[9]));
+  ar[6] = add_round_shift_s32(vaddq_s32(right[6], right[9]));
+  al[7] = add_round_shift_s32(vaddq_s32(left[7], left[8]));
+  ar[7] = add_round_shift_s32(vaddq_s32(right[7], right[8]));
+
+  al[8] = add_round_shift_s32(vsubq_s32(left[7], left[8]));
+  ar[8] = add_round_shift_s32(vsubq_s32(right[7], right[8]));
+  al[9] = add_round_shift_s32(vsubq_s32(left[6], left[9]));
+  ar[9] = add_round_shift_s32(vsubq_s32(right[6], right[9]));
+  al[10] = add_round_shift_s32(vsubq_s32(left[5], left[10]));
+  ar[10] = add_round_shift_s32(vsubq_s32(right[5], right[10]));
+  al[11] = add_round_shift_s32(vsubq_s32(left[4], left[11]));
+  ar[11] = add_round_shift_s32(vsubq_s32(right[4], right[11]));
+  al[12] = add_round_shift_s32(vsubq_s32(left[3], left[12]));
+  ar[12] = add_round_shift_s32(vsubq_s32(right[3], right[12]));
+  al[13] = add_round_shift_s32(vsubq_s32(left[2], left[13]));
+  ar[13] = add_round_shift_s32(vsubq_s32(right[2], right[13]));
+  al[14] = add_round_shift_s32(vsubq_s32(left[1], left[14]));
+  ar[14] = add_round_shift_s32(vsubq_s32(right[1], right[14]));
+  al[15] = add_round_shift_s32(vsubq_s32(left[0], left[15]));
+  ar[15] = add_round_shift_s32(vsubq_s32(right[0], right[15]));
+
+  al[16] = add_round_shift_s32(left[16]);
+  ar[16] = add_round_shift_s32(right[16]);
+  al[17] = add_round_shift_s32(left[17]);
+  ar[17] = add_round_shift_s32(right[17]);
+  al[18] = add_round_shift_s32(left[18]);
+  ar[18] = add_round_shift_s32(right[18]);
+  al[19] = add_round_shift_s32(left[19]);
+  ar[19] = add_round_shift_s32(right[19]);
+
+  butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20],
+                               cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]);
+  butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21],
+                               cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]);
+  butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22],
+                               cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]);
+  butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23],
+                               cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]);
+
+  al[20] = add_round_shift_s32(al[20]);
+  ar[20] = add_round_shift_s32(ar[20]);
+  al[21] = add_round_shift_s32(al[21]);
+  ar[21] = add_round_shift_s32(ar[21]);
+  al[22] = add_round_shift_s32(al[22]);
+  ar[22] = add_round_shift_s32(ar[22]);
+  al[23] = add_round_shift_s32(al[23]);
+  ar[23] = add_round_shift_s32(ar[23]);
+  al[24] = add_round_shift_s32(al[24]);
+  ar[24] = add_round_shift_s32(ar[24]);
+  al[25] = add_round_shift_s32(al[25]);
+  ar[25] = add_round_shift_s32(ar[25]);
+  al[26] = add_round_shift_s32(al[26]);
+  ar[26] = add_round_shift_s32(ar[26]);
+  al[27] = add_round_shift_s32(al[27]);
+  ar[27] = add_round_shift_s32(ar[27]);
+
+  al[28] = add_round_shift_s32(left[28]);
+  ar[28] = add_round_shift_s32(right[28]);
+  al[29] = add_round_shift_s32(left[29]);
+  ar[29] = add_round_shift_s32(right[29]);
+  al[30] = add_round_shift_s32(left[30]);
+  ar[30] = add_round_shift_s32(right[30]);
+  al[31] = add_round_shift_s32(left[31]);
+  ar[31] = add_round_shift_s32(right[31]);
+
+  // Stage 3.
+  bl[0] = vaddq_s32(al[0], al[7]);
+  br[0] = vaddq_s32(ar[0], ar[7]);
+  bl[1] = vaddq_s32(al[1], al[6]);
+  br[1] = vaddq_s32(ar[1], ar[6]);
+  bl[2] = vaddq_s32(al[2], al[5]);
+  br[2] = vaddq_s32(ar[2], ar[5]);
+  bl[3] = vaddq_s32(al[3], al[4]);
+  br[3] = vaddq_s32(ar[3], ar[4]);
+
+  bl[4] = vsubq_s32(al[3], al[4]);
+  br[4] = vsubq_s32(ar[3], ar[4]);
+  bl[5] = vsubq_s32(al[2], al[5]);
+  br[5] = vsubq_s32(ar[2], ar[5]);
+  bl[6] = vsubq_s32(al[1], al[6]);
+  br[6] = vsubq_s32(ar[1], ar[6]);
+  bl[7] = vsubq_s32(al[0], al[7]);
+  br[7] = vsubq_s32(ar[0], ar[7]);
+
+  bl[8] = al[8];
+  br[8] = ar[8];
+  bl[9] = al[9];
+  br[9] = ar[9];
+
+  butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64,
+                               &bl[13], &br[13], &bl[10], &br[10]);
+  butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64,
+                               &bl[12], &br[12], &bl[11], &br[11]);
+
+  bl[14] = al[14];
+  br[14] = ar[14];
+  bl[15] = al[15];
+  br[15] = ar[15];
+
+  bl[16] = vaddq_s32(al[16], al[23]);
+  br[16] = vaddq_s32(ar[16], ar[23]);
+  bl[17] = vaddq_s32(al[17], al[22]);
+  br[17] = vaddq_s32(ar[17], ar[22]);
+  bl[18] = vaddq_s32(al[18], al[21]);
+  br[18] = vaddq_s32(ar[18], ar[21]);
+  bl[19] = vaddq_s32(al[19], al[20]);
+  br[19] = vaddq_s32(ar[19], ar[20]);
+
+  bl[20] = vsubq_s32(al[19], al[20]);
+  br[20] = vsubq_s32(ar[19], ar[20]);
+  bl[21] = vsubq_s32(al[18], al[21]);
+  br[21] = vsubq_s32(ar[18], ar[21]);
+  bl[22] = vsubq_s32(al[17], al[22]);
+  br[22] = vsubq_s32(ar[17], ar[22]);
+  bl[23] = vsubq_s32(al[16], al[23]);
+  br[23] = vsubq_s32(ar[16], ar[23]);
+
+  bl[24] = vsubq_s32(al[31], al[24]);
+  br[24] = vsubq_s32(ar[31], ar[24]);
+  bl[25] = vsubq_s32(al[30], al[25]);
+  br[25] = vsubq_s32(ar[30], ar[25]);
+  bl[26] = vsubq_s32(al[29], al[26]);
+  br[26] = vsubq_s32(ar[29], ar[26]);
+  bl[27] = vsubq_s32(al[28], al[27]);
+  br[27] = vsubq_s32(ar[28], ar[27]);
+
+  bl[28] = vaddq_s32(al[28], al[27]);
+  br[28] = vaddq_s32(ar[28], ar[27]);
+  bl[29] = vaddq_s32(al[29], al[26]);
+  br[29] = vaddq_s32(ar[29], ar[26]);
+  bl[30] = vaddq_s32(al[30], al[25]);
+  br[30] = vaddq_s32(ar[30], ar[25]);
+  bl[31] = vaddq_s32(al[31], al[24]);
+  br[31] = vaddq_s32(ar[31], ar[24]);
+
+  // Stage 4.
+  al[0] = vaddq_s32(bl[0], bl[3]);
+  ar[0] = vaddq_s32(br[0], br[3]);
+  al[1] = vaddq_s32(bl[1], bl[2]);
+  ar[1] = vaddq_s32(br[1], br[2]);
+  al[2] = vsubq_s32(bl[1], bl[2]);
+  ar[2] = vsubq_s32(br[1], br[2]);
+  al[3] = vsubq_s32(bl[0], bl[3]);
+  ar[3] = vsubq_s32(br[0], br[3]);
+
+  al[4] = bl[4];
+  ar[4] = br[4];
+
+  butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6],
+                               &ar[6], &al[5], &ar[5]);
+
+  al[7] = bl[7];
+  ar[7] = br[7];
+
+  al[8] = vaddq_s32(bl[8], bl[11]);
+  ar[8] = vaddq_s32(br[8], br[11]);
+  al[9] = vaddq_s32(bl[9], bl[10]);
+  ar[9] = vaddq_s32(br[9], br[10]);
+  al[10] = vsubq_s32(bl[9], bl[10]);
+  ar[10] = vsubq_s32(br[9], br[10]);
+  al[11] = vsubq_s32(bl[8], bl[11]);
+  ar[11] = vsubq_s32(br[8], br[11]);
+  al[12] = vsubq_s32(bl[15], bl[12]);
+  ar[12] = vsubq_s32(br[15], br[12]);
+  al[13] = vsubq_s32(bl[14], bl[13]);
+  ar[13] = vsubq_s32(br[14], br[13]);
+  al[14] = vaddq_s32(bl[14], bl[13]);
+  ar[14] = vaddq_s32(br[14], br[13]);
+  al[15] = vaddq_s32(bl[15], bl[12]);
+  ar[15] = vaddq_s32(br[15], br[12]);
+
+  al[16] = bl[16];
+  ar[16] = br[16];
+  al[17] = bl[17];
+  ar[17] = br[17];
+
+  butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_8_64,
+                          cospi_24_64, &al[29], &ar[29], &al[18], &ar[18]);
+  butterfly_two_coeff_s32(bl[28], br[28], bl[19], br[19], cospi_8_64,
+                          cospi_24_64, &al[28], &ar[28], &al[19], &ar[19]);
+  butterfly_two_coeff_s32(bl[27], br[27], bl[20], br[20], cospi_24_64,
+                          -cospi_8_64, &al[27], &ar[27], &al[20], &ar[20]);
+  butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_24_64,
+                          -cospi_8_64, &al[26], &ar[26], &al[21], &ar[21]);
+
+  al[22] = bl[22];
+  ar[22] = br[22];
+  al[23] = bl[23];
+  ar[23] = br[23];
+  al[24] = bl[24];
+  ar[24] = br[24];
+  al[25] = bl[25];
+  ar[25] = br[25];
+
+  al[30] = bl[30];
+  ar[30] = br[30];
+  al[31] = bl[31];
+  ar[31] = br[31];
+
+  // Stage 5.
+  butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0],
+                               &br[0], &bl[1], &br[1]);
+  butterfly_two_coeff_s32(al[3], ar[3], al[2], ar[2], cospi_8_64, cospi_24_64,
+                          &bl[2], &br[2], &bl[3], &br[3]);
+
+  bl[4] = vaddq_s32(al[4], al[5]);
+  br[4] = vaddq_s32(ar[4], ar[5]);
+  bl[5] = vsubq_s32(al[4], al[5]);
+  br[5] = vsubq_s32(ar[4], ar[5]);
+  bl[6] = vsubq_s32(al[7], al[6]);
+  br[6] = vsubq_s32(ar[7], ar[6]);
+  bl[7] = vaddq_s32(al[7], al[6]);
+  br[7] = vaddq_s32(ar[7], ar[6]);
+
+  bl[8] = al[8];
+  br[8] = ar[8];
+
+  butterfly_two_coeff_s32(al[14], ar[14], al[9], ar[9], cospi_8_64, cospi_24_64,
+                          &bl[14], &br[14], &bl[9], &br[9]);
+  butterfly_two_coeff_s32(al[13], ar[13], al[10], ar[10], cospi_24_64,
+                          -cospi_8_64, &bl[13], &br[13], &bl[10], &br[10]);
+
+  bl[11] = al[11];
+  br[11] = ar[11];
+  bl[12] = al[12];
+  br[12] = ar[12];
+
+  bl[15] = al[15];
+  br[15] = ar[15];
+
+  bl[16] = vaddq_s32(al[19], al[16]);
+  br[16] = vaddq_s32(ar[19], ar[16]);
+  bl[17] = vaddq_s32(al[18], al[17]);
+  br[17] = vaddq_s32(ar[18], ar[17]);
+  bl[18] = vsubq_s32(al[17], al[18]);
+  br[18] = vsubq_s32(ar[17], ar[18]);
+  bl[19] = vsubq_s32(al[16], al[19]);
+  br[19] = vsubq_s32(ar[16], ar[19]);
+  bl[20] = vsubq_s32(al[23], al[20]);
+  br[20] = vsubq_s32(ar[23], ar[20]);
+  bl[21] = vsubq_s32(al[22], al[21]);
+  br[21] = vsubq_s32(ar[22], ar[21]);
+  bl[22] = vaddq_s32(al[21], al[22]);
+  br[22] = vaddq_s32(ar[21], ar[22]);
+  bl[23] = vaddq_s32(al[20], al[23]);
+  br[23] = vaddq_s32(ar[20], ar[23]);
+  bl[24] = vaddq_s32(al[27], al[24]);
+  br[24] = vaddq_s32(ar[27], ar[24]);
+  bl[25] = vaddq_s32(al[26], al[25]);
+  br[25] = vaddq_s32(ar[26], ar[25]);
+  bl[26] = vsubq_s32(al[25], al[26]);
+  br[26] = vsubq_s32(ar[25], ar[26]);
+  bl[27] = vsubq_s32(al[24], al[27]);
+  br[27] = vsubq_s32(ar[24], ar[27]);
+  bl[28] = vsubq_s32(al[31], al[28]);
+  br[28] = vsubq_s32(ar[31], ar[28]);
+  bl[29] = vsubq_s32(al[30], al[29]);
+  br[29] = vsubq_s32(ar[30], ar[29]);
+  bl[30] = vaddq_s32(al[29], al[30]);
+  br[30] = vaddq_s32(ar[29], ar[30]);
+  bl[31] = vaddq_s32(al[28], al[31]);
+  br[31] = vaddq_s32(ar[28], ar[31]);
+
+  // Stage 6.
+  al[0] = bl[0];
+  ar[0] = br[0];
+  al[1] = bl[1];
+  ar[1] = br[1];
+  al[2] = bl[2];
+  ar[2] = br[2];
+  al[3] = bl[3];
+  ar[3] = br[3];
+
+  butterfly_two_coeff_s32(bl[7], br[7], bl[4], br[4], cospi_4_64, cospi_28_64,
+                          &al[4], &ar[4], &al[7], &ar[7]);
+  butterfly_two_coeff_s32(bl[6], br[6], bl[5], br[5], cospi_20_64, cospi_12_64,
+                          &al[5], &ar[5], &al[6], &ar[6]);
+
+  al[8] = vaddq_s32(bl[8], bl[9]);
+  ar[8] = vaddq_s32(br[8], br[9]);
+  al[9] = vsubq_s32(bl[8], bl[9]);
+  ar[9] = vsubq_s32(br[8], br[9]);
+  al[10] = vsubq_s32(bl[11], bl[10]);
+  ar[10] = vsubq_s32(br[11], br[10]);
+  al[11] = vaddq_s32(bl[11], bl[10]);
+  ar[11] = vaddq_s32(br[11], br[10]);
+  al[12] = vaddq_s32(bl[12], bl[13]);
+  ar[12] = vaddq_s32(br[12], br[13]);
+  al[13] = vsubq_s32(bl[12], bl[13]);
+  ar[13] = vsubq_s32(br[12], br[13]);
+  al[14] = vsubq_s32(bl[15], bl[14]);
+  ar[14] = vsubq_s32(br[15], br[14]);
+  al[15] = vaddq_s32(bl[15], bl[14]);
+  ar[15] = vaddq_s32(br[15], br[14]);
+
+  al[16] = bl[16];
+  ar[16] = br[16];
+  al[19] = bl[19];
+  ar[19] = br[19];
+  al[20] = bl[20];
+  ar[20] = br[20];
+  al[23] = bl[23];
+  ar[23] = br[23];
+  al[24] = bl[24];
+  ar[24] = br[24];
+  al[27] = bl[27];
+  ar[27] = br[27];
+  al[28] = bl[28];
+  ar[28] = br[28];
+  al[31] = bl[31];
+  ar[31] = br[31];
+
+  butterfly_two_coeff_s32(bl[30], br[30], bl[17], br[17], cospi_4_64,
+                          cospi_28_64, &al[30], &ar[30], &al[17], &ar[17]);
+  butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_28_64,
+                          -cospi_4_64, &al[29], &ar[29], &al[18], &ar[18]);
+  butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_20_64,
+                          cospi_12_64, &al[26], &ar[26], &al[21], &ar[21]);
+  butterfly_two_coeff_s32(bl[25], br[25], bl[22], br[22], cospi_12_64,
+                          -cospi_20_64, &al[25], &ar[25], &al[22], &ar[22]);
+
+  // Stage 7.
+  bl[0] = al[0];
+  br[0] = ar[0];
+  bl[1] = al[1];
+  br[1] = ar[1];
+  bl[2] = al[2];
+  br[2] = ar[2];
+  bl[3] = al[3];
+  br[3] = ar[3];
+  bl[4] = al[4];
+  br[4] = ar[4];
+  bl[5] = al[5];
+  br[5] = ar[5];
+  bl[6] = al[6];
+  br[6] = ar[6];
+  bl[7] = al[7];
+  br[7] = ar[7];
+
+  butterfly_two_coeff_s32(al[15], ar[15], al[8], ar[8], cospi_2_64, cospi_30_64,
+                          &bl[8], &br[8], &bl[15], &br[15]);
+  butterfly_two_coeff_s32(al[14], ar[14], al[9], ar[9], cospi_18_64,
+                          cospi_14_64, &bl[9], &br[9], &bl[14], &br[14]);
+  butterfly_two_coeff_s32(al[13], ar[13], al[10], ar[10], cospi_10_64,
+                          cospi_22_64, &bl[10], &br[10], &bl[13], &br[13]);
+  butterfly_two_coeff_s32(al[12], ar[12], al[11], ar[11], cospi_26_64,
+                          cospi_6_64, &bl[11], &br[11], &bl[12], &br[12]);
+
+  bl[16] = vaddq_s32(al[16], al[17]);
+  br[16] = vaddq_s32(ar[16], ar[17]);
+  bl[17] = vsubq_s32(al[16], al[17]);
+  br[17] = vsubq_s32(ar[16], ar[17]);
+  bl[18] = vsubq_s32(al[19], al[18]);
+  br[18] = vsubq_s32(ar[19], ar[18]);
+  bl[19] = vaddq_s32(al[19], al[18]);
+  br[19] = vaddq_s32(ar[19], ar[18]);
+  bl[20] = vaddq_s32(al[20], al[21]);
+  br[20] = vaddq_s32(ar[20], ar[21]);
+  bl[21] = vsubq_s32(al[20], al[21]);
+  br[21] = vsubq_s32(ar[20], ar[21]);
+  bl[22] = vsubq_s32(al[23], al[22]);
+  br[22] = vsubq_s32(ar[23], ar[22]);
+  bl[23] = vaddq_s32(al[23], al[22]);
+  br[23] = vaddq_s32(ar[23], ar[22]);
+  bl[24] = vaddq_s32(al[24], al[25]);
+  br[24] = vaddq_s32(ar[24], ar[25]);
+  bl[25] = vsubq_s32(al[24], al[25]);
+  br[25] = vsubq_s32(ar[24], ar[25]);
+  bl[26] = vsubq_s32(al[27], al[26]);
+  br[26] = vsubq_s32(ar[27], ar[26]);
+  bl[27] = vaddq_s32(al[27], al[26]);
+  br[27] = vaddq_s32(ar[27], ar[26]);
+  bl[28] = vaddq_s32(al[28], al[29]);
+  br[28] = vaddq_s32(ar[28], ar[29]);
+  bl[29] = vsubq_s32(al[28], al[29]);
+  br[29] = vsubq_s32(ar[28], ar[29]);
+  bl[30] = vsubq_s32(al[31], al[30]);
+  br[30] = vsubq_s32(ar[31], ar[30]);
+  bl[31] = vaddq_s32(al[31], al[30]);
+  br[31] = vaddq_s32(ar[31], ar[30]);
+
+  // Final stage.
+  left[0] = bl[0];
+  right[0] = br[0];
+  left[16] = bl[1];
+  right[16] = br[1];
+  left[8] = bl[2];
+  right[8] = br[2];
+  left[24] = bl[3];
+  right[24] = br[3];
+  left[4] = bl[4];
+  right[4] = br[4];
+  left[20] = bl[5];
+  right[20] = br[5];
+  left[12] = bl[6];
+  right[12] = br[6];
+  left[28] = bl[7];
+  right[28] = br[7];
+  left[2] = bl[8];
+  right[2] = br[8];
+  left[18] = bl[9];
+  right[18] = br[9];
+  left[10] = bl[10];
+  right[10] = br[10];
+  left[26] = bl[11];
+  right[26] = br[11];
+  left[6] = bl[12];
+  right[6] = br[12];
+  left[22] = bl[13];
+  right[22] = br[13];
+  left[14] = bl[14];
+  right[14] = br[14];
+  left[30] = bl[15];
+  right[30] = br[15];
+
+  butterfly_two_coeff_s32(bl[31], br[31], bl[16], br[16], cospi_1_64,
+                          cospi_31_64, &al[1], &ar[1], &al[31], &ar[31]);
+  left[1] = al[1];
+  right[1] = ar[1];
+  left[31] = al[31];
+  right[31] = ar[31];
+
+  butterfly_two_coeff_s32(bl[30], br[30], bl[17], br[17], cospi_17_64,
+                          cospi_15_64, &al[17], &ar[17], &al[15], &ar[15]);
+  left[17] = al[17];
+  right[17] = ar[17];
+  left[15] = al[15];
+  right[15] = ar[15];
+
+  butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_9_64,
+                          cospi_23_64, &al[9], &ar[9], &al[23], &ar[23]);
+  left[9] = al[9];
+  right[9] = ar[9];
+  left[23] = al[23];
+  right[23] = ar[23];
+
+  butterfly_two_coeff_s32(bl[28], br[28], bl[19], br[19], cospi_25_64,
+                          cospi_7_64, &al[25], &ar[25], &al[7], &ar[7]);
+  left[25] = al[25];
+  right[25] = ar[25];
+  left[7] = al[7];
+  right[7] = ar[7];
+
+  butterfly_two_coeff_s32(bl[27], br[27], bl[20], br[20], cospi_5_64,
+                          cospi_27_64, &al[5], &ar[5], &al[27], &ar[27]);
+  left[5] = al[5];
+  right[5] = ar[5];
+  left[27] = al[27];
+  right[27] = ar[27];
+
+  butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_21_64,
+                          cospi_11_64, &al[21], &ar[21], &al[11], &ar[11]);
+  left[21] = al[21];
+  right[21] = ar[21];
+  left[11] = al[11];
+  right[11] = ar[11];
+
+  butterfly_two_coeff_s32(bl[25], br[25], bl[22], br[22], cospi_13_64,
+                          cospi_19_64, &al[13], &ar[13], &al[19], &ar[19]);
+  left[13] = al[13];
+  right[13] = ar[13];
+  left[19] = al[19];
+  right[19] = ar[19];
+
+  butterfly_two_coeff_s32(bl[24], br[24], bl[23], br[23], cospi_29_64,
+                          cospi_3_64, &al[29], &ar[29], &al[3], &ar[3]);
+  left[29] = al[29];
+  right[29] = ar[29];
+  left[3] = al[3];
+  right[3] = ar[3];
+}
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#endif  // VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_
diff --git a/vpx_dsp/arm/fdct4x4_neon.c b/vpx_dsp/arm/fdct4x4_neon.c
new file mode 100644
index 0000000000..4bc968ecba
--- /dev/null
+++ b/vpx_dsp/arm/fdct4x4_neon.c
@@ -0,0 +1,85 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct4x4_neon.h"
+
+void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
+                      int stride) {
+  // input[M * stride] * 16
+  int16x4_t in[4];
+  in[0] = vshl_n_s16(vld1_s16(input + 0 * stride), 4);
+  in[1] = vshl_n_s16(vld1_s16(input + 1 * stride), 4);
+  in[2] = vshl_n_s16(vld1_s16(input + 2 * stride), 4);
+  in[3] = vshl_n_s16(vld1_s16(input + 3 * stride), 4);
+
+  // If the very first value != 0, then add 1.
+  if (input[0] != 0) {
+    const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1));
+    in[0] = vadd_s16(in[0], one);
+  }
+  vpx_fdct4x4_pass1_neon(in);
+  vpx_fdct4x4_pass2_neon(in);
+  {
+    // Not quite a rounding shift. Only add 1 despite shifting by 2.
+    const int16x8_t one = vdupq_n_s16(1);
+    int16x8_t out_01 = vcombine_s16(in[0], in[1]);
+    int16x8_t out_23 = vcombine_s16(in[2], in[3]);
+    out_01 = vshrq_n_s16(vaddq_s16(out_01, one), 2);
+    out_23 = vshrq_n_s16(vaddq_s16(out_23, one), 2);
+    store_s16q_to_tran_low(final_output + 0 * 8, out_01);
+    store_s16q_to_tran_low(final_output + 1 * 8, out_23);
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_highbd_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
+                             int stride) {
+  const int32x4_t const_one = vdupq_n_s32(1);
+
+  // input[M * stride] * 16
+  int32x4_t in[4];
+  in[0] = vshll_n_s16(vld1_s16(input + 0 * stride), 4);
+  in[1] = vshll_n_s16(vld1_s16(input + 1 * stride), 4);
+  in[2] = vshll_n_s16(vld1_s16(input + 2 * stride), 4);
+  in[3] = vshll_n_s16(vld1_s16(input + 3 * stride), 4);
+
+  // If the very first value != 0, then add 1.
+  if (input[0] != 0) {
+    static const int32_t k1000[4] = { 1, 0, 0, 0 };
+    in[0] = vaddq_s32(in[0], vld1q_s32(k1000));
+  }
+
+  vpx_highbd_fdct4x4_pass1_neon(in);
+  vpx_highbd_fdct4x4_pass1_neon(in);
+  {
+    // Not quite a rounding shift. Only add 1 despite shifting by 2.
+    in[0] = vshrq_n_s32(vaddq_s32(in[0], const_one), 2);
+    in[1] = vshrq_n_s32(vaddq_s32(in[1], const_one), 2);
+    in[2] = vshrq_n_s32(vaddq_s32(in[2], const_one), 2);
+    in[3] = vshrq_n_s32(vaddq_s32(in[3], const_one), 2);
+
+    vst1q_s32(final_output, in[0]);
+    vst1q_s32(final_output + 4, in[1]);
+    vst1q_s32(final_output + 8, in[2]);
+    vst1q_s32(final_output + 12, in[3]);
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/arm/fdct4x4_neon.h b/vpx_dsp/arm/fdct4x4_neon.h
new file mode 100644
index 0000000000..de3db9774c
--- /dev/null
+++ b/vpx_dsp/arm/fdct4x4_neon.h
@@ -0,0 +1,105 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_
+
+#include <arm_neon.h>
+
+static INLINE void vpx_fdct4x4_pass1_neon(int16x4_t *in) {
+  int16x4_t out[4];
+
+  const int16x8_t input_01 = vcombine_s16(in[0], in[1]);
+  const int16x8_t input_32 = vcombine_s16(in[3], in[2]);
+
+  // in_0 +/- in_3, in_1 +/- in_2
+  const int16x8_t s_01 = vaddq_s16(input_01, input_32);
+  const int16x8_t s_32 = vsubq_s16(input_01, input_32);
+
+  // step_0 +/- step_1, step_2 +/- step_3
+  const int16x4_t s_0 = vget_low_s16(s_01);
+  const int16x4_t s_1 = vget_high_s16(s_01);
+  const int16x4_t s_2 = vget_high_s16(s_32);
+  const int16x4_t s_3 = vget_low_s16(s_32);
+
+  // fdct_round_shift(s_0 +/- s_1) * cospi_16_64
+  butterfly_one_coeff_s16_fast_half(s_0, s_1, cospi_16_64, &out[0], &out[2]);
+
+  // s_3 * cospi_8_64 + s_2 * cospi_24_64
+  // s_3 * cospi_24_64 - s_2 * cospi_8_64
+  butterfly_two_coeff_half(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], &out[3]);
+
+  transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]);
+
+  in[0] = out[0];
+  in[1] = out[1];
+  in[2] = out[2];
+  in[3] = out[3];
+}
+
+static INLINE void vpx_fdct4x4_pass2_neon(int16x4_t *in) {
+  int16x4_t out[4];
+
+  const int16x8_t input_01 = vcombine_s16(in[0], in[1]);
+  const int16x8_t input_32 = vcombine_s16(in[3], in[2]);
+
+  // in_0 +/- in_3, in_1 +/- in_2
+  const int16x8_t s_01 = vaddq_s16(input_01, input_32);
+  const int16x8_t s_32 = vsubq_s16(input_01, input_32);
+
+  // step_0 +/- step_1, step_2 +/- step_3
+  const int16x4_t s_0 = vget_low_s16(s_01);
+  const int16x4_t s_1 = vget_high_s16(s_01);
+  const int16x4_t s_2 = vget_high_s16(s_32);
+  const int16x4_t s_3 = vget_low_s16(s_32);
+
+  // fdct_round_shift(s_0 +/- s_1) * cospi_16_64
+  butterfly_one_coeff_s16_s32_fast_narrow_half(s_0, s_1, cospi_16_64, &out[0],
+                                               &out[2]);
+
+  // s_3 * cospi_8_64 + s_2 * cospi_24_64
+  // s_3 * cospi_24_64 - s_2 * cospi_8_64
+  butterfly_two_coeff_half(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], &out[3]);
+
+  transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]);
+
+  in[0] = out[0];
+  in[1] = out[1];
+  in[2] = out[2];
+  in[3] = out[3];
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+static INLINE void vpx_highbd_fdct4x4_pass1_neon(int32x4_t *in) {
+  int32x4_t out[4];
+  // in_0 +/- in_3, in_1 +/- in_2
+  const int32x4_t s_0 = vaddq_s32(in[0], in[3]);
+  const int32x4_t s_1 = vaddq_s32(in[1], in[2]);
+  const int32x4_t s_2 = vsubq_s32(in[1], in[2]);
+  const int32x4_t s_3 = vsubq_s32(in[0], in[3]);
+
+  butterfly_one_coeff_s32_fast_half(s_0, s_1, cospi_16_64, &out[0], &out[2]);
+
+  // out[1] = s_3 * cospi_8_64 + s_2 * cospi_24_64
+  // out[3] = s_3 * cospi_24_64 - s_2 * cospi_8_64
+  butterfly_two_coeff_s32_s64_narrow_half(s_3, s_2, cospi_8_64, cospi_24_64,
+                                          &out[1], &out[3]);
+
+  transpose_s32_4x4(&out[0], &out[1], &out[2], &out[3]);
+
+  in[0] = out[0];
+  in[1] = out[1];
+  in[2] = out[2];
+  in[3] = out[3];
+}
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_
diff --git a/vpx_dsp/arm/fdct8x8_neon.c b/vpx_dsp/arm/fdct8x8_neon.c
new file mode 100644
index 0000000000..75ee6f2230
--- /dev/null
+++ b/vpx_dsp/arm/fdct8x8_neon.c
@@ -0,0 +1,143 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/fdct8x8_neon.h"
+
+void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
+                      int stride) {
+  // stage 1
+  int16x8_t in[8];
+  in[0] = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
+  in[1] = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
+  in[2] = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
+  in[3] = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
+  in[4] = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
+  in[5] = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
+  in[6] = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
+  in[7] = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
+
+  vpx_fdct8x8_pass1_neon(in);
+  vpx_fdct8x8_pass2_neon(in);
+  {
+    // from vpx_dct_sse2.c
+    // Post-condition (division by two)
+    //    division of two 16 bits signed numbers using shifts
+    //    n / 2 = (n - (n >> 15)) >> 1
+    const int16x8_t sign_in0 = vshrq_n_s16(in[0], 15);
+    const int16x8_t sign_in1 = vshrq_n_s16(in[1], 15);
+    const int16x8_t sign_in2 = vshrq_n_s16(in[2], 15);
+    const int16x8_t sign_in3 = vshrq_n_s16(in[3], 15);
+    const int16x8_t sign_in4 = vshrq_n_s16(in[4], 15);
+    const int16x8_t sign_in5 = vshrq_n_s16(in[5], 15);
+    const int16x8_t sign_in6 = vshrq_n_s16(in[6], 15);
+    const int16x8_t sign_in7 = vshrq_n_s16(in[7], 15);
+    in[0] = vhsubq_s16(in[0], sign_in0);
+    in[1] = vhsubq_s16(in[1], sign_in1);
+    in[2] = vhsubq_s16(in[2], sign_in2);
+    in[3] = vhsubq_s16(in[3], sign_in3);
+    in[4] = vhsubq_s16(in[4], sign_in4);
+    in[5] = vhsubq_s16(in[5], sign_in5);
+    in[6] = vhsubq_s16(in[6], sign_in6);
+    in[7] = vhsubq_s16(in[7], sign_in7);
+    // store results
+    store_s16q_to_tran_low(final_output + 0 * 8, in[0]);
+    store_s16q_to_tran_low(final_output + 1 * 8, in[1]);
+    store_s16q_to_tran_low(final_output + 2 * 8, in[2]);
+    store_s16q_to_tran_low(final_output + 3 * 8, in[3]);
+    store_s16q_to_tran_low(final_output + 4 * 8, in[4]);
+    store_s16q_to_tran_low(final_output + 5 * 8, in[5]);
+    store_s16q_to_tran_low(final_output + 6 * 8, in[6]);
+    store_s16q_to_tran_low(final_output + 7 * 8, in[7]);
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_highbd_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
+                             int stride) {
+  // input[M * stride] * 16
+  int32x4_t left[8], right[8];
+  int16x8_t in[8];
+  in[0] = vld1q_s16(input + 0 * stride);
+  in[1] = vld1q_s16(input + 1 * stride);
+  in[2] = vld1q_s16(input + 2 * stride);
+  in[3] = vld1q_s16(input + 3 * stride);
+  in[4] = vld1q_s16(input + 4 * stride);
+  in[5] = vld1q_s16(input + 5 * stride);
+  in[6] = vld1q_s16(input + 6 * stride);
+  in[7] = vld1q_s16(input + 7 * stride);
+
+  left[0] = vshll_n_s16(vget_low_s16(in[0]), 2);
+  left[1] = vshll_n_s16(vget_low_s16(in[1]), 2);
+  left[2] = vshll_n_s16(vget_low_s16(in[2]), 2);
+  left[3] = vshll_n_s16(vget_low_s16(in[3]), 2);
+  left[4] = vshll_n_s16(vget_low_s16(in[4]), 2);
+  left[5] = vshll_n_s16(vget_low_s16(in[5]), 2);
+  left[6] = vshll_n_s16(vget_low_s16(in[6]), 2);
+  left[7] = vshll_n_s16(vget_low_s16(in[7]), 2);
+  right[0] = vshll_n_s16(vget_high_s16(in[0]), 2);
+  right[1] = vshll_n_s16(vget_high_s16(in[1]), 2);
+  right[2] = vshll_n_s16(vget_high_s16(in[2]), 2);
+  right[3] = vshll_n_s16(vget_high_s16(in[3]), 2);
+  right[4] = vshll_n_s16(vget_high_s16(in[4]), 2);
+  right[5] = vshll_n_s16(vget_high_s16(in[5]), 2);
+  right[6] = vshll_n_s16(vget_high_s16(in[6]), 2);
+  right[7] = vshll_n_s16(vget_high_s16(in[7]), 2);
+
+  vpx_highbd_fdct8x8_pass1_neon(left, right);
+  vpx_highbd_fdct8x8_pass2_neon(left, right);
+  {
+    left[0] = add_round_shift_half_s32(left[0]);
+    left[1] = add_round_shift_half_s32(left[1]);
+    left[2] = add_round_shift_half_s32(left[2]);
+    left[3] = add_round_shift_half_s32(left[3]);
+    left[4] = add_round_shift_half_s32(left[4]);
+    left[5] = add_round_shift_half_s32(left[5]);
+    left[6] = add_round_shift_half_s32(left[6]);
+    left[7] = add_round_shift_half_s32(left[7]);
+    right[0] = add_round_shift_half_s32(right[0]);
+    right[1] = add_round_shift_half_s32(right[1]);
+    right[2] = add_round_shift_half_s32(right[2]);
+    right[3] = add_round_shift_half_s32(right[3]);
+    right[4] = add_round_shift_half_s32(right[4]);
+    right[5] = add_round_shift_half_s32(right[5]);
+    right[6] = add_round_shift_half_s32(right[6]);
+    right[7] = add_round_shift_half_s32(right[7]);
+
+    // store results
+    vst1q_s32(final_output, left[0]);
+    vst1q_s32(final_output + 4, right[0]);
+    vst1q_s32(final_output + 8, left[1]);
+    vst1q_s32(final_output + 12, right[1]);
+    vst1q_s32(final_output + 16, left[2]);
+    vst1q_s32(final_output + 20, right[2]);
+    vst1q_s32(final_output + 24, left[3]);
+    vst1q_s32(final_output + 28, right[3]);
+    vst1q_s32(final_output + 32, left[4]);
+    vst1q_s32(final_output + 36, right[4]);
+    vst1q_s32(final_output + 40, left[5]);
+    vst1q_s32(final_output + 44, right[5]);
+    vst1q_s32(final_output + 48, left[6]);
+    vst1q_s32(final_output + 52, right[6]);
+    vst1q_s32(final_output + 56, left[7]);
+    vst1q_s32(final_output + 60, right[7]);
+  }
+}
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/arm/fdct8x8_neon.h b/vpx_dsp/arm/fdct8x8_neon.h
new file mode 100644
index 0000000000..cc65157430
--- /dev/null
+++ b/vpx_dsp/arm/fdct8x8_neon.h
@@ -0,0 +1,307 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_
+
+#include <arm_neon.h>
+
+static INLINE void vpx_fdct8x8_pass1_notranspose_neon(int16x8_t *in,
+                                                      int16x8_t *out) {
+  int16x8_t s[8], x[4], t[2];
+
+  s[0] = vaddq_s16(in[0], in[7]);
+  s[1] = vaddq_s16(in[1], in[6]);
+  s[2] = vaddq_s16(in[2], in[5]);
+  s[3] = vaddq_s16(in[3], in[4]);
+  s[4] = vsubq_s16(in[3], in[4]);
+  s[5] = vsubq_s16(in[2], in[5]);
+  s[6] = vsubq_s16(in[1], in[6]);
+  s[7] = vsubq_s16(in[0], in[7]);
+  // fdct4(step, step);
+  x[0] = vaddq_s16(s[0], s[3]);
+  x[1] = vaddq_s16(s[1], s[2]);
+  x[2] = vsubq_s16(s[1], s[2]);
+  x[3] = vsubq_s16(s[0], s[3]);
+
+  // fdct4(step, step);
+  // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+  // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+  butterfly_one_coeff_s16_fast(x[0], x[1], cospi_16_64, &out[0], &out[4]);
+  // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+  // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+  butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[2], &out[6]);
+
+  // Stage 2
+  // t0 = (s6 - s5) * cospi_16_64;
+  // t1 = (s6 + s5) * cospi_16_64;
+  butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &t[1], &t[0]);
+
+  // Stage 3
+  x[0] = vaddq_s16(s[4], t[0]);
+  x[1] = vsubq_s16(s[4], t[0]);
+  x[2] = vsubq_s16(s[7], t[1]);
+  x[3] = vaddq_s16(s[7], t[1]);
+
+  // Stage 4
+  // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+  // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+  butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[1], &out[7]);
+
+  // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+  // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+  butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[5], &out[3]);
+}
+
+static INLINE void vpx_fdct8x8_pass2_notranspose_neon(int16x8_t *in,
+                                                      int16x8_t *out) {
+  int16x8_t s[8], x[4], t[2];
+
+  s[0] = vaddq_s16(in[0], in[7]);
+  s[1] = vaddq_s16(in[1], in[6]);
+  s[2] = vaddq_s16(in[2], in[5]);
+  s[3] = vaddq_s16(in[3], in[4]);
+  s[4] = vsubq_s16(in[3], in[4]);
+  s[5] = vsubq_s16(in[2], in[5]);
+  s[6] = vsubq_s16(in[1], in[6]);
+  s[7] = vsubq_s16(in[0], in[7]);
+  // fdct4(step, step);
+  x[0] = vaddq_s16(s[0], s[3]);
+  x[1] = vaddq_s16(s[1], s[2]);
+  x[2] = vsubq_s16(s[1], s[2]);
+  x[3] = vsubq_s16(s[0], s[3]);
+
+  // fdct4(step, step);
+  // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+  // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+  butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0],
+                                          &out[4]);
+  // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+  // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+  butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[2], &out[6]);
+
+  // Stage 2
+  // t0 = (s6 - s5) * cospi_16_64;
+  // t1 = (s6 + s5) * cospi_16_64;
+  butterfly_one_coeff_s16_s32_fast_narrow(s[6], s[5], cospi_16_64, &t[1],
+                                          &t[0]);
+
+  // Stage 3
+  x[0] = vaddq_s16(s[4], t[0]);
+  x[1] = vsubq_s16(s[4], t[0]);
+  x[2] = vsubq_s16(s[7], t[1]);
+  x[3] = vaddq_s16(s[7], t[1]);
+
+  // Stage 4
+  // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+  // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+  butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[1], &out[7]);
+
+  // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+  // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+  butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[5], &out[3]);
+}
+
+static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) {
+  int16x8_t out[8];
+  vpx_fdct8x8_pass1_notranspose_neon(in, out);
+  // transpose 8x8
+  transpose_s16_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+                    &out[6], &out[7]);
+  in[0] = out[0];
+  in[1] = out[1];
+  in[2] = out[2];
+  in[3] = out[3];
+  in[4] = out[4];
+  in[5] = out[5];
+  in[6] = out[6];
+  in[7] = out[7];
+}
+
+static INLINE void vpx_fdct8x8_pass2_neon(int16x8_t *in) {
+  int16x8_t out[8];
+  vpx_fdct8x8_pass2_notranspose_neon(in, out);
+  // transpose 8x8
+  transpose_s16_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+                    &out[6], &out[7]);
+  in[0] = out[0];
+  in[1] = out[1];
+  in[2] = out[2];
+  in[3] = out[3];
+  in[4] = out[4];
+  in[5] = out[5];
+  in[6] = out[6];
+  in[7] = out[7];
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void vpx_highbd_fdct8x8_pass1_notranspose_neon(int32x4_t *left,
+                                                             int32x4_t *right) {
+  int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4];
+
+  sl[0] = vaddq_s32(left[0], left[7]);
+  sl[1] = vaddq_s32(left[1], left[6]);
+  sl[2] = vaddq_s32(left[2], left[5]);
+  sl[3] = vaddq_s32(left[3], left[4]);
+  sl[4] = vsubq_s32(left[3], left[4]);
+  sl[5] = vsubq_s32(left[2], left[5]);
+  sl[6] = vsubq_s32(left[1], left[6]);
+  sl[7] = vsubq_s32(left[0], left[7]);
+  sr[0] = vaddq_s32(right[0], right[7]);
+  sr[1] = vaddq_s32(right[1], right[6]);
+  sr[2] = vaddq_s32(right[2], right[5]);
+  sr[3] = vaddq_s32(right[3], right[4]);
+  sr[4] = vsubq_s32(right[3], right[4]);
+  sr[5] = vsubq_s32(right[2], right[5]);
+  sr[6] = vsubq_s32(right[1], right[6]);
+  sr[7] = vsubq_s32(right[0], right[7]);
+
+  // fdct4(step, step);
+  // x0 = s0 + s3;
+  xl[0] = vaddq_s32(sl[0], sl[3]);
+  xr[0] = vaddq_s32(sr[0], sr[3]);
+  // x1 = s1 + s2;
+  xl[1] = vaddq_s32(sl[1], sl[2]);
+  xr[1] = vaddq_s32(sr[1], sr[2]);
+  // x2 = s1 - s2;
+  xl[2] = vsubq_s32(sl[1], sl[2]);
+  xr[2] = vsubq_s32(sr[1], sr[2]);
+  // x3 = s0 - s3;
+  xl[3] = vsubq_s32(sl[0], sl[3]);
+  xr[3] = vsubq_s32(sr[0], sr[3]);
+
+  // fdct4(step, step);
+  // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+  // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+  butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64,
+                               &left[0], &right[0], &left[4], &right[4]);
+  // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+  // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+  butterfly_two_coeff_s32(xl[3], xr[3], xl[2], xr[2], cospi_8_64, cospi_24_64,
+                          &left[2], &right[2], &left[6], &right[6]);
+
+  // Stage 2
+  // t0 = (s6 - s5) * cospi_16_64;
+  // t1 = (s6 + s5) * cospi_16_64;
+  butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1],
+                               &tr[1], &tl[0], &tr[0]);
+
+  // Stage 3
+  xl[0] = vaddq_s32(sl[4], tl[0]);
+  xr[0] = vaddq_s32(sr[4], tr[0]);
+  xl[1] = vsubq_s32(sl[4], tl[0]);
+  xr[1] = vsubq_s32(sr[4], tr[0]);
+  xl[2] = vsubq_s32(sl[7], tl[1]);
+  xr[2] = vsubq_s32(sr[7], tr[1]);
+  xl[3] = vaddq_s32(sl[7], tl[1]);
+  xr[3] = vaddq_s32(sr[7], tr[1]);
+
+  // Stage 4
+  // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+  // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+  butterfly_two_coeff_s32(xl[3], xr[3], xl[0], xr[0], cospi_4_64, cospi_28_64,
+                          &left[1], &right[1], &left[7], &right[7]);
+
+  // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+  // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+  butterfly_two_coeff_s32(xl[2], xr[2], xl[1], xr[1], cospi_20_64, cospi_12_64,
+                          &left[5], &right[5], &left[3], &right[3]);
+}
+
+static INLINE void vpx_highbd_fdct8x8_pass2_notranspose_neon(int32x4_t *left,
+                                                             int32x4_t *right) {
+  int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4];
+
+  sl[0] = vaddq_s32(left[0], left[7]);
+  sl[1] = vaddq_s32(left[1], left[6]);
+  sl[2] = vaddq_s32(left[2], left[5]);
+  sl[3] = vaddq_s32(left[3], left[4]);
+  sl[4] = vsubq_s32(left[3], left[4]);
+  sl[5] = vsubq_s32(left[2], left[5]);
+  sl[6] = vsubq_s32(left[1], left[6]);
+  sl[7] = vsubq_s32(left[0], left[7]);
+  sr[0] = vaddq_s32(right[0], right[7]);
+  sr[1] = vaddq_s32(right[1], right[6]);
+  sr[2] = vaddq_s32(right[2], right[5]);
+  sr[3] = vaddq_s32(right[3], right[4]);
+  sr[4] = vsubq_s32(right[3], right[4]);
+  sr[5] = vsubq_s32(right[2], right[5]);
+  sr[6] = vsubq_s32(right[1], right[6]);
+  sr[7] = vsubq_s32(right[0], right[7]);
+
+  // fdct4(step, step);
+  // x0 = s0 + s3;
+  xl[0] = vaddq_s32(sl[0], sl[3]);
+  xr[0] = vaddq_s32(sr[0], sr[3]);
+  // x1 = s1 + s2;
+  xl[1] = vaddq_s32(sl[1], sl[2]);
+  xr[1] = vaddq_s32(sr[1], sr[2]);
+  // x2 = s1 - s2;
+  xl[2] = vsubq_s32(sl[1], sl[2]);
+  xr[2] = vsubq_s32(sr[1], sr[2]);
+  // x3 = s0 - s3;
+  xl[3] = vsubq_s32(sl[0], sl[3]);
+  xr[3] = vsubq_s32(sr[0], sr[3]);
+
+  // fdct4(step, step);
+  // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+  // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+  butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64,
+                               &left[0], &right[0], &left[4], &right[4]);
+  // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+  // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+  butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64,
+                                     cospi_24_64, &left[2], &right[2], &left[6],
+                                     &right[6]);
+
+  // Stage 2
+  // t0 = (s6 - s5) * cospi_16_64;
+  // t1 = (s6 + s5) * cospi_16_64;
+  butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1],
+                               &tr[1], &tl[0], &tr[0]);
+
+  // Stage 3
+  xl[0] = vaddq_s32(sl[4], tl[0]);
+  xr[0] = vaddq_s32(sr[4], tr[0]);
+  xl[1] = vsubq_s32(sl[4], tl[0]);
+  xr[1] = vsubq_s32(sr[4], tr[0]);
+  xl[2] = vsubq_s32(sl[7], tl[1]);
+  xr[2] = vsubq_s32(sr[7], tr[1]);
+  xl[3] = vaddq_s32(sl[7], tl[1]);
+  xr[3] = vaddq_s32(sr[7], tr[1]);
+
+  // Stage 4
+  // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+  // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+  butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64,
+                                     cospi_28_64, &left[1], &right[1], &left[7],
+                                     &right[7]);
+
+  // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+  // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+  butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64,
+                                     cospi_12_64, &left[5], &right[5], &left[3],
+                                     &right[3]);
+}
+
+static INLINE void vpx_highbd_fdct8x8_pass1_neon(int32x4_t *left,
+                                                 int32x4_t *right) {
+  vpx_highbd_fdct8x8_pass1_notranspose_neon(left, right);
+  transpose_s32_8x8_2(left, right, left, right);
+}
+
+static INLINE void vpx_highbd_fdct8x8_pass2_neon(int32x4_t *left,
+                                                 int32x4_t *right) {
+  vpx_highbd_fdct8x8_pass2_notranspose_neon(left, right);
+  transpose_s32_8x8_2(left, right, left, right);
+}
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_
diff --git a/vpx_dsp/arm/fdct_neon.c b/vpx_dsp/arm/fdct_neon.c
deleted file mode 100644
index 3708cbb11f..0000000000
--- a/vpx_dsp/arm/fdct_neon.c
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#include "./vpx_config.h"
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_dsp/txfm_common.h"
-#include "vpx_dsp/vpx_dsp_common.h"
-#include "vpx_dsp/arm/idct_neon.h"
-#include "vpx_dsp/arm/mem_neon.h"
-#include "vpx_dsp/arm/transpose_neon.h"
-
-void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
-                      int stride) {
-  int i;
-  // input[M * stride] * 16
-  int16x4_t input_0 = vshl_n_s16(vld1_s16(input + 0 * stride), 4);
-  int16x4_t input_1 = vshl_n_s16(vld1_s16(input + 1 * stride), 4);
-  int16x4_t input_2 = vshl_n_s16(vld1_s16(input + 2 * stride), 4);
-  int16x4_t input_3 = vshl_n_s16(vld1_s16(input + 3 * stride), 4);
-
-  // If the very first value != 0, then add 1.
-  if (input[0] != 0) {
-    const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1));
-    input_0 = vadd_s16(input_0, one);
-  }
-
-  for (i = 0; i < 2; ++i) {
-    const int16x8_t input_01 = vcombine_s16(input_0, input_1);
-    const int16x8_t input_32 = vcombine_s16(input_3, input_2);
-
-    // in_0 +/- in_3, in_1 +/- in_2
-    const int16x8_t s_01 = vaddq_s16(input_01, input_32);
-    const int16x8_t s_32 = vsubq_s16(input_01, input_32);
-
-    // step_0 +/- step_1, step_2 +/- step_3
-    const int16x4_t s_0 = vget_low_s16(s_01);
-    const int16x4_t s_1 = vget_high_s16(s_01);
-    const int16x4_t s_2 = vget_high_s16(s_32);
-    const int16x4_t s_3 = vget_low_s16(s_32);
-
-    // (s_0 +/- s_1) * cospi_16_64
-    // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
-    const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1);
-    const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1);
-    const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64);
-    const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64);
-
-    // fdct_round_shift
-    int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS);
-    int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS);
-
-    // s_3 * cospi_8_64 + s_2 * cospi_24_64
-    // s_3 * cospi_24_64 - s_2 * cospi_8_64
-    const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64);
-    const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64);
-
-    const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64);
-    const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64);
-
-    // fdct_round_shift
-    int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS);
-    int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS);
-
-    transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3);
-
-    input_0 = out_0;
-    input_1 = out_1;
-    input_2 = out_2;
-    input_3 = out_3;
-  }
-
-  {
-    // Not quite a rounding shift. Only add 1 despite shifting by 2.
-    const int16x8_t one = vdupq_n_s16(1);
-    int16x8_t out_01 = vcombine_s16(input_0, input_1);
-    int16x8_t out_23 = vcombine_s16(input_2, input_3);
-    out_01 = vshrq_n_s16(vaddq_s16(out_01, one), 2);
-    out_23 = vshrq_n_s16(vaddq_s16(out_23, one), 2);
-    store_s16q_to_tran_low(final_output + 0 * 8, out_01);
-    store_s16q_to_tran_low(final_output + 1 * 8, out_23);
-  }
-}
diff --git a/vpx_dsp/arm/fdct_neon.h b/vpx_dsp/arm/fdct_neon.h
new file mode 100644
index 0000000000..16f5c5fc0e
--- /dev/null
+++ b/vpx_dsp/arm/fdct_neon.h
@@ -0,0 +1,542 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT_NEON_H_
+
+#include <arm_neon.h>
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulh_s16 operation on half vector
+// can be slightly less accurate, adequate for pass1
+static INLINE void butterfly_one_coeff_s16_fast_half(const int16x4_t a,
+                                                     const int16x4_t b,
+                                                     const tran_coef_t constant,
+                                                     int16x4_t *add,
+                                                     int16x4_t *sub) {
+  int16x4_t c = vdup_n_s16(2 * constant);
+  *add = vqrdmulh_s16(vadd_s16(a, b), c);
+  *sub = vqrdmulh_s16(vsub_s16(a, b), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulh_s16 operation on full vector
+// can be slightly less accurate, adequate for pass1
+static INLINE void butterfly_one_coeff_s16_fast(const int16x8_t a,
+                                                const int16x8_t b,
+                                                const tran_coef_t constant,
+                                                int16x8_t *add,
+                                                int16x8_t *sub) {
+  int16x8_t c = vdupq_n_s16(2 * constant);
+  *add = vqrdmulhq_s16(vaddq_s16(a, b), c);
+  *sub = vqrdmulhq_s16(vsubq_s16(a, b), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on full vector
+// more accurate does 32-bit processing, takes 16-bit input values,
+// returns full 32-bit values, high/low
+static INLINE void butterfly_one_coeff_s16_s32_fast(
+    const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+    int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
+    int32x4_t *sub_hi) {
+  int32x4_t c = vdupq_n_s32(constant << 17);
+  const int16x4_t a_lo = vget_low_s16(a);
+  const int16x4_t a_hi = vget_high_s16(a);
+  const int16x4_t b_lo = vget_low_s16(b);
+  const int16x4_t b_hi = vget_high_s16(b);
+  *add_lo = vqrdmulhq_s32(vaddl_s16(a_lo, b_lo), c);
+  *add_hi = vqrdmulhq_s32(vaddl_s16(a_hi, b_hi), c);
+  *sub_lo = vqrdmulhq_s32(vsubl_s16(a_lo, b_lo), c);
+  *sub_hi = vqrdmulhq_s32(vsubl_s16(a_hi, b_hi), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on full vector
+// more accurate does 32-bit processing, takes 16-bit input values,
+// returns full 32-bit values, high/low
+static INLINE void butterfly_one_coeff_s16_s32_fast_narrow(
+    const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+    int16x8_t *add, int16x8_t *sub) {
+  int32x4_t add_lo, add_hi, sub_lo, sub_hi;
+  butterfly_one_coeff_s16_s32_fast(a, b, constant, &add_lo, &add_hi, &sub_lo,
+                                   &sub_hi);
+  *add = vcombine_s16(vmovn_s32(add_lo), vmovn_s32(add_hi));
+  *sub = vcombine_s16(vmovn_s32(sub_lo), vmovn_s32(sub_hi));
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on full vector
+// more accurate does 32-bit processing, takes 16-bit input values,
+// returns full 32-bit values, high/low
+static INLINE void butterfly_one_coeff_s16_s32_fast_half(
+    const int16x4_t a, const int16x4_t b, const tran_coef_t constant,
+    int32x4_t *add, int32x4_t *sub) {
+  int32x4_t c = vdupq_n_s32(constant << 17);
+  *add = vqrdmulhq_s32(vaddl_s16(a, b), c);
+  *sub = vqrdmulhq_s32(vsubl_s16(a, b), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on half vector
+// more accurate does 32-bit processing, takes 16-bit input values,
+// returns narrowed down 16-bit values
+static INLINE void butterfly_one_coeff_s16_s32_fast_narrow_half(
+    const int16x4_t a, const int16x4_t b, const tran_coef_t constant,
+    int16x4_t *add, int16x4_t *sub) {
+  int32x4_t add32, sub32;
+  butterfly_one_coeff_s16_s32_fast_half(a, b, constant, &add32, &sub32);
+  *add = vmovn_s32(add32);
+  *sub = vmovn_s32(sub32);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Original Variant that performs normal implementation on full vector
+// fully accurate does 32-bit processing, takes 16-bit values
+static INLINE void butterfly_one_coeff_s16_s32(
+    const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+    int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
+    int32x4_t *sub_hi) {
+  const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant);
+  const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant);
+  const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant);
+  const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant);
+  const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant);
+  const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant);
+  *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
+  *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
+  *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
+  *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Original Variant that performs normal implementation on full vector
+// fully accurate does 32-bit processing, takes 16-bit values
+// returns narrowed down 16-bit values
+static INLINE void butterfly_one_coeff_s16_s32_narrow(
+    const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+    int16x8_t *add, int16x8_t *sub) {
+  int32x4_t add32_lo, add32_hi, sub32_lo, sub32_hi;
+  butterfly_one_coeff_s16_s32(a, b, constant, &add32_lo, &add32_hi, &sub32_lo,
+                              &sub32_hi);
+  *add = vcombine_s16(vmovn_s32(add32_lo), vmovn_s32(add32_hi));
+  *sub = vcombine_s16(vmovn_s32(sub32_lo), vmovn_s32(sub32_hi));
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on full vector
+// more accurate does 32-bit processing, takes and returns 32-bit values,
+// high/low
+static INLINE void butterfly_one_coeff_s32_noround(
+    const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+    const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo,
+    int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
+  const int32x4_t a1 = vmulq_n_s32(a_lo, constant);
+  const int32x4_t a2 = vmulq_n_s32(a_hi, constant);
+  const int32x4_t a3 = vmulq_n_s32(a_lo, constant);
+  const int32x4_t a4 = vmulq_n_s32(a_hi, constant);
+  *add_lo = vmlaq_n_s32(a1, b_lo, constant);
+  *add_hi = vmlaq_n_s32(a2, b_hi, constant);
+  *sub_lo = vmlsq_n_s32(a3, b_lo, constant);
+  *sub_hi = vmlsq_n_s32(a4, b_hi, constant);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on full vector
+// more accurate does 32-bit processing, takes and returns 32-bit values,
+// high/low
+static INLINE void butterfly_one_coeff_s32_fast_half(const int32x4_t a,
+                                                     const int32x4_t b,
+                                                     const tran_coef_t constant,
+                                                     int32x4_t *add,
+                                                     int32x4_t *sub) {
+  const int32x4_t c = vdupq_n_s32(constant << 17);
+  *add = vqrdmulhq_s32(vaddq_s32(a, b), c);
+  *sub = vqrdmulhq_s32(vsubq_s32(a, b), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on full vector
+// more accurate does 32-bit processing, takes and returns 32-bit values,
+// high/low
+static INLINE void butterfly_one_coeff_s32_fast(
+    const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+    const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo,
+    int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
+  const int32x4_t c = vdupq_n_s32(constant << 17);
+  *add_lo = vqrdmulhq_s32(vaddq_s32(a_lo, b_lo), c);
+  *add_hi = vqrdmulhq_s32(vaddq_s32(a_hi, b_hi), c);
+  *sub_lo = vqrdmulhq_s32(vsubq_s32(a_lo, b_lo), c);
+  *sub_hi = vqrdmulhq_s32(vsubq_s32(a_hi, b_hi), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs normal implementation on full vector
+// more accurate does 64-bit processing, takes and returns 32-bit values
+// returns narrowed results
+static INLINE void butterfly_one_coeff_s32_s64_narrow(
+    const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+    const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo,
+    int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
+  // ac holds the following values:
+  // ac: vget_low_s32(a_lo) * c, vget_high_s32(a_lo) * c,
+  //     vget_low_s32(a_hi) * c, vget_high_s32(a_hi) * c
+  int64x2_t ac[4];
+  int64x2_t sum[4];
+  int64x2_t diff[4];
+
+  ac[0] = vmull_n_s32(vget_low_s32(a_lo), constant);
+  ac[1] = vmull_n_s32(vget_high_s32(a_lo), constant);
+  ac[2] = vmull_n_s32(vget_low_s32(a_hi), constant);
+  ac[3] = vmull_n_s32(vget_high_s32(a_hi), constant);
+
+  sum[0] = vmlal_n_s32(ac[0], vget_low_s32(b_lo), constant);
+  sum[1] = vmlal_n_s32(ac[1], vget_high_s32(b_lo), constant);
+  sum[2] = vmlal_n_s32(ac[2], vget_low_s32(b_hi), constant);
+  sum[3] = vmlal_n_s32(ac[3], vget_high_s32(b_hi), constant);
+  *add_lo = vcombine_s32(vrshrn_n_s64(sum[0], DCT_CONST_BITS),
+                         vrshrn_n_s64(sum[1], DCT_CONST_BITS));
+  *add_hi = vcombine_s32(vrshrn_n_s64(sum[2], DCT_CONST_BITS),
+                         vrshrn_n_s64(sum[3], DCT_CONST_BITS));
+
+  diff[0] = vmlsl_n_s32(ac[0], vget_low_s32(b_lo), constant);
+  diff[1] = vmlsl_n_s32(ac[1], vget_high_s32(b_lo), constant);
+  diff[2] = vmlsl_n_s32(ac[2], vget_low_s32(b_hi), constant);
+  diff[3] = vmlsl_n_s32(ac[3], vget_high_s32(b_hi), constant);
+  *sub_lo = vcombine_s32(vrshrn_n_s64(diff[0], DCT_CONST_BITS),
+                         vrshrn_n_s64(diff[1], DCT_CONST_BITS));
+  *sub_hi = vcombine_s32(vrshrn_n_s64(diff[2], DCT_CONST_BITS),
+                         vrshrn_n_s64(diff[3], DCT_CONST_BITS));
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Variant that performs normal implementation on half vector
+// more accurate does 64-bit processing, takes and returns 32-bit values
+// returns narrowed results
+static INLINE void butterfly_two_coeff_s32_s64_narrow_half(
+    const int32x4_t a, const int32x4_t b, const tran_coef_t constant1,
+    const tran_coef_t constant2, int32x4_t *add, int32x4_t *sub) {
+  const int32x2_t a_lo = vget_low_s32(a);
+  const int32x2_t a_hi = vget_high_s32(a);
+  const int32x2_t b_lo = vget_low_s32(b);
+  const int32x2_t b_hi = vget_high_s32(b);
+
+  const int64x2_t axc0_64_lo = vmull_n_s32(a_lo, constant1);
+  const int64x2_t axc0_64_hi = vmull_n_s32(a_hi, constant1);
+  const int64x2_t axc1_64_lo = vmull_n_s32(a_lo, constant2);
+  const int64x2_t axc1_64_hi = vmull_n_s32(a_hi, constant2);
+
+  const int64x2_t sum_lo = vmlal_n_s32(axc0_64_lo, b_lo, constant2);
+  const int64x2_t sum_hi = vmlal_n_s32(axc0_64_hi, b_hi, constant2);
+  const int64x2_t diff_lo = vmlsl_n_s32(axc1_64_lo, b_lo, constant1);
+  const int64x2_t diff_hi = vmlsl_n_s32(axc1_64_hi, b_hi, constant1);
+
+  *add = vcombine_s32(vrshrn_n_s64(sum_lo, DCT_CONST_BITS),
+                      vrshrn_n_s64(sum_hi, DCT_CONST_BITS));
+  *sub = vcombine_s32(vrshrn_n_s64(diff_lo, DCT_CONST_BITS),
+                      vrshrn_n_s64(diff_hi, DCT_CONST_BITS));
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Variant that performs normal implementation on full vector
+// more accurate does 64-bit processing, takes and returns 64-bit values
+// returns results without rounding
+static INLINE void butterfly_two_coeff_s32_s64_noround(
+    const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+    const int32x4_t b_hi, const tran_coef_t constant1,
+    const tran_coef_t constant2, int64x2_t *add_lo /*[2]*/,
+    int64x2_t *add_hi /*[2]*/, int64x2_t *sub_lo /*[2]*/,
+    int64x2_t *sub_hi /*[2]*/) {
+  // ac1/ac2 hold the following values:
+  // ac1: vget_low_s32(a_lo) * c1, vget_high_s32(a_lo) * c1,
+  //      vget_low_s32(a_hi) * c1, vget_high_s32(a_hi) * c1
+  // ac2: vget_low_s32(a_lo) * c2, vget_high_s32(a_lo) * c2,
+  //      vget_low_s32(a_hi) * c2, vget_high_s32(a_hi) * c2
+  int64x2_t ac1[4];
+  int64x2_t ac2[4];
+
+  ac1[0] = vmull_n_s32(vget_low_s32(a_lo), constant1);
+  ac1[1] = vmull_n_s32(vget_high_s32(a_lo), constant1);
+  ac1[2] = vmull_n_s32(vget_low_s32(a_hi), constant1);
+  ac1[3] = vmull_n_s32(vget_high_s32(a_hi), constant1);
+  ac2[0] = vmull_n_s32(vget_low_s32(a_lo), constant2);
+  ac2[1] = vmull_n_s32(vget_high_s32(a_lo), constant2);
+  ac2[2] = vmull_n_s32(vget_low_s32(a_hi), constant2);
+  ac2[3] = vmull_n_s32(vget_high_s32(a_hi), constant2);
+
+  add_lo[0] = vmlal_n_s32(ac1[0], vget_low_s32(b_lo), constant2);
+  add_lo[1] = vmlal_n_s32(ac1[1], vget_high_s32(b_lo), constant2);
+  add_hi[0] = vmlal_n_s32(ac1[2], vget_low_s32(b_hi), constant2);
+  add_hi[1] = vmlal_n_s32(ac1[3], vget_high_s32(b_hi), constant2);
+
+  sub_lo[0] = vmlsl_n_s32(ac2[0], vget_low_s32(b_lo), constant1);
+  sub_lo[1] = vmlsl_n_s32(ac2[1], vget_high_s32(b_lo), constant1);
+  sub_hi[0] = vmlsl_n_s32(ac2[2], vget_low_s32(b_hi), constant1);
+  sub_hi[1] = vmlsl_n_s32(ac2[3], vget_high_s32(b_hi), constant1);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Variant that performs normal implementation on full vector
+// more accurate does 64-bit processing, takes and returns 32-bit values
+// returns narrowed results
+static INLINE void butterfly_two_coeff_s32_s64_narrow(
+    const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+    const int32x4_t b_hi, const tran_coef_t constant1,
+    const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+    int32x4_t *sub_lo, int32x4_t *sub_hi) {
+  // ac1/ac2 hold the following values:
+  // ac1: vget_low_s32(a_lo) * c1, vget_high_s32(a_lo) * c1,
+  //      vget_low_s32(a_hi) * c1, vget_high_s32(a_hi) * c1
+  // ac2: vget_low_s32(a_lo) * c2, vget_high_s32(a_lo) * c2,
+  //      vget_low_s32(a_hi) * c2, vget_high_s32(a_hi) * c2
+  int64x2_t ac1[4];
+  int64x2_t ac2[4];
+  int64x2_t sum[4];
+  int64x2_t diff[4];
+
+  ac1[0] = vmull_n_s32(vget_low_s32(a_lo), constant1);
+  ac1[1] = vmull_n_s32(vget_high_s32(a_lo), constant1);
+  ac1[2] = vmull_n_s32(vget_low_s32(a_hi), constant1);
+  ac1[3] = vmull_n_s32(vget_high_s32(a_hi), constant1);
+  ac2[0] = vmull_n_s32(vget_low_s32(a_lo), constant2);
+  ac2[1] = vmull_n_s32(vget_high_s32(a_lo), constant2);
+  ac2[2] = vmull_n_s32(vget_low_s32(a_hi), constant2);
+  ac2[3] = vmull_n_s32(vget_high_s32(a_hi), constant2);
+
+  sum[0] = vmlal_n_s32(ac1[0], vget_low_s32(b_lo), constant2);
+  sum[1] = vmlal_n_s32(ac1[1], vget_high_s32(b_lo), constant2);
+  sum[2] = vmlal_n_s32(ac1[2], vget_low_s32(b_hi), constant2);
+  sum[3] = vmlal_n_s32(ac1[3], vget_high_s32(b_hi), constant2);
+  *add_lo = vcombine_s32(vrshrn_n_s64(sum[0], DCT_CONST_BITS),
+                         vrshrn_n_s64(sum[1], DCT_CONST_BITS));
+  *add_hi = vcombine_s32(vrshrn_n_s64(sum[2], DCT_CONST_BITS),
+                         vrshrn_n_s64(sum[3], DCT_CONST_BITS));
+
+  diff[0] = vmlsl_n_s32(ac2[0], vget_low_s32(b_lo), constant1);
+  diff[1] = vmlsl_n_s32(ac2[1], vget_high_s32(b_lo), constant1);
+  diff[2] = vmlsl_n_s32(ac2[2], vget_low_s32(b_hi), constant1);
+  diff[3] = vmlsl_n_s32(ac2[3], vget_high_s32(b_hi), constant1);
+  *sub_lo = vcombine_s32(vrshrn_n_s64(diff[0], DCT_CONST_BITS),
+                         vrshrn_n_s64(diff[1], DCT_CONST_BITS));
+  *sub_hi = vcombine_s32(vrshrn_n_s64(diff[2], DCT_CONST_BITS),
+                         vrshrn_n_s64(diff[3], DCT_CONST_BITS));
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Original Variant that performs normal implementation on full vector
+// more accurate does 32-bit processing, takes and returns 32-bit values
+// returns narrowed results
+static INLINE void butterfly_two_coeff_s16_s32_noround(
+    const int16x4_t a_lo, const int16x4_t a_hi, const int16x4_t b_lo,
+    const int16x4_t b_hi, const tran_coef_t constant1,
+    const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+    int32x4_t *sub_lo, int32x4_t *sub_hi) {
+  const int32x4_t a1 = vmull_n_s16(a_lo, constant1);
+  const int32x4_t a2 = vmull_n_s16(a_hi, constant1);
+  const int32x4_t a3 = vmull_n_s16(a_lo, constant2);
+  const int32x4_t a4 = vmull_n_s16(a_hi, constant2);
+  *add_lo = vmlal_n_s16(a1, b_lo, constant2);
+  *add_hi = vmlal_n_s16(a2, b_hi, constant2);
+  *sub_lo = vmlsl_n_s16(a3, b_lo, constant1);
+  *sub_hi = vmlsl_n_s16(a4, b_hi, constant1);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Original Variant that performs normal implementation on full vector
+// more accurate does 32-bit processing, takes and returns 32-bit values
+// returns narrowed results
+static INLINE void butterfly_two_coeff_s32_noround(
+    const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+    const int32x4_t b_hi, const tran_coef_t constant1,
+    const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+    int32x4_t *sub_lo, int32x4_t *sub_hi) {
+  const int32x4_t a1 = vmulq_n_s32(a_lo, constant1);
+  const int32x4_t a2 = vmulq_n_s32(a_hi, constant1);
+  const int32x4_t a3 = vmulq_n_s32(a_lo, constant2);
+  const int32x4_t a4 = vmulq_n_s32(a_hi, constant2);
+  *add_lo = vmlaq_n_s32(a1, b_lo, constant2);
+  *add_hi = vmlaq_n_s32(a2, b_hi, constant2);
+  *sub_lo = vmlsq_n_s32(a3, b_lo, constant1);
+  *sub_hi = vmlsq_n_s32(a4, b_hi, constant1);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Variant that performs normal implementation on half vector
+// more accurate does 32-bit processing, takes and returns 16-bit values
+// returns narrowed results
+static INLINE void butterfly_two_coeff_half(const int16x4_t a,
+                                            const int16x4_t b,
+                                            const tran_coef_t constant1,
+                                            const tran_coef_t constant2,
+                                            int16x4_t *add, int16x4_t *sub) {
+  const int32x4_t a1 = vmull_n_s16(a, constant1);
+  const int32x4_t a2 = vmull_n_s16(a, constant2);
+  const int32x4_t sum = vmlal_n_s16(a1, b, constant2);
+  const int32x4_t diff = vmlsl_n_s16(a2, b, constant1);
+  *add = vqrshrn_n_s32(sum, DCT_CONST_BITS);
+  *sub = vqrshrn_n_s32(diff, DCT_CONST_BITS);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Original Variant that performs normal implementation on full vector
+// more accurate does 32-bit processing, takes and returns 16-bit values
+// returns narrowed results
+static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
+                                       const tran_coef_t constant1,
+                                       const tran_coef_t constant2,
+                                       int16x8_t *add, int16x8_t *sub) {
+  const int32x4_t a1 = vmull_n_s16(vget_low_s16(a), constant1);
+  const int32x4_t a2 = vmull_n_s16(vget_high_s16(a), constant1);
+  const int32x4_t a3 = vmull_n_s16(vget_low_s16(a), constant2);
+  const int32x4_t a4 = vmull_n_s16(vget_high_s16(a), constant2);
+  const int32x4_t sum0 = vmlal_n_s16(a1, vget_low_s16(b), constant2);
+  const int32x4_t sum1 = vmlal_n_s16(a2, vget_high_s16(b), constant2);
+  const int32x4_t diff0 = vmlsl_n_s16(a3, vget_low_s16(b), constant1);
+  const int32x4_t diff1 = vmlsl_n_s16(a4, vget_high_s16(b), constant1);
+  const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
+  const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
+  const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
+  const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
+  *add = vcombine_s16(rounded0, rounded1);
+  *sub = vcombine_s16(rounded2, rounded3);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Original Variant that performs normal implementation on full vector
+// more accurate does 32-bit processing, takes and returns 32-bit values
+// returns narrowed results
+static INLINE void butterfly_two_coeff_s32(
+    const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+    const int32x4_t b_hi, const tran_coef_t constant1,
+    const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+    int32x4_t *sub_lo, int32x4_t *sub_hi) {
+  const int32x4_t a1 = vmulq_n_s32(a_lo, constant1);
+  const int32x4_t a2 = vmulq_n_s32(a_hi, constant1);
+  const int32x4_t a3 = vmulq_n_s32(a_lo, constant2);
+  const int32x4_t a4 = vmulq_n_s32(a_hi, constant2);
+  const int32x4_t sum0 = vmlaq_n_s32(a1, b_lo, constant2);
+  const int32x4_t sum1 = vmlaq_n_s32(a2, b_hi, constant2);
+  const int32x4_t diff0 = vmlsq_n_s32(a3, b_lo, constant1);
+  const int32x4_t diff1 = vmlsq_n_s32(a4, b_hi, constant1);
+  *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
+  *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
+  *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
+  *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
+}
+
+// Add 1 if positive, 2 if negative, and shift by 2.
+// In practice, add 1, then add the sign bit, then shift without rounding.
+static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) {
+  const int16x8_t one = vdupq_n_s16(1);
+  const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
+  const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
+  const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
+  return vshrq_n_s16(vaddq_s16(vaddq_s16(a, a_sign_s16), one), 2);
+}
+
+// Add 1 if positive, 2 if negative, and shift by 2.
+// In practice, add 1, then add the sign bit, then shift and round,
+// return narrowed results
+static INLINE int16x8_t add_round_shift_s32_narrow(const int32x4_t a_lo,
+                                                   const int32x4_t a_hi) {
+  const int32x4_t one = vdupq_n_s32(1);
+  const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo);
+  const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31);
+  const int32x4_t a_lo_sign_s32 = vreinterpretq_s32_u32(a_lo_sign_u32);
+  const int16x4_t b_lo =
+      vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_lo, a_lo_sign_s32), one), 2);
+  const uint32x4_t a_hi_u32 = vreinterpretq_u32_s32(a_hi);
+  const uint32x4_t a_hi_sign_u32 = vshrq_n_u32(a_hi_u32, 31);
+  const int32x4_t a_hi_sign_s32 = vreinterpretq_s32_u32(a_hi_sign_u32);
+  const int16x4_t b_hi =
+      vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_hi, a_hi_sign_s32), one), 2);
+  return vcombine_s16(b_lo, b_hi);
+}
+
+// Add 1 if negative, and shift by 1.
+// In practice, add the sign bit, then shift and round
+static INLINE int32x4_t add_round_shift_half_s32(const int32x4_t a) {
+  const uint32x4_t a_u32 = vreinterpretq_u32_s32(a);
+  const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31);
+  const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32);
+  return vshrq_n_s32(vaddq_s32(a, a_sign_s32), 1);
+}
+
+// Add 1 if positive, 2 if negative, and shift by 2.
+// In practice, add 1, then add the sign bit, then shift without rounding.
+static INLINE int32x4_t add_round_shift_s32(const int32x4_t a) {
+  const int32x4_t one = vdupq_n_s32(1);
+  const uint32x4_t a_u32 = vreinterpretq_u32_s32(a);
+  const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31);
+  const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32);
+  return vshrq_n_s32(vaddq_s32(vaddq_s32(a, a_sign_s32), one), 2);
+}
+
+// Add 2 if positive, 1 if negative, and shift by 2.
+// In practice, subtract the sign bit, then shift with rounding.
+static INLINE int16x8_t sub_round_shift_s16(const int16x8_t a) {
+  const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
+  const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
+  const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
+  return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2);
+}
+
+// Add 2 if positive, 1 if negative, and shift by 2.
+// In practice, subtract the sign bit, then shift with rounding.
+static INLINE int32x4_t sub_round_shift_s32(const int32x4_t a) {
+  const uint32x4_t a_u32 = vreinterpretq_u32_s32(a);
+  const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31);
+  const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32);
+  return vrshrq_n_s32(vsubq_s32(a, a_sign_s32), 2);
+}
+
+static INLINE int32x4_t add_s64_round_narrow(const int64x2_t *a /*[2]*/,
+                                             const int64x2_t *b /*[2]*/) {
+  int64x2_t result[2];
+  result[0] = vaddq_s64(a[0], b[0]);
+  result[1] = vaddq_s64(a[1], b[1]);
+  return vcombine_s32(vrshrn_n_s64(result[0], DCT_CONST_BITS),
+                      vrshrn_n_s64(result[1], DCT_CONST_BITS));
+}
+
+static INLINE int32x4_t sub_s64_round_narrow(const int64x2_t *a /*[2]*/,
+                                             const int64x2_t *b /*[2]*/) {
+  int64x2_t result[2];
+  result[0] = vsubq_s64(a[0], b[0]);
+  result[1] = vsubq_s64(a[1], b[1]);
+  return vcombine_s32(vrshrn_n_s64(result[0], DCT_CONST_BITS),
+                      vrshrn_n_s64(result[1], DCT_CONST_BITS));
+}
+
+static INLINE int32x4_t add_s32_s64_narrow(const int32x4_t a,
+                                           const int32x4_t b) {
+  int64x2_t a64[2], b64[2], result[2];
+  a64[0] = vmovl_s32(vget_low_s32(a));
+  a64[1] = vmovl_s32(vget_high_s32(a));
+  b64[0] = vmovl_s32(vget_low_s32(b));
+  b64[1] = vmovl_s32(vget_high_s32(b));
+  result[0] = vaddq_s64(a64[0], b64[0]);
+  result[1] = vaddq_s64(a64[1], b64[1]);
+  return vcombine_s32(vmovn_s64(result[0]), vmovn_s64(result[1]));
+}
+
+static INLINE int32x4_t sub_s32_s64_narrow(const int32x4_t a,
+                                           const int32x4_t b) {
+  int64x2_t a64[2], b64[2], result[2];
+  a64[0] = vmovl_s32(vget_low_s32(a));
+  a64[1] = vmovl_s32(vget_high_s32(a));
+  b64[0] = vmovl_s32(vget_low_s32(b));
+  b64[1] = vmovl_s32(vget_high_s32(b));
+  result[0] = vsubq_s64(a64[0], b64[0]);
+  result[1] = vsubq_s64(a64[1], b64[1]);
+  return vcombine_s32(vmovn_s64(result[0]), vmovn_s64(result[1]));
+}
+
+#endif  // VPX_VPX_DSP_ARM_FDCT_NEON_H_
diff --git a/vpx_dsp/arm/fdct_partial_neon.c b/vpx_dsp/arm/fdct_partial_neon.c
index e73de41d77..df0da543ce 100644
--- a/vpx_dsp/arm/fdct_partial_neon.c
+++ b/vpx_dsp/arm/fdct_partial_neon.c
@@ -15,19 +15,10 @@
 #include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/sum_neon.h"
 
-static INLINE tran_low_t get_lane(const int32x2_t a) {
-#if CONFIG_VP9_HIGHBITDEPTH
-  return vget_lane_s32(a, 0);
-#else
-  return vget_lane_s16(vreinterpret_s16_s32(a), 0);
-#endif  // CONFIG_VP9_HIGHBITDETPH
-}
-
 void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride) {
   int16x4_t a0, a1, a2, a3;
   int16x8_t b0, b1;
   int16x8_t c;
-  int32x2_t d;
 
   a0 = vld1_s16(input);
   input += stride;
@@ -42,12 +33,19 @@ void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride) {
 
   c = vaddq_s16(b0, b1);
 
-  d = horizontal_add_int16x8(c);
-
-  output[0] = get_lane(vshl_n_s32(d, 1));
+  output[0] = (tran_low_t)(horizontal_add_int16x8(c) << 1);
   output[1] = 0;
 }
 
+// Visual Studio 2022 (cl.exe) targeting AArch64 with optimizations enabled
+// will fail with an internal compiler error.
+// See:
+// https://developercommunity.visualstudio.com/t/Compiler-crash-C1001-when-building-a-for/10346110
+// TODO(jzern): check the compiler version after a fix for the issue is
+// released.
+#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__)
+#pragma optimize("", off)
+#endif
 void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) {
   int r;
   int16x8_t sum = vld1q_s16(&input[0]);
@@ -57,16 +55,19 @@ void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) {
     sum = vaddq_s16(sum, input_00);
   }
 
-  output[0] = get_lane(horizontal_add_int16x8(sum));
+  output[0] = (tran_low_t)horizontal_add_int16x8(sum);
   output[1] = 0;
 }
+#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__)
+#pragma optimize("", on)
+#endif
 
 void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output,
                           int stride) {
   int r;
   int16x8_t left = vld1q_s16(input);
   int16x8_t right = vld1q_s16(input + 8);
-  int32x2_t sum;
+  int32_t sum;
   input += stride;
 
   for (r = 1; r < 16; ++r) {
@@ -77,9 +78,9 @@ void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output,
     right = vaddq_s16(right, b);
   }
 
-  sum = vadd_s32(horizontal_add_int16x8(left), horizontal_add_int16x8(right));
+  sum = horizontal_add_int16x8(left) + horizontal_add_int16x8(right);
 
-  output[0] = get_lane(vshr_n_s32(sum, 1));
+  output[0] = (tran_low_t)(sum >> 1);
   output[1] = 0;
 }
 
@@ -90,7 +91,7 @@ void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output,
   int16x8_t a1 = vld1q_s16(input + 8);
   int16x8_t a2 = vld1q_s16(input + 16);
   int16x8_t a3 = vld1q_s16(input + 24);
-  int32x2_t sum;
+  int32_t sum;
   input += stride;
 
   for (r = 1; r < 32; ++r) {
@@ -105,9 +106,75 @@ void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output,
     a3 = vaddq_s16(a3, b3);
   }
 
-  sum = vadd_s32(horizontal_add_int16x8(a0), horizontal_add_int16x8(a1));
-  sum = vadd_s32(sum, horizontal_add_int16x8(a2));
-  sum = vadd_s32(sum, horizontal_add_int16x8(a3));
-  output[0] = get_lane(vshr_n_s32(sum, 3));
+  sum = horizontal_add_int16x8(a0);
+  sum += horizontal_add_int16x8(a1);
+  sum += horizontal_add_int16x8(a2);
+  sum += horizontal_add_int16x8(a3);
+  output[0] = (tran_low_t)(sum >> 3);
+  output[1] = 0;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_highbd_fdct16x16_1_neon(const int16_t *input, tran_low_t *output,
+                                 int stride) {
+  int32x4_t partial_sum[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
+                               vdupq_n_s32(0) };
+  int32_t sum;
+
+  int r = 0;
+  do {
+    const int16x8_t a = vld1q_s16(input);
+    const int16x8_t b = vld1q_s16(input + 8);
+    input += stride;
+    partial_sum[0] = vaddw_s16(partial_sum[0], vget_low_s16(a));
+    partial_sum[1] = vaddw_s16(partial_sum[1], vget_high_s16(a));
+    partial_sum[2] = vaddw_s16(partial_sum[2], vget_low_s16(b));
+    partial_sum[3] = vaddw_s16(partial_sum[3], vget_high_s16(b));
+    r++;
+  } while (r < 16);
+
+  partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[1]);
+  partial_sum[2] = vaddq_s32(partial_sum[2], partial_sum[3]);
+  partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[2]);
+  sum = horizontal_add_int32x4(partial_sum[0]);
+
+  output[0] = (tran_low_t)(sum >> 1);
+  output[1] = 0;
+}
+
+void vpx_highbd_fdct32x32_1_neon(const int16_t *input, tran_low_t *output,
+                                 int stride) {
+  int32x4_t partial_sum[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
+                               vdupq_n_s32(0) };
+
+  int32_t sum;
+
+  int r = 0;
+  do {
+    const int16x8_t a0 = vld1q_s16(input);
+    const int16x8_t a1 = vld1q_s16(input + 8);
+    const int16x8_t a2 = vld1q_s16(input + 16);
+    const int16x8_t a3 = vld1q_s16(input + 24);
+    input += stride;
+    partial_sum[0] = vaddw_s16(partial_sum[0], vget_low_s16(a0));
+    partial_sum[0] = vaddw_s16(partial_sum[0], vget_high_s16(a0));
+    partial_sum[1] = vaddw_s16(partial_sum[1], vget_low_s16(a1));
+    partial_sum[1] = vaddw_s16(partial_sum[1], vget_high_s16(a1));
+    partial_sum[2] = vaddw_s16(partial_sum[2], vget_low_s16(a2));
+    partial_sum[2] = vaddw_s16(partial_sum[2], vget_high_s16(a2));
+    partial_sum[3] = vaddw_s16(partial_sum[3], vget_low_s16(a3));
+    partial_sum[3] = vaddw_s16(partial_sum[3], vget_high_s16(a3));
+    r++;
+  } while (r < 32);
+
+  partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[1]);
+  partial_sum[2] = vaddq_s32(partial_sum[2], partial_sum[3]);
+  partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[2]);
+  sum = horizontal_add_int32x4(partial_sum[0]);
+
+  output[0] = (tran_low_t)(sum >> 3);
   output[1] = 0;
 }
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/arm/fwd_txfm_neon.c b/vpx_dsp/arm/fwd_txfm_neon.c
deleted file mode 100644
index 374a262b93..0000000000
--- a/vpx_dsp/arm/fwd_txfm_neon.c
+++ /dev/null
@@ -1,210 +0,0 @@
-/*
- *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#include "./vpx_config.h"
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_dsp/txfm_common.h"
-#include "vpx_dsp/vpx_dsp_common.h"
-#include "vpx_dsp/arm/idct_neon.h"
-#include "vpx_dsp/arm/mem_neon.h"
-
-void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
-                      int stride) {
-  int i;
-  // stage 1
-  int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
-  int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
-  int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
-  int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
-  int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
-  int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
-  int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
-  int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
-  for (i = 0; i < 2; ++i) {
-    int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7;
-    const int16x8_t v_s0 = vaddq_s16(input_0, input_7);
-    const int16x8_t v_s1 = vaddq_s16(input_1, input_6);
-    const int16x8_t v_s2 = vaddq_s16(input_2, input_5);
-    const int16x8_t v_s3 = vaddq_s16(input_3, input_4);
-    const int16x8_t v_s4 = vsubq_s16(input_3, input_4);
-    const int16x8_t v_s5 = vsubq_s16(input_2, input_5);
-    const int16x8_t v_s6 = vsubq_s16(input_1, input_6);
-    const int16x8_t v_s7 = vsubq_s16(input_0, input_7);
-    // fdct4(step, step);
-    int16x8_t v_x0 = vaddq_s16(v_s0, v_s3);
-    int16x8_t v_x1 = vaddq_s16(v_s1, v_s2);
-    int16x8_t v_x2 = vsubq_s16(v_s1, v_s2);
-    int16x8_t v_x3 = vsubq_s16(v_s0, v_s3);
-    // fdct4(step, step);
-    int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
-    int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
-    int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
-    int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
-    int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_24_64);
-    int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_24_64);
-    int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_24_64);
-    int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_24_64);
-    v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), cospi_8_64);
-    v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), cospi_8_64);
-    v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), cospi_8_64);
-    v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), cospi_8_64);
-    v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64);
-    v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64);
-    v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64);
-    v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64);
-    {
-      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
-      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
-      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
-      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
-      const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
-      const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
-      const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
-      const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
-      out_0 = vcombine_s16(a, c);  // 00 01 02 03 40 41 42 43
-      out_2 = vcombine_s16(e, g);  // 20 21 22 23 60 61 62 63
-      out_4 = vcombine_s16(b, d);  // 04 05 06 07 44 45 46 47
-      out_6 = vcombine_s16(f, h);  // 24 25 26 27 64 65 66 67
-    }
-    // Stage 2
-    v_x0 = vsubq_s16(v_s6, v_s5);
-    v_x1 = vaddq_s16(v_s6, v_s5);
-    v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), cospi_16_64);
-    v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), cospi_16_64);
-    v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_16_64);
-    v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_16_64);
-    {
-      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
-      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
-      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
-      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
-      const int16x8_t ab = vcombine_s16(a, b);
-      const int16x8_t cd = vcombine_s16(c, d);
-      // Stage 3
-      v_x0 = vaddq_s16(v_s4, ab);
-      v_x1 = vsubq_s16(v_s4, ab);
-      v_x2 = vsubq_s16(v_s7, cd);
-      v_x3 = vaddq_s16(v_s7, cd);
-    }
-    // Stage 4
-    v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_4_64);
-    v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_4_64);
-    v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), cospi_28_64);
-    v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), cospi_28_64);
-    v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_12_64);
-    v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_12_64);
-    v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), cospi_20_64);
-    v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), cospi_20_64);
-    v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_12_64);
-    v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_12_64);
-    v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), cospi_20_64);
-    v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), cospi_20_64);
-    v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_28_64);
-    v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_28_64);
-    v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), cospi_4_64);
-    v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), cospi_4_64);
-    {
-      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
-      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
-      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
-      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
-      const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
-      const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
-      const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
-      const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
-      out_1 = vcombine_s16(a, c);  // 10 11 12 13 50 51 52 53
-      out_3 = vcombine_s16(e, g);  // 30 31 32 33 70 71 72 73
-      out_5 = vcombine_s16(b, d);  // 14 15 16 17 54 55 56 57
-      out_7 = vcombine_s16(f, h);  // 34 35 36 37 74 75 76 77
-    }
-    // transpose 8x8
-    // Can't use transpose_s16_8x8() because the values are arranged in two 4x8
-    // columns.
-    {
-      // 00 01 02 03 40 41 42 43
-      // 10 11 12 13 50 51 52 53
-      // 20 21 22 23 60 61 62 63
-      // 30 31 32 33 70 71 72 73
-      // 04 05 06 07 44 45 46 47
-      // 14 15 16 17 54 55 56 57
-      // 24 25 26 27 64 65 66 67
-      // 34 35 36 37 74 75 76 77
-      const int32x4x2_t r02_s32 =
-          vtrnq_s32(vreinterpretq_s32_s16(out_0), vreinterpretq_s32_s16(out_2));
-      const int32x4x2_t r13_s32 =
-          vtrnq_s32(vreinterpretq_s32_s16(out_1), vreinterpretq_s32_s16(out_3));
-      const int32x4x2_t r46_s32 =
-          vtrnq_s32(vreinterpretq_s32_s16(out_4), vreinterpretq_s32_s16(out_6));
-      const int32x4x2_t r57_s32 =
-          vtrnq_s32(vreinterpretq_s32_s16(out_5), vreinterpretq_s32_s16(out_7));
-      const int16x8x2_t r01_s16 =
-          vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
-                    vreinterpretq_s16_s32(r13_s32.val[0]));
-      const int16x8x2_t r23_s16 =
-          vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]),
-                    vreinterpretq_s16_s32(r13_s32.val[1]));
-      const int16x8x2_t r45_s16 =
-          vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]),
-                    vreinterpretq_s16_s32(r57_s32.val[0]));
-      const int16x8x2_t r67_s16 =
-          vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]),
-                    vreinterpretq_s16_s32(r57_s32.val[1]));
-      input_0 = r01_s16.val[0];
-      input_1 = r01_s16.val[1];
-      input_2 = r23_s16.val[0];
-      input_3 = r23_s16.val[1];
-      input_4 = r45_s16.val[0];
-      input_5 = r45_s16.val[1];
-      input_6 = r67_s16.val[0];
-      input_7 = r67_s16.val[1];
-      // 00 10 20 30 40 50 60 70
-      // 01 11 21 31 41 51 61 71
-      // 02 12 22 32 42 52 62 72
-      // 03 13 23 33 43 53 63 73
-      // 04 14 24 34 44 54 64 74
-      // 05 15 25 35 45 55 65 75
-      // 06 16 26 36 46 56 66 76
-      // 07 17 27 37 47 57 67 77
-    }
-  }  // for
-  {
-    // from vpx_dct_sse2.c
-    // Post-condition (division by two)
-    //    division of two 16 bits signed numbers using shifts
-    //    n / 2 = (n - (n >> 15)) >> 1
-    const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15);
-    const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15);
-    const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15);
-    const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15);
-    const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15);
-    const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15);
-    const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15);
-    const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15);
-    input_0 = vhsubq_s16(input_0, sign_in0);
-    input_1 = vhsubq_s16(input_1, sign_in1);
-    input_2 = vhsubq_s16(input_2, sign_in2);
-    input_3 = vhsubq_s16(input_3, sign_in3);
-    input_4 = vhsubq_s16(input_4, sign_in4);
-    input_5 = vhsubq_s16(input_5, sign_in5);
-    input_6 = vhsubq_s16(input_6, sign_in6);
-    input_7 = vhsubq_s16(input_7, sign_in7);
-    // store results
-    store_s16q_to_tran_low(final_output + 0 * 8, input_0);
-    store_s16q_to_tran_low(final_output + 1 * 8, input_1);
-    store_s16q_to_tran_low(final_output + 2 * 8, input_2);
-    store_s16q_to_tran_low(final_output + 3 * 8, input_3);
-    store_s16q_to_tran_low(final_output + 4 * 8, input_4);
-    store_s16q_to_tran_low(final_output + 5 * 8, input_5);
-    store_s16q_to_tran_low(final_output + 6 * 8, input_6);
-    store_s16q_to_tran_low(final_output + 7 * 8, input_7);
-  }
-}
diff --git a/vpx_dsp/arm/hadamard_neon.c b/vpx_dsp/arm/hadamard_neon.c
index 523a63c6f7..f5a044be4d 100644
--- a/vpx_dsp/arm/hadamard_neon.c
+++ b/vpx_dsp/arm/hadamard_neon.c
@@ -114,3 +114,45 @@ void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride,
     coeff += 8;
   }
 }
+
+void vpx_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+                             tran_low_t *coeff) {
+  int i;
+
+  /* Rearrange 32x32 to 16x64 and remove stride.
+   * Top left first. */
+  vpx_hadamard_16x16_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0);
+  /* Top right. */
+  vpx_hadamard_16x16_neon(src_diff + 16 + 0 * src_stride, src_stride,
+                          coeff + 256);
+  /* Bottom left. */
+  vpx_hadamard_16x16_neon(src_diff + 0 + 16 * src_stride, src_stride,
+                          coeff + 512);
+  /* Bottom right. */
+  vpx_hadamard_16x16_neon(src_diff + 16 + 16 * src_stride, src_stride,
+                          coeff + 768);
+
+  for (i = 0; i < 256; i += 8) {
+    const int16x8_t a0 = load_tran_low_to_s16q(coeff + 0);
+    const int16x8_t a1 = load_tran_low_to_s16q(coeff + 256);
+    const int16x8_t a2 = load_tran_low_to_s16q(coeff + 512);
+    const int16x8_t a3 = load_tran_low_to_s16q(coeff + 768);
+
+    const int16x8_t b0 = vshrq_n_s16(vhaddq_s16(a0, a1), 1);
+    const int16x8_t b1 = vshrq_n_s16(vhsubq_s16(a0, a1), 1);
+    const int16x8_t b2 = vshrq_n_s16(vhaddq_s16(a2, a3), 1);
+    const int16x8_t b3 = vshrq_n_s16(vhsubq_s16(a2, a3), 1);
+
+    const int16x8_t c0 = vaddq_s16(b0, b2);
+    const int16x8_t c1 = vaddq_s16(b1, b3);
+    const int16x8_t c2 = vsubq_s16(b0, b2);
+    const int16x8_t c3 = vsubq_s16(b1, b3);
+
+    store_s16q_to_tran_low(coeff + 0, c0);
+    store_s16q_to_tran_low(coeff + 256, c1);
+    store_s16q_to_tran_low(coeff + 512, c2);
+    store_s16q_to_tran_low(coeff + 768, c3);
+
+    coeff += 8;
+  }
+}
diff --git a/vpx_dsp/arm/highbd_avg_neon.c b/vpx_dsp/arm/highbd_avg_neon.c
new file mode 100644
index 0000000000..4265596c8c
--- /dev/null
+++ b/vpx_dsp/arm/highbd_avg_neon.c
@@ -0,0 +1,140 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+uint32_t vpx_highbd_avg_4x4_neon(const uint8_t *s8, int p) {
+  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8);
+  const uint16x8_t a0 = load_unaligned_u16q(a_ptr + 0 * p, p);
+  const uint16x8_t a1 = load_unaligned_u16q(a_ptr + 2 * p, p);
+  return (horizontal_add_uint16x8(vaddq_u16(a0, a1)) + (1 << 3)) >> 4;
+}
+
+uint32_t vpx_highbd_avg_8x8_neon(const uint8_t *s8, int p) {
+  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8);
+  uint16x8_t sum, a0, a1, a2, a3, a4, a5, a6, a7;
+
+  load_u16_8x8(a_ptr, p, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+  sum = vaddq_u16(a0, a1);
+  sum = vaddq_u16(sum, a2);
+  sum = vaddq_u16(sum, a3);
+  sum = vaddq_u16(sum, a4);
+  sum = vaddq_u16(sum, a5);
+  sum = vaddq_u16(sum, a6);
+  sum = vaddq_u16(sum, a7);
+
+  return (horizontal_add_uint16x8(sum) + (1 << 5)) >> 6;
+}
+
+// coeff: 32 bits, dynamic range [-2147483648, 2147483647].
+// length: value range {16, 64, 256, 1024}.
+// satd: 42 bits, dynamic range [-2147483648 * 1024, 2147483647 * 1024]
+int vpx_highbd_satd_neon(const tran_low_t *coeff, int length) {
+  int64x2_t sum_s64[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+
+  do {
+    int32x4_t abs0, abs1;
+    const int32x4_t s0 = load_tran_low_to_s32q(coeff);
+    const int32x4_t s1 = load_tran_low_to_s32q(coeff + 4);
+
+    abs0 = vabsq_s32(s0);
+    sum_s64[0] = vpadalq_s32(sum_s64[0], abs0);
+    abs1 = vabsq_s32(s1);
+    sum_s64[1] = vpadalq_s32(sum_s64[1], abs1);
+
+    length -= 8;
+    coeff += 8;
+  } while (length != 0);
+
+  return (int)horizontal_add_int64x2(vaddq_s64(sum_s64[0], sum_s64[1]));
+}
+
+void vpx_highbd_minmax_8x8_neon(const uint8_t *s8, int p, const uint8_t *d8,
+                                int dp, int *min, int *max) {
+  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8);
+  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(d8);
+
+  const uint16x8_t a0 = vld1q_u16(a_ptr + 0 * p);
+  const uint16x8_t a1 = vld1q_u16(a_ptr + 1 * p);
+  const uint16x8_t a2 = vld1q_u16(a_ptr + 2 * p);
+  const uint16x8_t a3 = vld1q_u16(a_ptr + 3 * p);
+  const uint16x8_t a4 = vld1q_u16(a_ptr + 4 * p);
+  const uint16x8_t a5 = vld1q_u16(a_ptr + 5 * p);
+  const uint16x8_t a6 = vld1q_u16(a_ptr + 6 * p);
+  const uint16x8_t a7 = vld1q_u16(a_ptr + 7 * p);
+
+  const uint16x8_t b0 = vld1q_u16(b_ptr + 0 * dp);
+  const uint16x8_t b1 = vld1q_u16(b_ptr + 1 * dp);
+  const uint16x8_t b2 = vld1q_u16(b_ptr + 2 * dp);
+  const uint16x8_t b3 = vld1q_u16(b_ptr + 3 * dp);
+  const uint16x8_t b4 = vld1q_u16(b_ptr + 4 * dp);
+  const uint16x8_t b5 = vld1q_u16(b_ptr + 5 * dp);
+  const uint16x8_t b6 = vld1q_u16(b_ptr + 6 * dp);
+  const uint16x8_t b7 = vld1q_u16(b_ptr + 7 * dp);
+
+  const uint16x8_t abs_diff0 = vabdq_u16(a0, b0);
+  const uint16x8_t abs_diff1 = vabdq_u16(a1, b1);
+  const uint16x8_t abs_diff2 = vabdq_u16(a2, b2);
+  const uint16x8_t abs_diff3 = vabdq_u16(a3, b3);
+  const uint16x8_t abs_diff4 = vabdq_u16(a4, b4);
+  const uint16x8_t abs_diff5 = vabdq_u16(a5, b5);
+  const uint16x8_t abs_diff6 = vabdq_u16(a6, b6);
+  const uint16x8_t abs_diff7 = vabdq_u16(a7, b7);
+
+  const uint16x8_t max01 = vmaxq_u16(abs_diff0, abs_diff1);
+  const uint16x8_t max23 = vmaxq_u16(abs_diff2, abs_diff3);
+  const uint16x8_t max45 = vmaxq_u16(abs_diff4, abs_diff5);
+  const uint16x8_t max67 = vmaxq_u16(abs_diff6, abs_diff7);
+
+  const uint16x8_t max0123 = vmaxq_u16(max01, max23);
+  const uint16x8_t max4567 = vmaxq_u16(max45, max67);
+  const uint16x8_t max07 = vmaxq_u16(max0123, max4567);
+
+  const uint16x8_t min01 = vminq_u16(abs_diff0, abs_diff1);
+  const uint16x8_t min23 = vminq_u16(abs_diff2, abs_diff3);
+  const uint16x8_t min45 = vminq_u16(abs_diff4, abs_diff5);
+  const uint16x8_t min67 = vminq_u16(abs_diff6, abs_diff7);
+
+  const uint16x8_t min0123 = vminq_u16(min01, min23);
+  const uint16x8_t min4567 = vminq_u16(min45, min67);
+  const uint16x8_t min07 = vminq_u16(min0123, min4567);
+
+#if VPX_ARCH_AARCH64
+  *min = *max = 0;  // Clear high bits
+  *((uint16_t *)max) = vmaxvq_u16(max07);
+  *((uint16_t *)min) = vminvq_u16(min07);
+#else
+  // Split into 64-bit vectors and execute pairwise min/max.
+  uint16x4_t ab_max = vmax_u16(vget_high_u16(max07), vget_low_u16(max07));
+  uint16x4_t ab_min = vmin_u16(vget_high_u16(min07), vget_low_u16(min07));
+
+  // Enough runs of vpmax/min propagate the max/min values to every position.
+  ab_max = vpmax_u16(ab_max, ab_max);
+  ab_min = vpmin_u16(ab_min, ab_min);
+
+  ab_max = vpmax_u16(ab_max, ab_max);
+  ab_min = vpmin_u16(ab_min, ab_min);
+
+  ab_max = vpmax_u16(ab_max, ab_max);
+  ab_min = vpmin_u16(ab_min, ab_min);
+
+  *min = *max = 0;  // Clear high bits
+  // Store directly to avoid costly neon->gpr transfer.
+  vst1_lane_u16((uint16_t *)max, ab_max, 0);
+  vst1_lane_u16((uint16_t *)min, ab_min, 0);
+#endif
+}
diff --git a/vpx_dsp/arm/highbd_avg_pred_neon.c b/vpx_dsp/arm/highbd_avg_pred_neon.c
new file mode 100644
index 0000000000..3063acbb3e
--- /dev/null
+++ b/vpx_dsp/arm/highbd_avg_pred_neon.c
@@ -0,0 +1,64 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred,
+                                   int width, int height, const uint16_t *ref,
+                                   int ref_stride) {
+  int i = height;
+  if (width > 8) {
+    do {
+      int j = 0;
+      do {
+        const uint16x8_t p = vld1q_u16(pred + j);
+        const uint16x8_t r = vld1q_u16(ref + j);
+
+        uint16x8_t avg = vrhaddq_u16(p, r);
+        vst1q_u16(comp_pred + j, avg);
+
+        j += 8;
+      } while (j < width);
+
+      comp_pred += width;
+      pred += width;
+      ref += ref_stride;
+    } while (--i != 0);
+  } else if (width == 8) {
+    do {
+      const uint16x8_t p = vld1q_u16(pred);
+      const uint16x8_t r = vld1q_u16(ref);
+
+      uint16x8_t avg = vrhaddq_u16(p, r);
+      vst1q_u16(comp_pred, avg);
+
+      comp_pred += width;
+      pred += width;
+      ref += ref_stride;
+    } while (--i != 0);
+  } else {
+    assert(width == 4);
+    do {
+      const uint16x4_t p = vld1_u16(pred);
+      const uint16x4_t r = vld1_u16(ref);
+
+      uint16x4_t avg = vrhadd_u16(p, r);
+      vst1_u16(comp_pred, avg);
+
+      comp_pred += width;
+      pred += width;
+      ref += ref_stride;
+    } while (--i != 0);
+  }
+}
diff --git a/vpx_dsp/arm/highbd_hadamard_neon.c b/vpx_dsp/arm/highbd_hadamard_neon.c
new file mode 100644
index 0000000000..7be88f6bcb
--- /dev/null
+++ b/vpx_dsp/arm/highbd_hadamard_neon.c
@@ -0,0 +1,215 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+
+static INLINE void hadamard_highbd_col8_first_pass(int16x8_t *a0, int16x8_t *a1,
+                                                   int16x8_t *a2, int16x8_t *a3,
+                                                   int16x8_t *a4, int16x8_t *a5,
+                                                   int16x8_t *a6,
+                                                   int16x8_t *a7) {
+  int16x8_t b0 = vaddq_s16(*a0, *a1);
+  int16x8_t b1 = vsubq_s16(*a0, *a1);
+  int16x8_t b2 = vaddq_s16(*a2, *a3);
+  int16x8_t b3 = vsubq_s16(*a2, *a3);
+  int16x8_t b4 = vaddq_s16(*a4, *a5);
+  int16x8_t b5 = vsubq_s16(*a4, *a5);
+  int16x8_t b6 = vaddq_s16(*a6, *a7);
+  int16x8_t b7 = vsubq_s16(*a6, *a7);
+
+  int16x8_t c0 = vaddq_s16(b0, b2);
+  int16x8_t c2 = vsubq_s16(b0, b2);
+  int16x8_t c1 = vaddq_s16(b1, b3);
+  int16x8_t c3 = vsubq_s16(b1, b3);
+  int16x8_t c4 = vaddq_s16(b4, b6);
+  int16x8_t c6 = vsubq_s16(b4, b6);
+  int16x8_t c5 = vaddq_s16(b5, b7);
+  int16x8_t c7 = vsubq_s16(b5, b7);
+
+  *a0 = vaddq_s16(c0, c4);
+  *a2 = vsubq_s16(c0, c4);
+  *a7 = vaddq_s16(c1, c5);
+  *a6 = vsubq_s16(c1, c5);
+  *a3 = vaddq_s16(c2, c6);
+  *a1 = vsubq_s16(c2, c6);
+  *a4 = vaddq_s16(c3, c7);
+  *a5 = vsubq_s16(c3, c7);
+}
+
+static INLINE void hadamard_highbd_col4_second_pass(int16x4_t a0, int16x4_t a1,
+                                                    int16x4_t a2, int16x4_t a3,
+                                                    int16x4_t a4, int16x4_t a5,
+                                                    int16x4_t a6, int16x4_t a7,
+                                                    tran_low_t *coeff) {
+  int32x4_t b0 = vaddl_s16(a0, a1);
+  int32x4_t b1 = vsubl_s16(a0, a1);
+  int32x4_t b2 = vaddl_s16(a2, a3);
+  int32x4_t b3 = vsubl_s16(a2, a3);
+  int32x4_t b4 = vaddl_s16(a4, a5);
+  int32x4_t b5 = vsubl_s16(a4, a5);
+  int32x4_t b6 = vaddl_s16(a6, a7);
+  int32x4_t b7 = vsubl_s16(a6, a7);
+
+  int32x4_t c0 = vaddq_s32(b0, b2);
+  int32x4_t c2 = vsubq_s32(b0, b2);
+  int32x4_t c1 = vaddq_s32(b1, b3);
+  int32x4_t c3 = vsubq_s32(b1, b3);
+  int32x4_t c4 = vaddq_s32(b4, b6);
+  int32x4_t c6 = vsubq_s32(b4, b6);
+  int32x4_t c5 = vaddq_s32(b5, b7);
+  int32x4_t c7 = vsubq_s32(b5, b7);
+
+  int32x4_t d0 = vaddq_s32(c0, c4);
+  int32x4_t d2 = vsubq_s32(c0, c4);
+  int32x4_t d7 = vaddq_s32(c1, c5);
+  int32x4_t d6 = vsubq_s32(c1, c5);
+  int32x4_t d3 = vaddq_s32(c2, c6);
+  int32x4_t d1 = vsubq_s32(c2, c6);
+  int32x4_t d4 = vaddq_s32(c3, c7);
+  int32x4_t d5 = vsubq_s32(c3, c7);
+
+  store_s32q_to_tran_low(coeff + 0, d0);
+  store_s32q_to_tran_low(coeff + 4, d1);
+  store_s32q_to_tran_low(coeff + 8, d2);
+  store_s32q_to_tran_low(coeff + 12, d3);
+  store_s32q_to_tran_low(coeff + 16, d4);
+  store_s32q_to_tran_low(coeff + 20, d5);
+  store_s32q_to_tran_low(coeff + 24, d6);
+  store_s32q_to_tran_low(coeff + 28, d7);
+}
+
+void vpx_highbd_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+                                  tran_low_t *coeff) {
+  int16x4_t b0, b1, b2, b3, b4, b5, b6, b7;
+
+  int16x8_t s0 = vld1q_s16(src_diff + 0 * src_stride);
+  int16x8_t s1 = vld1q_s16(src_diff + 1 * src_stride);
+  int16x8_t s2 = vld1q_s16(src_diff + 2 * src_stride);
+  int16x8_t s3 = vld1q_s16(src_diff + 3 * src_stride);
+  int16x8_t s4 = vld1q_s16(src_diff + 4 * src_stride);
+  int16x8_t s5 = vld1q_s16(src_diff + 5 * src_stride);
+  int16x8_t s6 = vld1q_s16(src_diff + 6 * src_stride);
+  int16x8_t s7 = vld1q_s16(src_diff + 7 * src_stride);
+
+  // For the first pass we can stay in 16-bit elements (4095*8 = 32760).
+  hadamard_highbd_col8_first_pass(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+  transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+  // For the second pass we need to widen to 32-bit elements, so we're
+  // processing 4 columns at a time.
+  // Skip the second transpose because it is not required.
+
+  b0 = vget_low_s16(s0);
+  b1 = vget_low_s16(s1);
+  b2 = vget_low_s16(s2);
+  b3 = vget_low_s16(s3);
+  b4 = vget_low_s16(s4);
+  b5 = vget_low_s16(s5);
+  b6 = vget_low_s16(s6);
+  b7 = vget_low_s16(s7);
+
+  hadamard_highbd_col4_second_pass(b0, b1, b2, b3, b4, b5, b6, b7, coeff);
+
+  b0 = vget_high_s16(s0);
+  b1 = vget_high_s16(s1);
+  b2 = vget_high_s16(s2);
+  b3 = vget_high_s16(s3);
+  b4 = vget_high_s16(s4);
+  b5 = vget_high_s16(s5);
+  b6 = vget_high_s16(s6);
+  b7 = vget_high_s16(s7);
+
+  hadamard_highbd_col4_second_pass(b0, b1, b2, b3, b4, b5, b6, b7, coeff + 32);
+}
+
+void vpx_highbd_hadamard_16x16_neon(const int16_t *src_diff,
+                                    ptrdiff_t src_stride, tran_low_t *coeff) {
+  int i = 0;
+
+  // Rearrange 16x16 to 8x32 and remove stride.
+  // Top left first.
+  vpx_highbd_hadamard_8x8_neon(src_diff, src_stride, coeff);
+  // Top right.
+  vpx_highbd_hadamard_8x8_neon(src_diff + 8, src_stride, coeff + 64);
+  // Bottom left.
+  vpx_highbd_hadamard_8x8_neon(src_diff + 8 * src_stride, src_stride,
+                               coeff + 128);
+  // Bottom right.
+  vpx_highbd_hadamard_8x8_neon(src_diff + 8 * src_stride + 8, src_stride,
+                               coeff + 192);
+
+  do {
+    int32x4_t a0 = load_tran_low_to_s32q(coeff + 4 * i);
+    int32x4_t a1 = load_tran_low_to_s32q(coeff + 4 * i + 64);
+    int32x4_t a2 = load_tran_low_to_s32q(coeff + 4 * i + 128);
+    int32x4_t a3 = load_tran_low_to_s32q(coeff + 4 * i + 192);
+
+    int32x4_t b0 = vhaddq_s32(a0, a1);
+    int32x4_t b1 = vhsubq_s32(a0, a1);
+    int32x4_t b2 = vhaddq_s32(a2, a3);
+    int32x4_t b3 = vhsubq_s32(a2, a3);
+
+    int32x4_t c0 = vaddq_s32(b0, b2);
+    int32x4_t c1 = vaddq_s32(b1, b3);
+    int32x4_t c2 = vsubq_s32(b0, b2);
+    int32x4_t c3 = vsubq_s32(b1, b3);
+
+    store_s32q_to_tran_low(coeff + 4 * i, c0);
+    store_s32q_to_tran_low(coeff + 4 * i + 64, c1);
+    store_s32q_to_tran_low(coeff + 4 * i + 128, c2);
+    store_s32q_to_tran_low(coeff + 4 * i + 192, c3);
+  } while (++i < 16);
+}
+
+void vpx_highbd_hadamard_32x32_neon(const int16_t *src_diff,
+                                    ptrdiff_t src_stride, tran_low_t *coeff) {
+  int i = 0;
+
+  // Rearrange 32x32 to 16x64 and remove stride.
+  // Top left first.
+  vpx_highbd_hadamard_16x16_neon(src_diff, src_stride, coeff);
+  // Top right.
+  vpx_highbd_hadamard_16x16_neon(src_diff + 16, src_stride, coeff + 256);
+  // Bottom left.
+  vpx_highbd_hadamard_16x16_neon(src_diff + 16 * src_stride, src_stride,
+                                 coeff + 512);
+  // Bottom right.
+  vpx_highbd_hadamard_16x16_neon(src_diff + 16 * src_stride + 16, src_stride,
+                                 coeff + 768);
+
+  do {
+    int32x4_t a0 = load_tran_low_to_s32q(coeff + 4 * i);
+    int32x4_t a1 = load_tran_low_to_s32q(coeff + 4 * i + 256);
+    int32x4_t a2 = load_tran_low_to_s32q(coeff + 4 * i + 512);
+    int32x4_t a3 = load_tran_low_to_s32q(coeff + 4 * i + 768);
+
+    int32x4_t b0 = vshrq_n_s32(vaddq_s32(a0, a1), 2);
+    int32x4_t b1 = vshrq_n_s32(vsubq_s32(a0, a1), 2);
+    int32x4_t b2 = vshrq_n_s32(vaddq_s32(a2, a3), 2);
+    int32x4_t b3 = vshrq_n_s32(vsubq_s32(a2, a3), 2);
+
+    int32x4_t c0 = vaddq_s32(b0, b2);
+    int32x4_t c1 = vaddq_s32(b1, b3);
+    int32x4_t c2 = vsubq_s32(b0, b2);
+    int32x4_t c3 = vsubq_s32(b1, b3);
+
+    store_s32q_to_tran_low(coeff + 4 * i, c0);
+    store_s32q_to_tran_low(coeff + 4 * i + 256, c1);
+    store_s32q_to_tran_low(coeff + 4 * i + 512, c2);
+    store_s32q_to_tran_low(coeff + 4 * i + 768, c3);
+  } while (++i < 64);
+}
diff --git a/vpx_dsp/arm/highbd_intrapred_neon.c b/vpx_dsp/arm/highbd_intrapred_neon.c
index 6f7e5da762..235cb5b996 100644
--- a/vpx_dsp/arm/highbd_intrapred_neon.c
+++ b/vpx_dsp/arm/highbd_intrapred_neon.c
@@ -12,23 +12,22 @@
 
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
+#include "sum_neon.h"
 #include "vpx/vpx_integer.h"
 
 //------------------------------------------------------------------------------
 // DC 4x4
 
-static INLINE uint16x4_t dc_sum_4(const uint16_t *ref) {
+static INLINE uint16_t dc_sum_4(const uint16_t *ref) {
   const uint16x4_t ref_u16 = vld1_u16(ref);
-  const uint16x4_t p0 = vpadd_u16(ref_u16, ref_u16);
-  return vpadd_u16(p0, p0);
+  return horizontal_add_uint16x4(ref_u16);
 }
 
 static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride,
                                 const uint16x4_t dc) {
-  const uint16x4_t dc_dup = vdup_lane_u16(dc, 0);
   int i;
   for (i = 0; i < 4; ++i, dst += stride) {
-    vst1_u16(dst, dc_dup);
+    vst1_u16(dst, dc);
   }
 }
 
@@ -37,21 +36,17 @@ void vpx_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *left, int bd) {
   const uint16x4_t a = vld1_u16(above);
   const uint16x4_t l = vld1_u16(left);
-  uint16x4_t sum;
-  uint16x4_t dc;
+  const uint16_t sum = horizontal_add_uint16x4(vadd_u16(a, l));
+  const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(sum), 3);
   (void)bd;
-  sum = vadd_u16(a, l);
-  sum = vpadd_u16(sum, sum);
-  sum = vpadd_u16(sum, sum);
-  dc = vrshr_n_u16(sum, 3);
   dc_store_4x4(dst, stride, dc);
 }
 
 void vpx_highbd_dc_left_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
-  const uint16x4_t sum = dc_sum_4(left);
-  const uint16x4_t dc = vrshr_n_u16(sum, 2);
+  const uint16_t sum = dc_sum_4(left);
+  const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(sum), 2);
   (void)above;
   (void)bd;
   dc_store_4x4(dst, stride, dc);
@@ -60,8 +55,8 @@ void vpx_highbd_dc_left_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
 void vpx_highbd_dc_top_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
-  const uint16x4_t sum = dc_sum_4(above);
-  const uint16x4_t dc = vrshr_n_u16(sum, 2);
+  const uint16_t sum = dc_sum_4(above);
+  const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(sum), 2);
   (void)left;
   (void)bd;
   dc_store_4x4(dst, stride, dc);
@@ -79,19 +74,16 @@ void vpx_highbd_dc_128_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
 //------------------------------------------------------------------------------
 // DC 8x8
 
-static INLINE uint16x4_t dc_sum_8(const uint16_t *ref) {
+static INLINE uint16_t dc_sum_8(const uint16_t *ref) {
   const uint16x8_t ref_u16 = vld1q_u16(ref);
-  uint16x4_t sum = vadd_u16(vget_low_u16(ref_u16), vget_high_u16(ref_u16));
-  sum = vpadd_u16(sum, sum);
-  return vpadd_u16(sum, sum);
+  return horizontal_add_uint16x8(ref_u16);
 }
 
 static INLINE void dc_store_8x8(uint16_t *dst, ptrdiff_t stride,
-                                const uint16x4_t dc) {
-  const uint16x8_t dc_dup = vdupq_lane_u16(dc, 0);
+                                const uint16x8_t dc) {
   int i;
   for (i = 0; i < 8; ++i, dst += stride) {
-    vst1q_u16(dst, dc_dup);
+    vst1q_u16(dst, dc);
   }
 }
 
@@ -101,20 +93,17 @@ void vpx_highbd_dc_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
   const uint16x8_t above_u16 = vld1q_u16(above);
   const uint16x8_t left_u16 = vld1q_u16(left);
   const uint16x8_t p0 = vaddq_u16(above_u16, left_u16);
-  uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
-  uint16x4_t dc;
+  const uint16_t sum = horizontal_add_uint16x8(p0);
+  const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 4);
   (void)bd;
-  sum = vpadd_u16(sum, sum);
-  sum = vpadd_u16(sum, sum);
-  dc = vrshr_n_u16(sum, 4);
   dc_store_8x8(dst, stride, dc);
 }
 
 void vpx_highbd_dc_left_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
-  const uint16x4_t sum = dc_sum_8(left);
-  const uint16x4_t dc = vrshr_n_u16(sum, 3);
+  const uint16_t sum = dc_sum_8(left);
+  const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 3);
   (void)above;
   (void)bd;
   dc_store_8x8(dst, stride, dc);
@@ -123,8 +112,8 @@ void vpx_highbd_dc_left_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
 void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
-  const uint16x4_t sum = dc_sum_8(above);
-  const uint16x4_t dc = vrshr_n_u16(sum, 3);
+  const uint16_t sum = dc_sum_8(above);
+  const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 3);
   (void)left;
   (void)bd;
   dc_store_8x8(dst, stride, dc);
@@ -133,7 +122,7 @@ void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
 void vpx_highbd_dc_128_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
-  const uint16x4_t dc = vdup_n_u16(1 << (bd - 1));
+  const uint16x8_t dc = vdupq_n_u16(1 << (bd - 1));
   (void)above;
   (void)left;
   dc_store_8x8(dst, stride, dc);
@@ -142,47 +131,43 @@ void vpx_highbd_dc_128_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
 //------------------------------------------------------------------------------
 // DC 16x16
 
-static INLINE uint16x4_t dc_sum_16(const uint16_t *ref) {
-  const uint16x8x2_t ref_u16 = vld2q_u16(ref);
-  const uint16x8_t p0 = vaddq_u16(ref_u16.val[0], ref_u16.val[1]);
-  uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
-  sum = vpadd_u16(sum, sum);
-  return vpadd_u16(sum, sum);
+static INLINE uint16_t dc_sum_16(const uint16_t *ref) {
+  const uint16x8_t ref_u16_0 = vld1q_u16(ref + 0);
+  const uint16x8_t ref_u16_1 = vld1q_u16(ref + 8);
+  const uint16x8_t p0 = vaddq_u16(ref_u16_0, ref_u16_1);
+  return horizontal_add_uint16x8(p0);
 }
 
 static INLINE void dc_store_16x16(uint16_t *dst, ptrdiff_t stride,
-                                  const uint16x4_t dc) {
-  uint16x8x2_t dc_dup;
+                                  const uint16x8_t dc) {
   int i;
-  dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u16(dc, 0);
   for (i = 0; i < 16; ++i, dst += stride) {
-    vst2q_u16(dst, dc_dup);
+    vst1q_u16(dst + 0, dc);
+    vst1q_u16(dst + 8, dc);
   }
 }
 
 void vpx_highbd_dc_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
-  const uint16x8x2_t a = vld2q_u16(above);
-  const uint16x8x2_t l = vld2q_u16(left);
-  const uint16x8_t pa = vaddq_u16(a.val[0], a.val[1]);
-  const uint16x8_t pl = vaddq_u16(l.val[0], l.val[1]);
+  const uint16x8_t a0 = vld1q_u16(above + 0);
+  const uint16x8_t a1 = vld1q_u16(above + 8);
+  const uint16x8_t l0 = vld1q_u16(left + 0);
+  const uint16x8_t l1 = vld1q_u16(left + 8);
+  const uint16x8_t pa = vaddq_u16(a0, a1);
+  const uint16x8_t pl = vaddq_u16(l0, l1);
   const uint16x8_t pal0 = vaddq_u16(pa, pl);
-  uint16x4_t pal1 = vadd_u16(vget_low_u16(pal0), vget_high_u16(pal0));
-  uint32x2_t sum;
-  uint16x4_t dc;
+  const uint32_t sum = horizontal_add_uint16x8(pal0);
+  const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0);
   (void)bd;
-  pal1 = vpadd_u16(pal1, pal1);
-  sum = vpaddl_u16(pal1);
-  dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5));
   dc_store_16x16(dst, stride, dc);
 }
 
 void vpx_highbd_dc_left_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                              const uint16_t *above,
                                              const uint16_t *left, int bd) {
-  const uint16x4_t sum = dc_sum_16(left);
-  const uint16x4_t dc = vrshr_n_u16(sum, 4);
+  const uint16_t sum = dc_sum_16(left);
+  const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 4);
   (void)above;
   (void)bd;
   dc_store_16x16(dst, stride, dc);
@@ -191,8 +176,8 @@ void vpx_highbd_dc_left_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
 void vpx_highbd_dc_top_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *above,
                                             const uint16_t *left, int bd) {
-  const uint16x4_t sum = dc_sum_16(above);
-  const uint16x4_t dc = vrshr_n_u16(sum, 4);
+  const uint16_t sum = dc_sum_16(above);
+  const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 4);
   (void)left;
   (void)bd;
   dc_store_16x16(dst, stride, dc);
@@ -201,7 +186,7 @@ void vpx_highbd_dc_top_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
 void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *above,
                                             const uint16_t *left, int bd) {
-  const uint16x4_t dc = vdup_n_u16(1 << (bd - 1));
+  const uint16x8_t dc = vdupq_n_u16(1 << (bd - 1));
   (void)above;
   (void)left;
   dc_store_16x16(dst, stride, dc);
@@ -210,56 +195,58 @@ void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
 //------------------------------------------------------------------------------
 // DC 32x32
 
-static INLINE uint32x2_t dc_sum_32(const uint16_t *ref) {
-  const uint16x8x4_t r = vld4q_u16(ref);
-  const uint16x8_t p0 = vaddq_u16(r.val[0], r.val[1]);
-  const uint16x8_t p1 = vaddq_u16(r.val[2], r.val[3]);
+static INLINE uint32_t dc_sum_32(const uint16_t *ref) {
+  const uint16x8_t r0 = vld1q_u16(ref + 0);
+  const uint16x8_t r1 = vld1q_u16(ref + 8);
+  const uint16x8_t r2 = vld1q_u16(ref + 16);
+  const uint16x8_t r3 = vld1q_u16(ref + 24);
+  const uint16x8_t p0 = vaddq_u16(r0, r1);
+  const uint16x8_t p1 = vaddq_u16(r2, r3);
   const uint16x8_t p2 = vaddq_u16(p0, p1);
-  uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
-  sum = vpadd_u16(sum, sum);
-  return vpaddl_u16(sum);
+  return horizontal_add_uint16x8(p2);
 }
 
 static INLINE void dc_store_32x32(uint16_t *dst, ptrdiff_t stride,
-                                  const uint16x4_t dc) {
-  uint16x8x2_t dc_dup;
+                                  const uint16x8_t dc) {
   int i;
-  dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u16(dc, 0);
-
   for (i = 0; i < 32; ++i) {
-    vst2q_u16(dst, dc_dup);
-    dst += 16;
-    vst2q_u16(dst, dc_dup);
-    dst += stride - 16;
+    vst1q_u16(dst + 0, dc);
+    vst1q_u16(dst + 8, dc);
+    vst1q_u16(dst + 16, dc);
+    vst1q_u16(dst + 24, dc);
+    dst += stride;
   }
 }
 
 void vpx_highbd_dc_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
-  const uint16x8x4_t a = vld4q_u16(above);
-  const uint16x8x4_t l = vld4q_u16(left);
-  const uint16x8_t pa0 = vaddq_u16(a.val[0], a.val[1]);
-  const uint16x8_t pa1 = vaddq_u16(a.val[2], a.val[3]);
-  const uint16x8_t pl0 = vaddq_u16(l.val[0], l.val[1]);
-  const uint16x8_t pl1 = vaddq_u16(l.val[2], l.val[3]);
+  const uint16x8_t a0 = vld1q_u16(above + 0);
+  const uint16x8_t a1 = vld1q_u16(above + 8);
+  const uint16x8_t a2 = vld1q_u16(above + 16);
+  const uint16x8_t a3 = vld1q_u16(above + 24);
+  const uint16x8_t l0 = vld1q_u16(left + 0);
+  const uint16x8_t l1 = vld1q_u16(left + 8);
+  const uint16x8_t l2 = vld1q_u16(left + 16);
+  const uint16x8_t l3 = vld1q_u16(left + 24);
+  const uint16x8_t pa0 = vaddq_u16(a0, a1);
+  const uint16x8_t pa1 = vaddq_u16(a2, a3);
+  const uint16x8_t pl0 = vaddq_u16(l0, l1);
+  const uint16x8_t pl1 = vaddq_u16(l2, l3);
   const uint16x8_t pa = vaddq_u16(pa0, pa1);
   const uint16x8_t pl = vaddq_u16(pl0, pl1);
   const uint16x8_t pal0 = vaddq_u16(pa, pl);
-  const uint16x4_t pal1 = vadd_u16(vget_low_u16(pal0), vget_high_u16(pal0));
-  uint32x2_t sum = vpaddl_u16(pal1);
-  uint16x4_t dc;
+  const uint32_t sum = horizontal_add_uint16x8(pal0);
+  const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 6), 0);
   (void)bd;
-  sum = vpadd_u32(sum, sum);
-  dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 6));
   dc_store_32x32(dst, stride, dc);
 }
 
 void vpx_highbd_dc_left_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                              const uint16_t *above,
                                              const uint16_t *left, int bd) {
-  const uint32x2_t sum = dc_sum_32(left);
-  const uint16x4_t dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5));
+  const uint32_t sum = dc_sum_32(left);
+  const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0);
   (void)above;
   (void)bd;
   dc_store_32x32(dst, stride, dc);
@@ -268,8 +255,8 @@ void vpx_highbd_dc_left_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
 void vpx_highbd_dc_top_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *above,
                                             const uint16_t *left, int bd) {
-  const uint32x2_t sum = dc_sum_32(above);
-  const uint16x4_t dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5));
+  const uint32_t sum = dc_sum_32(above);
+  const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0);
   (void)left;
   (void)bd;
   dc_store_32x32(dst, stride, dc);
@@ -278,7 +265,7 @@ void vpx_highbd_dc_top_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
 void vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *above,
                                             const uint16_t *left, int bd) {
-  const uint16x4_t dc = vdup_n_u16(1 << (bd - 1));
+  const uint16x8_t dc = vdupq_n_u16(1 << (bd - 1));
   (void)above;
   (void)left;
   dc_store_32x32(dst, stride, dc);
@@ -289,166 +276,1304 @@ void vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
 void vpx_highbd_d45_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
-  const uint16x8_t ABCDEFGH = vld1q_u16(above);
-  const uint16x8_t BCDEFGH0 = vld1q_u16(above + 1);
-  const uint16x8_t CDEFGH00 = vld1q_u16(above + 2);
-  const uint16x8_t avg1 = vhaddq_u16(ABCDEFGH, CDEFGH00);
-  const uint16x8_t avg2 = vrhaddq_u16(avg1, BCDEFGH0);
-  const uint16x4_t avg2_low = vget_low_u16(avg2);
-  const uint16x4_t avg2_high = vget_high_u16(avg2);
-  const uint16x4_t r1 = vext_u16(avg2_low, avg2_high, 1);
-  const uint16x4_t r2 = vext_u16(avg2_low, avg2_high, 2);
-  const uint16x4_t r3 = vext_u16(avg2_low, avg2_high, 3);
+  uint16x8_t a0, a1, a2, d0;
+  uint16_t a7;
   (void)left;
   (void)bd;
-  vst1_u16(dst, avg2_low);
-  dst += stride;
-  vst1_u16(dst, r1);
-  dst += stride;
-  vst1_u16(dst, r2);
-  dst += stride;
-  vst1_u16(dst, r3);
-  vst1q_lane_u16(dst + 3, ABCDEFGH, 7);
-}
 
-static INLINE void d45_store_8(uint16_t **dst, const ptrdiff_t stride,
-                               const uint16x8_t above_right, uint16x8_t *row) {
-  *row = vextq_u16(*row, above_right, 1);
-  vst1q_u16(*dst, *row);
-  *dst += stride;
+  a0 = vld1q_u16(above);
+  a7 = above[7];
+
+  // [ above[1], ..., above[6], x, x ]
+  a1 = vextq_u16(a0, a0, 1);
+  // [ above[2], ..., above[7], x, x ]
+  a2 = vextq_u16(a0, a0, 2);
+
+  // d0[0] = AVG3(above[0], above[1], above[2]);
+  // ...
+  // d0[5] = AVG3(above[5], above[6], above[7]);
+  // d0[6] = x (don't care)
+  // d0[7] = x (don't care)
+  d0 = vrhaddq_u16(vhaddq_u16(a0, a2), a1);
+
+  // We want:
+  // stride=0 [ d0[0], d0[1], d0[2],    d0[3] ]
+  // stride=1 [ d0[1], d0[2], d0[3],    d0[4] ]
+  // stride=2 [ d0[2], d0[3], d0[4],    d0[5] ]
+  // stride=2 [ d0[3], d0[4], d0[5], above[7] ]
+  vst1_u16(dst + 0 * stride, vget_low_u16(d0));
+  vst1_u16(dst + 1 * stride, vget_low_u16(vextq_u16(d0, d0, 1)));
+  vst1_u16(dst + 2 * stride, vget_low_u16(vextq_u16(d0, d0, 2)));
+  vst1_u16(dst + 3 * stride, vget_low_u16(vextq_u16(d0, d0, 3)));
+
+  // We stored d0[6] above, so fixup into above[7].
+  dst[3 * stride + 3] = a7;
 }
 
 void vpx_highbd_d45_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
-  const uint16x8_t A0 = vld1q_u16(above);
-  const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0), 3);
-  const uint16x8_t A1 = vld1q_u16(above + 1);
-  const uint16x8_t A2 = vld1q_u16(above + 2);
-  const uint16x8_t avg1 = vhaddq_u16(A0, A2);
-  uint16x8_t row = vrhaddq_u16(avg1, A1);
+  uint16x8_t ax0, a0, a1, a7, d0;
   (void)left;
   (void)bd;
 
-  vst1q_u16(dst, row);
-  dst += stride;
-  d45_store_8(&dst, stride, above_right, &row);
-  d45_store_8(&dst, stride, above_right, &row);
-  d45_store_8(&dst, stride, above_right, &row);
-  d45_store_8(&dst, stride, above_right, &row);
-  d45_store_8(&dst, stride, above_right, &row);
-  d45_store_8(&dst, stride, above_right, &row);
-  vst1q_u16(dst, above_right);
-}
-
-static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride,
-                                const uint16x8_t above_right, uint16x8_t *row_0,
-                                uint16x8_t *row_1) {
-  *row_0 = vextq_u16(*row_0, *row_1, 1);
-  *row_1 = vextq_u16(*row_1, above_right, 1);
-  vst1q_u16(*dst, *row_0);
-  *dst += 8;
-  vst1q_u16(*dst, *row_1);
-  *dst += stride - 8;
+  a0 = vld1q_u16(above + 0);
+  a1 = vld1q_u16(above + 1);
+  a7 = vld1q_dup_u16(above + 7);
+
+  // We want to calculate the AVG3 result in lanes 1-7 inclusive so we can
+  // shift in above[7] later, so shift a0 across by one to get the right
+  // inputs:
+  // [ x, above[0], ... , above[6] ]
+  ax0 = vextq_u16(a0, a0, 7);
+
+  // d0[0] = x (don't care)
+  // d0[1] = AVG3(above[0], above[1], above[2]);
+  // ...
+  // d0[7] = AVG3(above[6], above[7], above[8]);
+  d0 = vrhaddq_u16(vhaddq_u16(ax0, a1), a0);
+
+  // Undo the earlier ext, incrementally shift in duplicates of above[7].
+  vst1q_u16(dst + 0 * stride, vextq_u16(d0, a7, 1));
+  vst1q_u16(dst + 1 * stride, vextq_u16(d0, a7, 2));
+  vst1q_u16(dst + 2 * stride, vextq_u16(d0, a7, 3));
+  vst1q_u16(dst + 3 * stride, vextq_u16(d0, a7, 4));
+  vst1q_u16(dst + 4 * stride, vextq_u16(d0, a7, 5));
+  vst1q_u16(dst + 5 * stride, vextq_u16(d0, a7, 6));
+  vst1q_u16(dst + 6 * stride, vextq_u16(d0, a7, 7));
+  vst1q_u16(dst + 7 * stride, a7);
 }
 
 void vpx_highbd_d45_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
-  const uint16x8_t A0_0 = vld1q_u16(above);
-  const uint16x8_t A0_1 = vld1q_u16(above + 8);
-  const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_1), 3);
-  const uint16x8_t A1_0 = vld1q_u16(above + 1);
-  const uint16x8_t A1_1 = vld1q_u16(above + 9);
-  const uint16x8_t A2_0 = vld1q_u16(above + 2);
-  const uint16x8_t A2_1 = vld1q_u16(above + 10);
-  const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0);
-  const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1);
-  uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0);
-  uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1);
+  uint16x8_t ax0, a0, a1, a7, a8, a9, a15, d0[2];
   (void)left;
   (void)bd;
 
-  vst1q_u16(dst, row_0);
-  vst1q_u16(dst + 8, row_1);
-  dst += stride;
-  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
-  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
-  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
-  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
-  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
-  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
-  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
-  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
-  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
-  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
-  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
-  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
-  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
-  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
-  vst1q_u16(dst, above_right);
-  vst1q_u16(dst + 8, above_right);
+  a0 = vld1q_u16(above + 0);
+  a1 = vld1q_u16(above + 1);
+  a7 = vld1q_u16(above + 7);
+  a8 = vld1q_u16(above + 8);
+  a9 = vld1q_u16(above + 9);
+  a15 = vld1q_dup_u16(above + 15);
+
+  // [ x, above[0], ... , above[6] ]
+  ax0 = vextq_u16(a0, a0, 7);
+
+  // We have one unused lane here to leave room to shift in above[15] in the
+  // last lane:
+  // d0[0][1] = x (don't care)
+  // d0[0][1] = AVG3(above[0], above[1], above[2]);
+  // ...
+  // d0[0][7] = AVG3(above[6], above[7], above[8]);
+  // d0[1][0] = AVG3(above[7], above[8], above[9]);
+  // ...
+  // d0[1][7] = AVG3(above[14], above[15], above[16]);
+  d0[0] = vrhaddq_u16(vhaddq_u16(ax0, a1), a0);
+  d0[1] = vrhaddq_u16(vhaddq_u16(a7, a9), a8);
+
+  // Incrementally shift in duplicates of above[15].
+  vst1q_u16(dst + 0 * stride + 0, vextq_u16(d0[0], d0[1], 1));
+  vst1q_u16(dst + 0 * stride + 8, vextq_u16(d0[1], a15, 1));
+  vst1q_u16(dst + 1 * stride + 0, vextq_u16(d0[0], d0[1], 2));
+  vst1q_u16(dst + 1 * stride + 8, vextq_u16(d0[1], a15, 2));
+  vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 3));
+  vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[1], a15, 3));
+  vst1q_u16(dst + 3 * stride + 0, vextq_u16(d0[0], d0[1], 4));
+  vst1q_u16(dst + 3 * stride + 8, vextq_u16(d0[1], a15, 4));
+  vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 5));
+  vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[1], a15, 5));
+  vst1q_u16(dst + 5 * stride + 0, vextq_u16(d0[0], d0[1], 6));
+  vst1q_u16(dst + 5 * stride + 8, vextq_u16(d0[1], a15, 6));
+  vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 7));
+  vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[1], a15, 7));
+  vst1q_u16(dst + 7 * stride + 0, d0[1]);
+  vst1q_u16(dst + 7 * stride + 8, a15);
+
+  vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[1], a15, 1));
+  vst1q_u16(dst + 8 * stride + 8, a15);
+  vst1q_u16(dst + 9 * stride + 0, vextq_u16(d0[1], a15, 2));
+  vst1q_u16(dst + 9 * stride + 8, a15);
+  vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[1], a15, 3));
+  vst1q_u16(dst + 10 * stride + 8, a15);
+  vst1q_u16(dst + 11 * stride + 0, vextq_u16(d0[1], a15, 4));
+  vst1q_u16(dst + 11 * stride + 8, a15);
+  vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[1], a15, 5));
+  vst1q_u16(dst + 12 * stride + 8, a15);
+  vst1q_u16(dst + 13 * stride + 0, vextq_u16(d0[1], a15, 6));
+  vst1q_u16(dst + 13 * stride + 8, a15);
+  vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[1], a15, 7));
+  vst1q_u16(dst + 14 * stride + 8, a15);
+  vst1q_u16(dst + 15 * stride + 0, a15);
+  vst1q_u16(dst + 15 * stride + 8, a15);
 }
 
 void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
-  const uint16x8_t A0_0 = vld1q_u16(above);
-  const uint16x8_t A0_1 = vld1q_u16(above + 8);
-  const uint16x8_t A0_2 = vld1q_u16(above + 16);
-  const uint16x8_t A0_3 = vld1q_u16(above + 24);
-  const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_3), 3);
-  const uint16x8_t A1_0 = vld1q_u16(above + 1);
-  const uint16x8_t A1_1 = vld1q_u16(above + 9);
-  const uint16x8_t A1_2 = vld1q_u16(above + 17);
-  const uint16x8_t A1_3 = vld1q_u16(above + 25);
-  const uint16x8_t A2_0 = vld1q_u16(above + 2);
-  const uint16x8_t A2_1 = vld1q_u16(above + 10);
-  const uint16x8_t A2_2 = vld1q_u16(above + 18);
-  const uint16x8_t A2_3 = vld1q_u16(above + 26);
-  const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0);
-  const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1);
-  const uint16x8_t avg_2 = vhaddq_u16(A0_2, A2_2);
-  const uint16x8_t avg_3 = vhaddq_u16(A0_3, A2_3);
-  uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0);
-  uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1);
-  uint16x8_t row_2 = vrhaddq_u16(avg_2, A1_2);
-  uint16x8_t row_3 = vrhaddq_u16(avg_3, A1_3);
+  uint16x8_t ax0, a0, a1, a7, a8, a9, a15, a16, a17, a23, a24, a25, a31, d0[4];
   int i;
   (void)left;
   (void)bd;
 
-  vst1q_u16(dst, row_0);
-  dst += 8;
-  vst1q_u16(dst, row_1);
-  dst += 8;
-  vst1q_u16(dst, row_2);
-  dst += 8;
-  vst1q_u16(dst, row_3);
-  dst += stride - 24;
-
-  for (i = 0; i < 30; ++i) {
-    row_0 = vextq_u16(row_0, row_1, 1);
-    row_1 = vextq_u16(row_1, row_2, 1);
-    row_2 = vextq_u16(row_2, row_3, 1);
-    row_3 = vextq_u16(row_3, above_right, 1);
-    vst1q_u16(dst, row_0);
-    dst += 8;
-    vst1q_u16(dst, row_1);
-    dst += 8;
-    vst1q_u16(dst, row_2);
-    dst += 8;
-    vst1q_u16(dst, row_3);
-    dst += stride - 24;
+  a0 = vld1q_u16(above + 0);
+  a1 = vld1q_u16(above + 1);
+  a7 = vld1q_u16(above + 7);
+  a8 = vld1q_u16(above + 8);
+  a9 = vld1q_u16(above + 9);
+  a15 = vld1q_u16(above + 15);
+  a16 = vld1q_u16(above + 16);
+  a17 = vld1q_u16(above + 17);
+  a23 = vld1q_u16(above + 23);
+  a24 = vld1q_u16(above + 24);
+  a25 = vld1q_u16(above + 25);
+  a31 = vld1q_dup_u16(above + 31);
+
+  // [ x, above[0], ... , above[6] ]
+  ax0 = vextq_u16(a0, a0, 7);
+
+  d0[0] = vrhaddq_u16(vhaddq_u16(ax0, a1), a0);
+  d0[1] = vrhaddq_u16(vhaddq_u16(a7, a9), a8);
+  d0[2] = vrhaddq_u16(vhaddq_u16(a15, a17), a16);
+  d0[3] = vrhaddq_u16(vhaddq_u16(a23, a25), a24);
+
+  for (i = 0; i < 32; ++i) {
+    d0[0] = vextq_u16(d0[0], d0[1], 1);
+    d0[1] = vextq_u16(d0[1], d0[2], 1);
+    d0[2] = vextq_u16(d0[2], d0[3], 1);
+    d0[3] = vextq_u16(d0[3], a31, 1);
+    vst1q_u16(dst + 0, d0[0]);
+    vst1q_u16(dst + 8, d0[1]);
+    vst1q_u16(dst + 16, d0[2]);
+    vst1q_u16(dst + 24, d0[3]);
+    dst += stride;
   }
+}
 
-  vst1q_u16(dst, above_right);
-  dst += 8;
-  vst1q_u16(dst, above_right);
-  dst += 8;
-  vst1q_u16(dst, above_right);
-  dst += 8;
-  vst1q_u16(dst, above_right);
+// -----------------------------------------------------------------------------
+
+void vpx_highbd_d63_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  uint16x4_t a0, a1, a2, a3, d0, d1, d2, d3;
+  (void)left;
+  (void)bd;
+
+  a0 = vld1_u16(above + 0);
+  a1 = vld1_u16(above + 1);
+  a2 = vld1_u16(above + 2);
+  a3 = vld1_u16(above + 3);
+
+  d0 = vrhadd_u16(a0, a1);
+  d1 = vrhadd_u16(vhadd_u16(a0, a2), a1);
+  d2 = vrhadd_u16(a1, a2);
+  d3 = vrhadd_u16(vhadd_u16(a1, a3), a2);
+
+  // Note that here we are performing a full avg calculation for the final
+  // elements rather than storing a duplicate of above[3], which differs
+  // (correctly) from the general scheme employed by the bs={8,16,32}
+  // implementations in order to match the original C implementation.
+  vst1_u16(dst + 0 * stride, d0);
+  vst1_u16(dst + 1 * stride, d1);
+  vst1_u16(dst + 2 * stride, d2);
+  vst1_u16(dst + 3 * stride, d3);
+}
+
+void vpx_highbd_d63_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  uint16x8_t a0, a1, a2, a7, d0, d1, d0_ext, d1_ext;
+  (void)left;
+  (void)bd;
+
+  a0 = vld1q_u16(above + 0);
+  a1 = vld1q_u16(above + 1);
+  a2 = vld1q_u16(above + 2);
+  a7 = vld1q_dup_u16(above + 7);
+
+  d0 = vrhaddq_u16(a0, a1);
+  d1 = vrhaddq_u16(vhaddq_u16(a0, a2), a1);
+
+  // We want to store:
+  // stride=0 [ d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], d0[7] ]
+  // stride=1 [ d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], d1[7] ]
+  // stride=2 [ d0[1], d0[2], d0[3], d0[4], d0[5], d0[6],  a[7],  a[7] ]
+  // stride=3 [ d1[1], d1[2], d1[3], d1[4], d1[5], d1[6],  a[7],  a[7] ]
+  // stride=4 [ d0[2], d0[3], d0[4], d0[5], d0[6],  a[7],  a[7],  a[7] ]
+  // stride=5 [ d1[2], d1[3], d1[4], d1[5], d1[6],  a[7],  a[7],  a[7] ]
+  // stride=6 [ d0[3], d0[4], d0[5], d0[6],  a[7],  a[7],  a[7],  a[7] ]
+  // stride=7 [ d1[3], d1[4], d1[5], d1[6],  a[7],  a[7],  a[7],  a[7] ]
+  // Note in particular that d0[7] and d1[7] are only ever referenced in the
+  // stride=0 and stride=1 cases respectively, and in later strides are
+  // replaced by a copy of above[7]. These are equivalent if for i>7,
+  // above[i]==above[7], however that is not always the case.
+
+  // Strip out d0[7] and d1[7] so that we can replace it with an additional
+  // copy of above[7], the first vector here doesn't matter so just reuse
+  // d0/d1.
+  d0_ext = vextq_u16(d0, d0, 7);
+  d1_ext = vextq_u16(d1, d1, 7);
+
+  // Shuffle in duplicates of above[7] and store.
+  vst1q_u16(dst + 0 * stride, d0);
+  vst1q_u16(dst + 1 * stride, d1);
+  vst1q_u16(dst + 2 * stride, vextq_u16(d0_ext, a7, 2));
+  vst1q_u16(dst + 3 * stride, vextq_u16(d1_ext, a7, 2));
+  vst1q_u16(dst + 4 * stride, vextq_u16(d0_ext, a7, 3));
+  vst1q_u16(dst + 5 * stride, vextq_u16(d1_ext, a7, 3));
+  vst1q_u16(dst + 6 * stride, vextq_u16(d0_ext, a7, 4));
+  vst1q_u16(dst + 7 * stride, vextq_u16(d1_ext, a7, 4));
+}
+
+void vpx_highbd_d63_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+                                         const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  // See vpx_highbd_d63_predictor_8x8_neon for details on the implementation.
+  uint16x8_t a0, a1, a2, a8, a9, a10, a15, d0[2], d1[2], d0_ext, d1_ext;
+  (void)left;
+  (void)bd;
+
+  a0 = vld1q_u16(above + 0);
+  a1 = vld1q_u16(above + 1);
+  a2 = vld1q_u16(above + 2);
+  a8 = vld1q_u16(above + 8);
+  a9 = vld1q_u16(above + 9);
+  a10 = vld1q_u16(above + 10);
+  a15 = vld1q_dup_u16(above + 15);
+
+  d0[0] = vrhaddq_u16(a0, a1);
+  d0[1] = vrhaddq_u16(a8, a9);
+  d1[0] = vrhaddq_u16(vhaddq_u16(a0, a2), a1);
+  d1[1] = vrhaddq_u16(vhaddq_u16(a8, a10), a9);
+
+  // Strip out the final element of d0/d1 so that we can replace it with an
+  // additional copy of above[7], the first vector here doesn't matter so just
+  // reuse the same vector.
+  d0_ext = vextq_u16(d0[1], d0[1], 7);
+  d1_ext = vextq_u16(d1[1], d1[1], 7);
+
+  // Shuffle in duplicates of above[7] and store. Note that cases involving
+  // {d0,d1}_ext require an extra shift to undo the shifting out of the final
+  // element from above.
+  vst1q_u16(dst + 0 * stride + 0, d0[0]);
+  vst1q_u16(dst + 0 * stride + 8, d0[1]);
+  vst1q_u16(dst + 1 * stride + 0, d1[0]);
+  vst1q_u16(dst + 1 * stride + 8, d1[1]);
+  vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 1));
+  vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0_ext, a15, 2));
+  vst1q_u16(dst + 3 * stride + 0, vextq_u16(d1[0], d1[1], 1));
+  vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1_ext, a15, 2));
+  vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 2));
+  vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0_ext, a15, 3));
+  vst1q_u16(dst + 5 * stride + 0, vextq_u16(d1[0], d1[1], 2));
+  vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1_ext, a15, 3));
+  vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 3));
+  vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0_ext, a15, 4));
+  vst1q_u16(dst + 7 * stride + 0, vextq_u16(d1[0], d1[1], 3));
+  vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1_ext, a15, 4));
+  vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[0], d0[1], 4));
+  vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0_ext, a15, 5));
+  vst1q_u16(dst + 9 * stride + 0, vextq_u16(d1[0], d1[1], 4));
+  vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1_ext, a15, 5));
+  vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[0], d0[1], 5));
+  vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0_ext, a15, 6));
+  vst1q_u16(dst + 11 * stride + 0, vextq_u16(d1[0], d1[1], 5));
+  vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1_ext, a15, 6));
+  vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[0], d0[1], 6));
+  vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0_ext, a15, 7));
+  vst1q_u16(dst + 13 * stride + 0, vextq_u16(d1[0], d1[1], 6));
+  vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1_ext, a15, 7));
+  vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[0], d0[1], 7));
+  vst1q_u16(dst + 14 * stride + 8, a15);
+  vst1q_u16(dst + 15 * stride + 0, vextq_u16(d1[0], d1[1], 7));
+  vst1q_u16(dst + 15 * stride + 8, a15);
+}
+
+void vpx_highbd_d63_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+                                         const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  // See vpx_highbd_d63_predictor_8x8_neon for details on the implementation.
+  uint16x8_t a0, a1, a2, a8, a9, a10, a16, a17, a18, a24, a25, a26, a31, d0[4],
+      d1[4], d0_ext, d1_ext;
+  (void)left;
+  (void)bd;
+
+  a0 = vld1q_u16(above + 0);
+  a1 = vld1q_u16(above + 1);
+  a2 = vld1q_u16(above + 2);
+  a8 = vld1q_u16(above + 8);
+  a9 = vld1q_u16(above + 9);
+  a10 = vld1q_u16(above + 10);
+  a16 = vld1q_u16(above + 16);
+  a17 = vld1q_u16(above + 17);
+  a18 = vld1q_u16(above + 18);
+  a24 = vld1q_u16(above + 24);
+  a25 = vld1q_u16(above + 25);
+  a26 = vld1q_u16(above + 26);
+  a31 = vld1q_dup_u16(above + 31);
+
+  d0[0] = vrhaddq_u16(a0, a1);
+  d0[1] = vrhaddq_u16(a8, a9);
+  d0[2] = vrhaddq_u16(a16, a17);
+  d0[3] = vrhaddq_u16(a24, a25);
+  d1[0] = vrhaddq_u16(vhaddq_u16(a0, a2), a1);
+  d1[1] = vrhaddq_u16(vhaddq_u16(a8, a10), a9);
+  d1[2] = vrhaddq_u16(vhaddq_u16(a16, a18), a17);
+  d1[3] = vrhaddq_u16(vhaddq_u16(a24, a26), a25);
+
+  // Strip out the final element of d0/d1 so that we can replace it with an
+  // additional copy of above[7], the first vector here doesn't matter so just
+  // reuse the same vector.
+  d0_ext = vextq_u16(d0[3], d0[3], 7);
+  d1_ext = vextq_u16(d1[3], d1[3], 7);
+
+  // Shuffle in duplicates of above[7] and store. Note that cases involving
+  // {d0,d1}_ext require an extra shift to undo the shifting out of the final
+  // element from above.
+
+  vst1q_u16(dst + 0 * stride + 0, d0[0]);
+  vst1q_u16(dst + 0 * stride + 8, d0[1]);
+  vst1q_u16(dst + 0 * stride + 16, d0[2]);
+  vst1q_u16(dst + 0 * stride + 24, d0[3]);
+  vst1q_u16(dst + 1 * stride + 0, d1[0]);
+  vst1q_u16(dst + 1 * stride + 8, d1[1]);
+  vst1q_u16(dst + 1 * stride + 16, d1[2]);
+  vst1q_u16(dst + 1 * stride + 24, d1[3]);
+
+  vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 1));
+  vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[1], d0[2], 1));
+  vst1q_u16(dst + 2 * stride + 16, vextq_u16(d0[2], d0[3], 1));
+  vst1q_u16(dst + 2 * stride + 24, vextq_u16(d0_ext, a31, 2));
+  vst1q_u16(dst + 3 * stride + 0, vextq_u16(d1[0], d1[1], 1));
+  vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[1], d1[2], 1));
+  vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[2], d1[3], 1));
+  vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1_ext, a31, 2));
+
+  vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 2));
+  vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[1], d0[2], 2));
+  vst1q_u16(dst + 4 * stride + 16, vextq_u16(d0[2], d0[3], 2));
+  vst1q_u16(dst + 4 * stride + 24, vextq_u16(d0_ext, a31, 3));
+  vst1q_u16(dst + 5 * stride + 0, vextq_u16(d1[0], d1[1], 2));
+  vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1[1], d1[2], 2));
+  vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[2], d1[3], 2));
+  vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1_ext, a31, 3));
+
+  vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 3));
+  vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[1], d0[2], 3));
+  vst1q_u16(dst + 6 * stride + 16, vextq_u16(d0[2], d0[3], 3));
+  vst1q_u16(dst + 6 * stride + 24, vextq_u16(d0_ext, a31, 4));
+  vst1q_u16(dst + 7 * stride + 0, vextq_u16(d1[0], d1[1], 3));
+  vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1[1], d1[2], 3));
+  vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[2], d1[3], 3));
+  vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1_ext, a31, 4));
+
+  vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[0], d0[1], 4));
+  vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0[1], d0[2], 4));
+  vst1q_u16(dst + 8 * stride + 16, vextq_u16(d0[2], d0[3], 4));
+  vst1q_u16(dst + 8 * stride + 24, vextq_u16(d0_ext, a31, 5));
+  vst1q_u16(dst + 9 * stride + 0, vextq_u16(d1[0], d1[1], 4));
+  vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1[1], d1[2], 4));
+  vst1q_u16(dst + 9 * stride + 16, vextq_u16(d1[2], d1[3], 4));
+  vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1_ext, a31, 5));
+
+  vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[0], d0[1], 5));
+  vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0[1], d0[2], 5));
+  vst1q_u16(dst + 10 * stride + 16, vextq_u16(d0[2], d0[3], 5));
+  vst1q_u16(dst + 10 * stride + 24, vextq_u16(d0_ext, a31, 6));
+  vst1q_u16(dst + 11 * stride + 0, vextq_u16(d1[0], d1[1], 5));
+  vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1[1], d1[2], 5));
+  vst1q_u16(dst + 11 * stride + 16, vextq_u16(d1[2], d1[3], 5));
+  vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1_ext, a31, 6));
+
+  vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[0], d0[1], 6));
+  vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0[1], d0[2], 6));
+  vst1q_u16(dst + 12 * stride + 16, vextq_u16(d0[2], d0[3], 6));
+  vst1q_u16(dst + 12 * stride + 24, vextq_u16(d0_ext, a31, 7));
+  vst1q_u16(dst + 13 * stride + 0, vextq_u16(d1[0], d1[1], 6));
+  vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1[1], d1[2], 6));
+  vst1q_u16(dst + 13 * stride + 16, vextq_u16(d1[2], d1[3], 6));
+  vst1q_u16(dst + 13 * stride + 24, vextq_u16(d1_ext, a31, 7));
+
+  vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[0], d0[1], 7));
+  vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0[1], d0[2], 7));
+  vst1q_u16(dst + 14 * stride + 16, vextq_u16(d0[2], d0[3], 7));
+  vst1q_u16(dst + 14 * stride + 24, a31);
+  vst1q_u16(dst + 15 * stride + 0, vextq_u16(d1[0], d1[1], 7));
+  vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1[1], d1[2], 7));
+  vst1q_u16(dst + 15 * stride + 16, vextq_u16(d1[2], d1[3], 7));
+  vst1q_u16(dst + 15 * stride + 24, a31);
+
+  vst1q_u16(dst + 16 * stride + 0, d0[1]);
+  vst1q_u16(dst + 16 * stride + 8, d0[2]);
+  vst1q_u16(dst + 16 * stride + 16, vextq_u16(d0_ext, a31, 1));
+  vst1q_u16(dst + 16 * stride + 24, a31);
+  vst1q_u16(dst + 17 * stride + 0, d1[1]);
+  vst1q_u16(dst + 17 * stride + 8, d1[2]);
+  vst1q_u16(dst + 17 * stride + 16, vextq_u16(d1_ext, a31, 1));
+  vst1q_u16(dst + 17 * stride + 24, a31);
+
+  vst1q_u16(dst + 18 * stride + 0, vextq_u16(d0[1], d0[2], 1));
+  vst1q_u16(dst + 18 * stride + 8, vextq_u16(d0[2], d0[3], 1));
+  vst1q_u16(dst + 18 * stride + 16, vextq_u16(d0_ext, a31, 2));
+  vst1q_u16(dst + 18 * stride + 24, a31);
+  vst1q_u16(dst + 19 * stride + 0, vextq_u16(d1[1], d1[2], 1));
+  vst1q_u16(dst + 19 * stride + 8, vextq_u16(d1[2], d1[3], 1));
+  vst1q_u16(dst + 19 * stride + 16, vextq_u16(d1_ext, a31, 2));
+  vst1q_u16(dst + 19 * stride + 24, a31);
+
+  vst1q_u16(dst + 20 * stride + 0, vextq_u16(d0[1], d0[2], 2));
+  vst1q_u16(dst + 20 * stride + 8, vextq_u16(d0[2], d0[3], 2));
+  vst1q_u16(dst + 20 * stride + 16, vextq_u16(d0_ext, a31, 3));
+  vst1q_u16(dst + 20 * stride + 24, a31);
+  vst1q_u16(dst + 21 * stride + 0, vextq_u16(d1[1], d1[2], 2));
+  vst1q_u16(dst + 21 * stride + 8, vextq_u16(d1[2], d1[3], 2));
+  vst1q_u16(dst + 21 * stride + 16, vextq_u16(d1_ext, a31, 3));
+  vst1q_u16(dst + 21 * stride + 24, a31);
+
+  vst1q_u16(dst + 22 * stride + 0, vextq_u16(d0[1], d0[2], 3));
+  vst1q_u16(dst + 22 * stride + 8, vextq_u16(d0[2], d0[3], 3));
+  vst1q_u16(dst + 22 * stride + 16, vextq_u16(d0_ext, a31, 4));
+  vst1q_u16(dst + 22 * stride + 24, a31);
+  vst1q_u16(dst + 23 * stride + 0, vextq_u16(d1[1], d1[2], 3));
+  vst1q_u16(dst + 23 * stride + 8, vextq_u16(d1[2], d1[3], 3));
+  vst1q_u16(dst + 23 * stride + 16, vextq_u16(d1_ext, a31, 4));
+  vst1q_u16(dst + 23 * stride + 24, a31);
+
+  vst1q_u16(dst + 24 * stride + 0, vextq_u16(d0[1], d0[2], 4));
+  vst1q_u16(dst + 24 * stride + 8, vextq_u16(d0[2], d0[3], 4));
+  vst1q_u16(dst + 24 * stride + 16, vextq_u16(d0_ext, a31, 5));
+  vst1q_u16(dst + 24 * stride + 24, a31);
+  vst1q_u16(dst + 25 * stride + 0, vextq_u16(d1[1], d1[2], 4));
+  vst1q_u16(dst + 25 * stride + 8, vextq_u16(d1[2], d1[3], 4));
+  vst1q_u16(dst + 25 * stride + 16, vextq_u16(d1_ext, a31, 5));
+  vst1q_u16(dst + 25 * stride + 24, a31);
+
+  vst1q_u16(dst + 26 * stride + 0, vextq_u16(d0[1], d0[2], 5));
+  vst1q_u16(dst + 26 * stride + 8, vextq_u16(d0[2], d0[3], 5));
+  vst1q_u16(dst + 26 * stride + 16, vextq_u16(d0_ext, a31, 6));
+  vst1q_u16(dst + 26 * stride + 24, a31);
+  vst1q_u16(dst + 27 * stride + 0, vextq_u16(d1[1], d1[2], 5));
+  vst1q_u16(dst + 27 * stride + 8, vextq_u16(d1[2], d1[3], 5));
+  vst1q_u16(dst + 27 * stride + 16, vextq_u16(d1_ext, a31, 6));
+  vst1q_u16(dst + 27 * stride + 24, a31);
+
+  vst1q_u16(dst + 28 * stride + 0, vextq_u16(d0[1], d0[2], 6));
+  vst1q_u16(dst + 28 * stride + 8, vextq_u16(d0[2], d0[3], 6));
+  vst1q_u16(dst + 28 * stride + 16, vextq_u16(d0_ext, a31, 7));
+  vst1q_u16(dst + 28 * stride + 24, a31);
+  vst1q_u16(dst + 29 * stride + 0, vextq_u16(d1[1], d1[2], 6));
+  vst1q_u16(dst + 29 * stride + 8, vextq_u16(d1[2], d1[3], 6));
+  vst1q_u16(dst + 29 * stride + 16, vextq_u16(d1_ext, a31, 7));
+  vst1q_u16(dst + 29 * stride + 24, a31);
+
+  vst1q_u16(dst + 30 * stride + 0, vextq_u16(d0[1], d0[2], 7));
+  vst1q_u16(dst + 30 * stride + 8, vextq_u16(d0[2], d0[3], 7));
+  vst1q_u16(dst + 30 * stride + 16, a31);
+  vst1q_u16(dst + 30 * stride + 24, a31);
+  vst1q_u16(dst + 31 * stride + 0, vextq_u16(d1[1], d1[2], 7));
+  vst1q_u16(dst + 31 * stride + 8, vextq_u16(d1[2], d1[3], 7));
+  vst1q_u16(dst + 31 * stride + 16, a31);
+  vst1q_u16(dst + 31 * stride + 24, a31);
+}
+
+// -----------------------------------------------------------------------------
+
+void vpx_highbd_d117_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  uint16x4_t az, a0, l0az, l0, l1, azl0, col0, col0_even, col0_odd, d0, d1;
+  (void)bd;
+
+  az = vld1_u16(above - 1);
+  a0 = vld1_u16(above + 0);
+  // [ left[0], above[-1], above[0], above[1] ]
+  l0az = vext_u16(vld1_dup_u16(left), az, 3);
+
+  l0 = vld1_u16(left + 0);
+  // The last lane here is unused, reading left[4] could cause a buffer
+  // over-read, so just fill with a duplicate of left[0] to avoid needing to
+  // materialize a zero:
+  // [ left[1], left[2], left[3], x ]
+  l1 = vext_u16(l0, l0, 1);
+  // [ above[-1], left[0], left[1], left[2] ]
+  azl0 = vext_u16(vld1_dup_u16(above - 1), l0, 3);
+
+  d0 = vrhadd_u16(az, a0);
+  d1 = vrhadd_u16(vhadd_u16(l0az, a0), az);
+
+  col0 = vrhadd_u16(vhadd_u16(azl0, l1), l0);
+  col0_even = vdup_lane_u16(col0, 0);
+  col0_odd = vdup_lane_u16(col0, 1);
+
+  vst1_u16(dst + 0 * stride, d0);
+  vst1_u16(dst + 1 * stride, d1);
+  vst1_u16(dst + 2 * stride, vext_u16(col0_even, d0, 3));
+  vst1_u16(dst + 3 * stride, vext_u16(col0_odd, d1, 3));
+}
+
+void vpx_highbd_d117_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  uint16x8_t az, a0, l0az, l0, l1, azl0, col0, col0_even, col0_odd, d0, d1;
+  (void)bd;
+
+  az = vld1q_u16(above - 1);
+  a0 = vld1q_u16(above + 0);
+  // [ left[0], above[-1], ..., left[5] ]
+  l0az = vextq_u16(vld1q_dup_u16(left), az, 7);
+
+  l0 = vld1q_u16(left + 0);
+  // The last lane here is unused, reading left[8] could cause a buffer
+  // over-read, so just fill with a duplicate of left[0] to avoid needing to
+  // materialize a zero:
+  // [ left[1], ... , left[7], x ]
+  l1 = vextq_u16(l0, l0, 1);
+  // [ above[-1], left[0], ..., left[6] ]
+  azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);
+
+  // d0[0] = AVG2(above[-1], above[0])
+  // ...
+  // d0[7] = AVG2(above[6], above[7])
+  d0 = vrhaddq_u16(az, a0);
+
+  // d1[0] = AVG3(left[0], above[-1], above[0])
+  // d1[1] = AVG3(above[-1], above[0], above[1])
+  // ...
+  // d1[7] = AVG3(above[5], above[6], above[7])
+  d1 = vrhaddq_u16(vhaddq_u16(l0az, a0), az);
+
+  // The ext instruction shifts elements in from the end of the vector rather
+  // than the start, so reverse the vector to put the elements to be shifted in
+  // at the end:
+  // col0[7] = AVG3(above[-1], left[0], left[1])
+  // col0[6] = AVG3(left[0], left[1], left[2])
+  // ...
+  // col0[0] = AVG3(left[6], left[7], left[8])
+  col0 = vrhaddq_u16(vhaddq_u16(azl0, l1), l0);
+  col0 = vrev64q_u16(vextq_u16(col0, col0, 4));
+
+  // We don't care about the first parameter to this uzp since we only ever use
+  // the high three elements, we just use col0 again since it is already
+  // available:
+  // col0_even = [ x, x, x, x, x, col0[3], col0[5], col0[7] ]
+  // col0_odd = [ x, x, x, x, x, col0[2], col0[4], col0[6] ]
+  col0_even = vuzpq_u16(col0, col0).val[1];
+  col0_odd = vuzpq_u16(col0, col0).val[0];
+
+  // Incrementally shift more elements from col0 into d0/1:
+  // stride=0 [ d0[0],   d0[1],   d0[2],   d0[3], d0[4], d0[5], d0[6], d0[7] ]
+  // stride=1 [ d1[0],   d1[1],   d1[2],   d1[3], d1[4], d1[5], d1[6], d1[7] ]
+  // stride=2 [ col0[7], d0[0],   d0[1],   d0[2], d0[3], d0[4], d0[5], d0[6] ]
+  // stride=3 [ col0[6], d1[0],   d1[1],   d1[2], d1[3], d1[4], d1[5], d1[6] ]
+  // stride=4 [ col0[5], col0[7], d0[0],   d0[1], d0[2], d0[3], d0[4], d0[5] ]
+  // stride=5 [ col0[4], col0[6], d1[0],   d1[1], d1[2], d1[3], d1[4], d1[5] ]
+  // stride=6 [ col0[3], col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4] ]
+  // stride=7 [ col0[2], col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4] ]
+  vst1q_u16(dst + 0 * stride, d0);
+  vst1q_u16(dst + 1 * stride, d1);
+  vst1q_u16(dst + 2 * stride, vextq_u16(col0_even, d0, 7));
+  vst1q_u16(dst + 3 * stride, vextq_u16(col0_odd, d1, 7));
+  vst1q_u16(dst + 4 * stride, vextq_u16(col0_even, d0, 6));
+  vst1q_u16(dst + 5 * stride, vextq_u16(col0_odd, d1, 6));
+  vst1q_u16(dst + 6 * stride, vextq_u16(col0_even, d0, 5));
+  vst1q_u16(dst + 7 * stride, vextq_u16(col0_odd, d1, 5));
+}
+
+void vpx_highbd_d117_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  uint16x8_t az, a0, a6, a7, a8, l0az, l0, l1, l7, l8, l9, azl0, col0_lo,
+      col0_hi, col0_even, col0_odd, d0_lo, d0_hi, d1_lo, d1_hi;
+  (void)bd;
+
+  az = vld1q_u16(above - 1);
+  a0 = vld1q_u16(above + 0);
+  a6 = vld1q_u16(above + 6);
+  a7 = vld1q_u16(above + 7);
+  a8 = vld1q_u16(above + 8);
+  // [ left[0], above[-1], ..., left[5] ]
+  l0az = vextq_u16(vld1q_dup_u16(left), az, 7);
+
+  l0 = vld1q_u16(left + 0);
+  l1 = vld1q_u16(left + 1);
+  l7 = vld1q_u16(left + 7);
+  l8 = vld1q_u16(left + 8);
+  // The last lane here is unused, reading left[16] could cause a buffer
+  // over-read, so just fill with a duplicate of left[8] to avoid needing to
+  // materialize a zero:
+  // [ left[9], ... , left[15], x ]
+  l9 = vextq_u16(l8, l8, 1);
+  // [ above[-1], left[0], ..., left[6] ]
+  azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);
+
+  d0_lo = vrhaddq_u16(az, a0);
+  d0_hi = vrhaddq_u16(a7, a8);
+  d1_lo = vrhaddq_u16(vhaddq_u16(l0az, a0), az);
+  d1_hi = vrhaddq_u16(vhaddq_u16(a6, a8), a7);
+
+  col0_lo = vrhaddq_u16(vhaddq_u16(azl0, l1), l0);
+  col0_hi = vrhaddq_u16(vhaddq_u16(l7, l9), l8);
+
+  // Reverse within each vector, then swap the array indices in the uzp to
+  // complete the reversal across all 16 elements.
+  col0_lo = vrev64q_u16(vextq_u16(col0_lo, col0_lo, 4));
+  col0_hi = vrev64q_u16(vextq_u16(col0_hi, col0_hi, 4));
+  col0_even = vuzpq_u16(col0_hi, col0_lo).val[1];
+  col0_odd = vuzpq_u16(col0_hi, col0_lo).val[0];
+
+  vst1q_u16(dst + 0 * stride + 0, d0_lo);
+  vst1q_u16(dst + 0 * stride + 8, d0_hi);
+  vst1q_u16(dst + 1 * stride + 0, d1_lo);
+  vst1q_u16(dst + 1 * stride + 8, d1_hi);
+
+  vst1q_u16(dst + 2 * stride + 0, vextq_u16(col0_even, d0_lo, 7));
+  vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0_lo, d0_hi, 7));
+  vst1q_u16(dst + 3 * stride + 0, vextq_u16(col0_odd, d1_lo, 7));
+  vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1_lo, d1_hi, 7));
+
+  vst1q_u16(dst + 4 * stride + 0, vextq_u16(col0_even, d0_lo, 6));
+  vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0_lo, d0_hi, 6));
+  vst1q_u16(dst + 5 * stride + 0, vextq_u16(col0_odd, d1_lo, 6));
+  vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1_lo, d1_hi, 6));
+
+  vst1q_u16(dst + 6 * stride + 0, vextq_u16(col0_even, d0_lo, 5));
+  vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0_lo, d0_hi, 5));
+  vst1q_u16(dst + 7 * stride + 0, vextq_u16(col0_odd, d1_lo, 5));
+  vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1_lo, d1_hi, 5));
+
+  vst1q_u16(dst + 8 * stride + 0, vextq_u16(col0_even, d0_lo, 4));
+  vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0_lo, d0_hi, 4));
+  vst1q_u16(dst + 9 * stride + 0, vextq_u16(col0_odd, d1_lo, 4));
+  vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1_lo, d1_hi, 4));
+
+  vst1q_u16(dst + 10 * stride + 0, vextq_u16(col0_even, d0_lo, 3));
+  vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0_lo, d0_hi, 3));
+  vst1q_u16(dst + 11 * stride + 0, vextq_u16(col0_odd, d1_lo, 3));
+  vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1_lo, d1_hi, 3));
+
+  vst1q_u16(dst + 12 * stride + 0, vextq_u16(col0_even, d0_lo, 2));
+  vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0_lo, d0_hi, 2));
+  vst1q_u16(dst + 13 * stride + 0, vextq_u16(col0_odd, d1_lo, 2));
+  vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1_lo, d1_hi, 2));
+
+  vst1q_u16(dst + 14 * stride + 0, vextq_u16(col0_even, d0_lo, 1));
+  vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0_lo, d0_hi, 1));
+  vst1q_u16(dst + 15 * stride + 0, vextq_u16(col0_odd, d1_lo, 1));
+  vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1_lo, d1_hi, 1));
+}
+
+void vpx_highbd_d117_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  uint16x8_t az, a0, a6, a7, a8, a14, a15, a16, a22, a23, a24, l0az, l0, l1, l7,
+      l8, l9, l15, l16, l17, l23, l24, l25, azl0, d0[4], d1[4], col0[4],
+      col0_even[2], col0_odd[2];
+  (void)bd;
+
+  az = vld1q_u16(above - 1);
+  a0 = vld1q_u16(above + 0);
+  a6 = vld1q_u16(above + 6);
+  a7 = vld1q_u16(above + 7);
+  a8 = vld1q_u16(above + 8);
+  a14 = vld1q_u16(above + 14);
+  a15 = vld1q_u16(above + 15);
+  a16 = vld1q_u16(above + 16);
+  a22 = vld1q_u16(above + 22);
+  a23 = vld1q_u16(above + 23);
+  a24 = vld1q_u16(above + 24);
+  // [ left[0], above[-1], ..., left[5] ]
+  l0az = vextq_u16(vld1q_dup_u16(left), az, 7);
+
+  l0 = vld1q_u16(left + 0);
+  l1 = vld1q_u16(left + 1);
+  l7 = vld1q_u16(left + 7);
+  l8 = vld1q_u16(left + 8);
+  l9 = vld1q_u16(left + 9);
+  l15 = vld1q_u16(left + 15);
+  l16 = vld1q_u16(left + 16);
+  l17 = vld1q_u16(left + 17);
+  l23 = vld1q_u16(left + 23);
+  l24 = vld1q_u16(left + 24);
+  l25 = vld1q_u16(left + 25);
+  // The last lane here is unused, reading left[32] could cause a buffer
+  // over-read, so just fill with a duplicate of left[24] to avoid needing to
+  // materialize a zero:
+  // [ left[25], ... , left[31], x ]
+  l25 = vextq_u16(l24, l24, 1);
+  // [ above[-1], left[0], ..., left[6] ]
+  azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);
+
+  d0[0] = vrhaddq_u16(az, a0);
+  d0[1] = vrhaddq_u16(a7, a8);
+  d0[2] = vrhaddq_u16(a15, a16);
+  d0[3] = vrhaddq_u16(a23, a24);
+  d1[0] = vrhaddq_u16(vhaddq_u16(l0az, a0), az);
+  d1[1] = vrhaddq_u16(vhaddq_u16(a6, a8), a7);
+  d1[2] = vrhaddq_u16(vhaddq_u16(a14, a16), a15);
+  d1[3] = vrhaddq_u16(vhaddq_u16(a22, a24), a23);
+
+  col0[0] = vrhaddq_u16(vhaddq_u16(azl0, l1), l0);
+  col0[1] = vrhaddq_u16(vhaddq_u16(l7, l9), l8);
+  col0[2] = vrhaddq_u16(vhaddq_u16(l15, l17), l16);
+  col0[3] = vrhaddq_u16(vhaddq_u16(l23, l25), l24);
+
+  // Reverse within each vector, then swap the array indices in both the uzp
+  // and the col0_{even,odd} assignment to complete the reversal across all
+  // 32-elements.
+  col0[0] = vrev64q_u16(vextq_u16(col0[0], col0[0], 4));
+  col0[1] = vrev64q_u16(vextq_u16(col0[1], col0[1], 4));
+  col0[2] = vrev64q_u16(vextq_u16(col0[2], col0[2], 4));
+  col0[3] = vrev64q_u16(vextq_u16(col0[3], col0[3], 4));
+
+  col0_even[1] = vuzpq_u16(col0[1], col0[0]).val[1];
+  col0_even[0] = vuzpq_u16(col0[3], col0[2]).val[1];
+  col0_odd[1] = vuzpq_u16(col0[1], col0[0]).val[0];
+  col0_odd[0] = vuzpq_u16(col0[3], col0[2]).val[0];
+
+  vst1q_u16(dst + 0 * stride + 0, d0[0]);
+  vst1q_u16(dst + 0 * stride + 8, d0[1]);
+  vst1q_u16(dst + 0 * stride + 16, d0[2]);
+  vst1q_u16(dst + 0 * stride + 24, d0[3]);
+  vst1q_u16(dst + 1 * stride + 0, d1[0]);
+  vst1q_u16(dst + 1 * stride + 8, d1[1]);
+  vst1q_u16(dst + 1 * stride + 16, d1[2]);
+  vst1q_u16(dst + 1 * stride + 24, d1[3]);
+
+  vst1q_u16(dst + 2 * stride + 0, vextq_u16(col0_even[1], d0[0], 7));
+  vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[0], d0[1], 7));
+  vst1q_u16(dst + 2 * stride + 16, vextq_u16(d0[1], d0[2], 7));
+  vst1q_u16(dst + 2 * stride + 24, vextq_u16(d0[2], d0[3], 7));
+  vst1q_u16(dst + 3 * stride + 0, vextq_u16(col0_odd[1], d1[0], 7));
+  vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[0], d1[1], 7));
+  vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[1], d1[2], 7));
+  vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1[2], d1[3], 7));
+
+  vst1q_u16(dst + 4 * stride + 0, vextq_u16(col0_even[1], d0[0], 6));
+  vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[0], d0[1], 6));
+  vst1q_u16(dst + 4 * stride + 16, vextq_u16(d0[1], d0[2], 6));
+  vst1q_u16(dst + 4 * stride + 24, vextq_u16(d0[2], d0[3], 6));
+  vst1q_u16(dst + 5 * stride + 0, vextq_u16(col0_odd[1], d1[0], 6));
+  vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1[0], d1[1], 6));
+  vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[1], d1[2], 6));
+  vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1[2], d1[3], 6));
+
+  vst1q_u16(dst + 6 * stride + 0, vextq_u16(col0_even[1], d0[0], 5));
+  vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[0], d0[1], 5));
+  vst1q_u16(dst + 6 * stride + 16, vextq_u16(d0[1], d0[2], 5));
+  vst1q_u16(dst + 6 * stride + 24, vextq_u16(d0[2], d0[3], 5));
+  vst1q_u16(dst + 7 * stride + 0, vextq_u16(col0_odd[1], d1[0], 5));
+  vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1[0], d1[1], 5));
+  vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[1], d1[2], 5));
+  vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1[2], d1[3], 5));
+
+  vst1q_u16(dst + 8 * stride + 0, vextq_u16(col0_even[1], d0[0], 4));
+  vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0[0], d0[1], 4));
+  vst1q_u16(dst + 8 * stride + 16, vextq_u16(d0[1], d0[2], 4));
+  vst1q_u16(dst + 8 * stride + 24, vextq_u16(d0[2], d0[3], 4));
+  vst1q_u16(dst + 9 * stride + 0, vextq_u16(col0_odd[1], d1[0], 4));
+  vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1[0], d1[1], 4));
+  vst1q_u16(dst + 9 * stride + 16, vextq_u16(d1[1], d1[2], 4));
+  vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1[2], d1[3], 4));
+
+  vst1q_u16(dst + 10 * stride + 0, vextq_u16(col0_even[1], d0[0], 3));
+  vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0[0], d0[1], 3));
+  vst1q_u16(dst + 10 * stride + 16, vextq_u16(d0[1], d0[2], 3));
+  vst1q_u16(dst + 10 * stride + 24, vextq_u16(d0[2], d0[3], 3));
+  vst1q_u16(dst + 11 * stride + 0, vextq_u16(col0_odd[1], d1[0], 3));
+  vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1[0], d1[1], 3));
+  vst1q_u16(dst + 11 * stride + 16, vextq_u16(d1[1], d1[2], 3));
+  vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1[2], d1[3], 3));
+
+  vst1q_u16(dst + 12 * stride + 0, vextq_u16(col0_even[1], d0[0], 2));
+  vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0[0], d0[1], 2));
+  vst1q_u16(dst + 12 * stride + 16, vextq_u16(d0[1], d0[2], 2));
+  vst1q_u16(dst + 12 * stride + 24, vextq_u16(d0[2], d0[3], 2));
+  vst1q_u16(dst + 13 * stride + 0, vextq_u16(col0_odd[1], d1[0], 2));
+  vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1[0], d1[1], 2));
+  vst1q_u16(dst + 13 * stride + 16, vextq_u16(d1[1], d1[2], 2));
+  vst1q_u16(dst + 13 * stride + 24, vextq_u16(d1[2], d1[3], 2));
+
+  vst1q_u16(dst + 14 * stride + 0, vextq_u16(col0_even[1], d0[0], 1));
+  vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0[0], d0[1], 1));
+  vst1q_u16(dst + 14 * stride + 16, vextq_u16(d0[1], d0[2], 1));
+  vst1q_u16(dst + 14 * stride + 24, vextq_u16(d0[2], d0[3], 1));
+  vst1q_u16(dst + 15 * stride + 0, vextq_u16(col0_odd[1], d1[0], 1));
+  vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1[0], d1[1], 1));
+  vst1q_u16(dst + 15 * stride + 16, vextq_u16(d1[1], d1[2], 1));
+  vst1q_u16(dst + 15 * stride + 24, vextq_u16(d1[2], d1[3], 1));
+
+  vst1q_u16(dst + 16 * stride + 0, col0_even[1]);
+  vst1q_u16(dst + 16 * stride + 8, d0[0]);
+  vst1q_u16(dst + 16 * stride + 16, d0[1]);
+  vst1q_u16(dst + 16 * stride + 24, d0[2]);
+  vst1q_u16(dst + 17 * stride + 0, col0_odd[1]);
+  vst1q_u16(dst + 17 * stride + 8, d1[0]);
+  vst1q_u16(dst + 17 * stride + 16, d1[1]);
+  vst1q_u16(dst + 17 * stride + 24, d1[2]);
+
+  vst1q_u16(dst + 18 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 7));
+  vst1q_u16(dst + 18 * stride + 8, vextq_u16(col0_even[1], d0[0], 7));
+  vst1q_u16(dst + 18 * stride + 16, vextq_u16(d0[0], d0[1], 7));
+  vst1q_u16(dst + 18 * stride + 24, vextq_u16(d0[1], d0[2], 7));
+  vst1q_u16(dst + 19 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 7));
+  vst1q_u16(dst + 19 * stride + 8, vextq_u16(col0_odd[1], d1[0], 7));
+  vst1q_u16(dst + 19 * stride + 16, vextq_u16(d1[0], d1[1], 7));
+  vst1q_u16(dst + 19 * stride + 24, vextq_u16(d1[1], d1[2], 7));
+
+  vst1q_u16(dst + 20 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 6));
+  vst1q_u16(dst + 20 * stride + 8, vextq_u16(col0_even[1], d0[0], 6));
+  vst1q_u16(dst + 20 * stride + 16, vextq_u16(d0[0], d0[1], 6));
+  vst1q_u16(dst + 20 * stride + 24, vextq_u16(d0[1], d0[2], 6));
+  vst1q_u16(dst + 21 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 6));
+  vst1q_u16(dst + 21 * stride + 8, vextq_u16(col0_odd[1], d1[0], 6));
+  vst1q_u16(dst + 21 * stride + 16, vextq_u16(d1[0], d1[1], 6));
+  vst1q_u16(dst + 21 * stride + 24, vextq_u16(d1[1], d1[2], 6));
+
+  vst1q_u16(dst + 22 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 5));
+  vst1q_u16(dst + 22 * stride + 8, vextq_u16(col0_even[1], d0[0], 5));
+  vst1q_u16(dst + 22 * stride + 16, vextq_u16(d0[0], d0[1], 5));
+  vst1q_u16(dst + 22 * stride + 24, vextq_u16(d0[1], d0[2], 5));
+  vst1q_u16(dst + 23 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 5));
+  vst1q_u16(dst + 23 * stride + 8, vextq_u16(col0_odd[1], d1[0], 5));
+  vst1q_u16(dst + 23 * stride + 16, vextq_u16(d1[0], d1[1], 5));
+  vst1q_u16(dst + 23 * stride + 24, vextq_u16(d1[1], d1[2], 5));
+
+  vst1q_u16(dst + 24 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 4));
+  vst1q_u16(dst + 24 * stride + 8, vextq_u16(col0_even[1], d0[0], 4));
+  vst1q_u16(dst + 24 * stride + 16, vextq_u16(d0[0], d0[1], 4));
+  vst1q_u16(dst + 24 * stride + 24, vextq_u16(d0[1], d0[2], 4));
+  vst1q_u16(dst + 25 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 4));
+  vst1q_u16(dst + 25 * stride + 8, vextq_u16(col0_odd[1], d1[0], 4));
+  vst1q_u16(dst + 25 * stride + 16, vextq_u16(d1[0], d1[1], 4));
+  vst1q_u16(dst + 25 * stride + 24, vextq_u16(d1[1], d1[2], 4));
+
+  vst1q_u16(dst + 26 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 3));
+  vst1q_u16(dst + 26 * stride + 8, vextq_u16(col0_even[1], d0[0], 3));
+  vst1q_u16(dst + 26 * stride + 16, vextq_u16(d0[0], d0[1], 3));
+  vst1q_u16(dst + 26 * stride + 24, vextq_u16(d0[1], d0[2], 3));
+  vst1q_u16(dst + 27 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 3));
+  vst1q_u16(dst + 27 * stride + 8, vextq_u16(col0_odd[1], d1[0], 3));
+  vst1q_u16(dst + 27 * stride + 16, vextq_u16(d1[0], d1[1], 3));
+  vst1q_u16(dst + 27 * stride + 24, vextq_u16(d1[1], d1[2], 3));
+
+  vst1q_u16(dst + 28 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 2));
+  vst1q_u16(dst + 28 * stride + 8, vextq_u16(col0_even[1], d0[0], 2));
+  vst1q_u16(dst + 28 * stride + 16, vextq_u16(d0[0], d0[1], 2));
+  vst1q_u16(dst + 28 * stride + 24, vextq_u16(d0[1], d0[2], 2));
+  vst1q_u16(dst + 29 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 2));
+  vst1q_u16(dst + 29 * stride + 8, vextq_u16(col0_odd[1], d1[0], 2));
+  vst1q_u16(dst + 29 * stride + 16, vextq_u16(d1[0], d1[1], 2));
+  vst1q_u16(dst + 29 * stride + 24, vextq_u16(d1[1], d1[2], 2));
+
+  vst1q_u16(dst + 30 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 1));
+  vst1q_u16(dst + 30 * stride + 8, vextq_u16(col0_even[1], d0[0], 1));
+  vst1q_u16(dst + 30 * stride + 16, vextq_u16(d0[0], d0[1], 1));
+  vst1q_u16(dst + 30 * stride + 24, vextq_u16(d0[1], d0[2], 1));
+  vst1q_u16(dst + 31 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 1));
+  vst1q_u16(dst + 31 * stride + 8, vextq_u16(col0_odd[1], d1[0], 1));
+  vst1q_u16(dst + 31 * stride + 16, vextq_u16(d1[0], d1[1], 1));
+  vst1q_u16(dst + 31 * stride + 24, vextq_u16(d1[1], d1[2], 1));
+}
+
+// -----------------------------------------------------------------------------
+
+void vpx_highbd_d153_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  // See vpx_highbd_d153_predictor_8x8_neon for details on the implementation.
+  uint16x4_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d20_lo, d20_hi;
+  (void)bd;
+
+  az = vld1_u16(above - 1);
+  a0 = vld1_u16(above + 0);
+  // [ left[0], above[-1], above[0], above[1] ]
+  l0az = vext_u16(vld1_dup_u16(left), az, 3);
+
+  l0 = vld1_u16(left);
+  // The last lane here is unused, reading left[4] could cause a buffer
+  // over-read, so just fill with a duplicate of left[0] to avoid needing to
+  // materialize a zero:
+  // [ left[1], left[2], left[3], x ]
+  l1 = vext_u16(l0, l0, 1);
+  // [ above[-1], left[0], left[1], left[2] ]
+  azl0 = vext_u16(vld1_dup_u16(above - 1), l0, 3);
+
+  d0 = vrhadd_u16(azl0, l0);
+  d1 = vrhadd_u16(vhadd_u16(l0az, a0), az);
+  d2 = vrhadd_u16(vhadd_u16(azl0, l1), l0);
+
+  d20_lo = vzip_u16(vrev64_u16(d2), vrev64_u16(d0)).val[0];
+  d20_hi = vzip_u16(vrev64_u16(d2), vrev64_u16(d0)).val[1];
+
+  // Incrementally shift more elements from d0/d2 reversed into d1:
+  // stride=0 [ d0[0], d1[0], d1[1], d1[2] ]
+  // stride=1 [ d0[1], d2[0], d0[0], d1[0] ]
+  // stride=2 [ d0[2], d2[1], d0[1], d2[0] ]
+  // stride=3 [ d0[3], d2[2], d0[2], d2[1] ]
+  vst1_u16(dst + 0 * stride, vext_u16(d20_hi, d1, 3));
+  vst1_u16(dst + 1 * stride, vext_u16(d20_hi, d1, 1));
+  vst1_u16(dst + 2 * stride, vext_u16(d20_lo, d20_hi, 3));
+  vst1_u16(dst + 3 * stride, vext_u16(d20_lo, d20_hi, 1));
+}
+
+void vpx_highbd_d153_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  uint16x8_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d0_rev, d2_rev, d20_lo,
+      d20_hi;
+  (void)bd;
+
+  az = vld1q_u16(above - 1);
+  a0 = vld1q_u16(above + 0);
+  // [ left[0], above[-1], ... , above[5] ]
+  l0az = vextq_u16(vld1q_dup_u16(left), az, 7);
+
+  l0 = vld1q_u16(left);
+  // The last lane here is unused, reading left[8] could cause a buffer
+  // over-read, so just fill with a duplicate of left[0] to avoid needing to
+  // materialize a zero:
+  // [ left[1], ... , left[7], x ]
+  l1 = vextq_u16(l0, l0, 1);
+  // [ above[-1], left[0], ... , left[6] ]
+  azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);
+
+  // d0[0] = AVG2(above[-1], left[0])
+  // d0[1] = AVG2(left[0], left[1])
+  // ...
+  // d0[7] = AVG2(left[6], left[7])
+  d0 = vrhaddq_u16(azl0, l0);
+
+  // d1[0] = AVG3(left[0], above[-1], above[0])
+  // d1[1] = AVG3(above[-1], above[0], above[1])
+  // ...
+  // d1[7] = AVG3(above[5], above[6], above[7])
+  d1 = vrhaddq_u16(vhaddq_u16(l0az, a0), az);
+
+  // d2[0] = AVG3(above[-1], left[0], left[1])
+  // d2[1] = AVG3(left[0], left[1], left[2])
+  // ...
+  // d2[7] = AVG3(left[6], left[7], left[8])
+  d2 = vrhaddq_u16(vhaddq_u16(azl0, l1), l0);
+
+  // The ext instruction shifts elements in from the end of the vector rather
+  // than the start, so reverse the vectors to put the elements to be shifted
+  // in at the end:
+  d0_rev = vrev64q_u16(vextq_u16(d0, d0, 4));
+  d2_rev = vrev64q_u16(vextq_u16(d2, d2, 4));
+
+  d20_lo = vzipq_u16(d2_rev, d0_rev).val[0];
+  d20_hi = vzipq_u16(d2_rev, d0_rev).val[1];
+
+  // Incrementally shift more elements from d0/d2 reversed into d1:
+  // stride=0 [ d0[0], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ]
+  // stride=1 [ d0[1], d2[0], d0[0], d1[0], d1[1], d1[2], d1[3], d1[4] ]
+  // stride=2 [ d0[2], d2[1], d0[1], d2[0], d0[0], d1[0], d1[1], d1[2] ]
+  // stride=3 [ d0[3], d2[2], d0[2], d2[1], d0[1], d2[0], d0[0], d1[0] ]
+  // stride=4 [ d0[4], d2[3], d0[3], d2[2], d0[2], d2[1], d0[1], d2[0] ]
+  // stride=5 [ d0[5], d2[4], d0[4], d2[3], d0[3], d2[2], d0[2], d2[1] ]
+  // stride=6 [ d0[6], d2[5], d0[5], d2[4], d0[4], d2[3], d0[3], d2[2] ]
+  // stride=7 [ d0[7], d2[6], d0[6], d2[5], d0[5], d2[4], d0[4], d2[3] ]
+  vst1q_u16(dst + 0 * stride, vextq_u16(d20_hi, d1, 7));
+  vst1q_u16(dst + 1 * stride, vextq_u16(d20_hi, d1, 5));
+  vst1q_u16(dst + 2 * stride, vextq_u16(d20_hi, d1, 3));
+  vst1q_u16(dst + 3 * stride, vextq_u16(d20_hi, d1, 1));
+  vst1q_u16(dst + 4 * stride, vextq_u16(d20_lo, d20_hi, 7));
+  vst1q_u16(dst + 5 * stride, vextq_u16(d20_lo, d20_hi, 5));
+  vst1q_u16(dst + 6 * stride, vextq_u16(d20_lo, d20_hi, 3));
+  vst1q_u16(dst + 7 * stride, vextq_u16(d20_lo, d20_hi, 1));
+}
+
+void vpx_highbd_d153_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  // See vpx_highbd_d153_predictor_8x8_neon for details on the implementation.
+  uint16x8_t az, a0, a6, a7, a8, l0az, l0, l1, l7, l8, l9, azl0, d0[2], d1[2],
+      d2[2], d20[4];
+  (void)bd;
+
+  az = vld1q_u16(above - 1);
+  a0 = vld1q_u16(above + 0);
+  a6 = vld1q_u16(above + 6);
+  a7 = vld1q_u16(above + 7);
+  a8 = vld1q_u16(above + 8);
+  // [ left[0], above[-1], ... , above[13] ]
+  l0az = vextq_u16(vld1q_dup_u16(left), az, 7);
+
+  l0 = vld1q_u16(left + 0);
+  l1 = vld1q_u16(left + 1);
+  l7 = vld1q_u16(left + 7);
+  l8 = vld1q_u16(left + 8);
+  // The last lane here is unused, reading left[16] could cause a buffer
+  // over-read, so just fill with a duplicate of left[8] to avoid needing to
+  // materialize a zero:
+  // [ left[9], ... , left[15], x ]
+  l9 = vextq_u16(l8, l8, 1);
+  // [ above[-1], left[0], ... , left[14] ]
+  azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);
+
+  d0[0] = vrhaddq_u16(azl0, l0);
+  d0[1] = vrhaddq_u16(l7, l8);
+  d1[0] = vrhaddq_u16(vhaddq_u16(l0az, a0), az);
+  d1[1] = vrhaddq_u16(vhaddq_u16(a6, a8), a7);
+  d2[0] = vrhaddq_u16(vhaddq_u16(azl0, l1), l0);
+  d2[1] = vrhaddq_u16(vhaddq_u16(l7, l9), l8);
+
+  d0[0] = vrev64q_u16(vextq_u16(d0[0], d0[0], 4));
+  d0[1] = vrev64q_u16(vextq_u16(d0[1], d0[1], 4));
+  d2[0] = vrev64q_u16(vextq_u16(d2[0], d2[0], 4));
+  d2[1] = vrev64q_u16(vextq_u16(d2[1], d2[1], 4));
+
+  d20[0] = vzipq_u16(d2[1], d0[1]).val[0];
+  d20[1] = vzipq_u16(d2[1], d0[1]).val[1];
+  d20[2] = vzipq_u16(d2[0], d0[0]).val[0];
+  d20[3] = vzipq_u16(d2[0], d0[0]).val[1];
+
+  vst1q_u16(dst + 0 * stride + 0, vextq_u16(d20[3], d1[0], 7));
+  vst1q_u16(dst + 0 * stride + 8, vextq_u16(d1[0], d1[1], 7));
+  vst1q_u16(dst + 1 * stride + 0, vextq_u16(d20[3], d1[0], 5));
+  vst1q_u16(dst + 1 * stride + 8, vextq_u16(d1[0], d1[1], 5));
+  vst1q_u16(dst + 2 * stride + 0, vextq_u16(d20[3], d1[0], 3));
+  vst1q_u16(dst + 2 * stride + 8, vextq_u16(d1[0], d1[1], 3));
+  vst1q_u16(dst + 3 * stride + 0, vextq_u16(d20[3], d1[0], 1));
+  vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[0], d1[1], 1));
+
+  vst1q_u16(dst + 4 * stride + 0, vextq_u16(d20[2], d20[3], 7));
+  vst1q_u16(dst + 4 * stride + 8, vextq_u16(d20[3], d1[0], 7));
+  vst1q_u16(dst + 5 * stride + 0, vextq_u16(d20[2], d20[3], 5));
+  vst1q_u16(dst + 5 * stride + 8, vextq_u16(d20[3], d1[0], 5));
+  vst1q_u16(dst + 6 * stride + 0, vextq_u16(d20[2], d20[3], 3));
+  vst1q_u16(dst + 6 * stride + 8, vextq_u16(d20[3], d1[0], 3));
+  vst1q_u16(dst + 7 * stride + 0, vextq_u16(d20[2], d20[3], 1));
+  vst1q_u16(dst + 7 * stride + 8, vextq_u16(d20[3], d1[0], 1));
+
+  vst1q_u16(dst + 8 * stride + 0, vextq_u16(d20[1], d20[2], 7));
+  vst1q_u16(dst + 8 * stride + 8, vextq_u16(d20[2], d20[3], 7));
+  vst1q_u16(dst + 9 * stride + 0, vextq_u16(d20[1], d20[2], 5));
+  vst1q_u16(dst + 9 * stride + 8, vextq_u16(d20[2], d20[3], 5));
+  vst1q_u16(dst + 10 * stride + 0, vextq_u16(d20[1], d20[2], 3));
+  vst1q_u16(dst + 10 * stride + 8, vextq_u16(d20[2], d20[3], 3));
+  vst1q_u16(dst + 11 * stride + 0, vextq_u16(d20[1], d20[2], 1));
+  vst1q_u16(dst + 11 * stride + 8, vextq_u16(d20[2], d20[3], 1));
+
+  vst1q_u16(dst + 12 * stride + 0, vextq_u16(d20[0], d20[1], 7));
+  vst1q_u16(dst + 12 * stride + 8, vextq_u16(d20[1], d20[2], 7));
+  vst1q_u16(dst + 13 * stride + 0, vextq_u16(d20[0], d20[1], 5));
+  vst1q_u16(dst + 13 * stride + 8, vextq_u16(d20[1], d20[2], 5));
+  vst1q_u16(dst + 14 * stride + 0, vextq_u16(d20[0], d20[1], 3));
+  vst1q_u16(dst + 14 * stride + 8, vextq_u16(d20[1], d20[2], 3));
+  vst1q_u16(dst + 15 * stride + 0, vextq_u16(d20[0], d20[1], 1));
+  vst1q_u16(dst + 15 * stride + 8, vextq_u16(d20[1], d20[2], 1));
+}
+
+void vpx_highbd_d153_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  // See vpx_highbd_d153_predictor_8x8_neon for details on the implementation.
+  uint16x8_t az, a0, a6, a7, a8, a14, a15, a16, a22, a23, a24, l0az, l0, l1, l7,
+      l8, l9, l15, l16, l17, l23, l24, l25, azl0, d0[4], d1[4], d2[4], d20[8];
+  (void)bd;
+
+  az = vld1q_u16(above - 1);
+  a0 = vld1q_u16(above + 0);
+  a6 = vld1q_u16(above + 6);
+  a7 = vld1q_u16(above + 7);
+  a8 = vld1q_u16(above + 8);
+  a14 = vld1q_u16(above + 14);
+  a15 = vld1q_u16(above + 15);
+  a16 = vld1q_u16(above + 16);
+  a22 = vld1q_u16(above + 22);
+  a23 = vld1q_u16(above + 23);
+  a24 = vld1q_u16(above + 24);
+  // [ left[0], above[-1], ... , above[13] ]
+  l0az = vextq_u16(vld1q_dup_u16(left), az, 7);
+
+  l0 = vld1q_u16(left + 0);
+  l1 = vld1q_u16(left + 1);
+  l7 = vld1q_u16(left + 7);
+  l8 = vld1q_u16(left + 8);
+  l9 = vld1q_u16(left + 9);
+  l15 = vld1q_u16(left + 15);
+  l16 = vld1q_u16(left + 16);
+  l17 = vld1q_u16(left + 17);
+  l23 = vld1q_u16(left + 23);
+  l24 = vld1q_u16(left + 24);
+  // The last lane here is unused, reading left[32] could cause a buffer
+  // over-read, so just fill with a duplicate of left[24] to avoid needing to
+  // materialize a zero:
+  // [ left[25], ... , left[31], x ]
+  l25 = vextq_u16(l24, l24, 1);
+  // [ above[-1], left[0], ... , left[14] ]
+  azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);
+
+  d0[0] = vrhaddq_u16(azl0, l0);
+  d0[1] = vrhaddq_u16(l7, l8);
+  d0[2] = vrhaddq_u16(l15, l16);
+  d0[3] = vrhaddq_u16(l23, l24);
+
+  d1[0] = vrhaddq_u16(vhaddq_u16(l0az, a0), az);
+  d1[1] = vrhaddq_u16(vhaddq_u16(a6, a8), a7);
+  d1[2] = vrhaddq_u16(vhaddq_u16(a14, a16), a15);
+  d1[3] = vrhaddq_u16(vhaddq_u16(a22, a24), a23);
+
+  d2[0] = vrhaddq_u16(vhaddq_u16(azl0, l1), l0);
+  d2[1] = vrhaddq_u16(vhaddq_u16(l7, l9), l8);
+  d2[2] = vrhaddq_u16(vhaddq_u16(l15, l17), l16);
+  d2[3] = vrhaddq_u16(vhaddq_u16(l23, l25), l24);
+
+  d0[0] = vrev64q_u16(vextq_u16(d0[0], d0[0], 4));
+  d0[1] = vrev64q_u16(vextq_u16(d0[1], d0[1], 4));
+  d0[2] = vrev64q_u16(vextq_u16(d0[2], d0[2], 4));
+  d0[3] = vrev64q_u16(vextq_u16(d0[3], d0[3], 4));
+  d2[0] = vrev64q_u16(vextq_u16(d2[0], d2[0], 4));
+  d2[1] = vrev64q_u16(vextq_u16(d2[1], d2[1], 4));
+  d2[2] = vrev64q_u16(vextq_u16(d2[2], d2[2], 4));
+  d2[3] = vrev64q_u16(vextq_u16(d2[3], d2[3], 4));
+
+  d20[0] = vzipq_u16(d2[3], d0[3]).val[0];
+  d20[1] = vzipq_u16(d2[3], d0[3]).val[1];
+  d20[2] = vzipq_u16(d2[2], d0[2]).val[0];
+  d20[3] = vzipq_u16(d2[2], d0[2]).val[1];
+  d20[4] = vzipq_u16(d2[1], d0[1]).val[0];
+  d20[5] = vzipq_u16(d2[1], d0[1]).val[1];
+  d20[6] = vzipq_u16(d2[0], d0[0]).val[0];
+  d20[7] = vzipq_u16(d2[0], d0[0]).val[1];
+
+  vst1q_u16(dst + 0 * stride + 0, vextq_u16(d20[7], d1[0], 7));
+  vst1q_u16(dst + 0 * stride + 8, vextq_u16(d1[0], d1[1], 7));
+  vst1q_u16(dst + 0 * stride + 16, vextq_u16(d1[1], d1[2], 7));
+  vst1q_u16(dst + 0 * stride + 24, vextq_u16(d1[2], d1[3], 7));
+  vst1q_u16(dst + 1 * stride + 0, vextq_u16(d20[7], d1[0], 5));
+  vst1q_u16(dst + 1 * stride + 8, vextq_u16(d1[0], d1[1], 5));
+  vst1q_u16(dst + 1 * stride + 16, vextq_u16(d1[1], d1[2], 5));
+  vst1q_u16(dst + 1 * stride + 24, vextq_u16(d1[2], d1[3], 5));
+  vst1q_u16(dst + 2 * stride + 0, vextq_u16(d20[7], d1[0], 3));
+  vst1q_u16(dst + 2 * stride + 8, vextq_u16(d1[0], d1[1], 3));
+  vst1q_u16(dst + 2 * stride + 16, vextq_u16(d1[1], d1[2], 3));
+  vst1q_u16(dst + 2 * stride + 24, vextq_u16(d1[2], d1[3], 3));
+  vst1q_u16(dst + 3 * stride + 0, vextq_u16(d20[7], d1[0], 1));
+  vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[0], d1[1], 1));
+  vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[1], d1[2], 1));
+  vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1[2], d1[3], 1));
+
+  vst1q_u16(dst + 4 * stride + 0, vextq_u16(d20[6], d20[7], 7));
+  vst1q_u16(dst + 4 * stride + 8, vextq_u16(d20[7], d1[0], 7));
+  vst1q_u16(dst + 4 * stride + 16, vextq_u16(d1[0], d1[1], 7));
+  vst1q_u16(dst + 4 * stride + 24, vextq_u16(d1[1], d1[2], 7));
+  vst1q_u16(dst + 5 * stride + 0, vextq_u16(d20[6], d20[7], 5));
+  vst1q_u16(dst + 5 * stride + 8, vextq_u16(d20[7], d1[0], 5));
+  vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[0], d1[1], 5));
+  vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1[1], d1[2], 5));
+  vst1q_u16(dst + 6 * stride + 0, vextq_u16(d20[6], d20[7], 3));
+  vst1q_u16(dst + 6 * stride + 8, vextq_u16(d20[7], d1[0], 3));
+  vst1q_u16(dst + 6 * stride + 16, vextq_u16(d1[0], d1[1], 3));
+  vst1q_u16(dst + 6 * stride + 24, vextq_u16(d1[1], d1[2], 3));
+  vst1q_u16(dst + 7 * stride + 0, vextq_u16(d20[6], d20[7], 1));
+  vst1q_u16(dst + 7 * stride + 8, vextq_u16(d20[7], d1[0], 1));
+  vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[0], d1[1], 1));
+  vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1[1], d1[2], 1));
+
+  vst1q_u16(dst + 8 * stride + 0, vextq_u16(d20[5], d20[6], 7));
+  vst1q_u16(dst + 8 * stride + 8, vextq_u16(d20[6], d20[7], 7));
+  vst1q_u16(dst + 8 * stride + 16, vextq_u16(d20[7], d1[0], 7));
+  vst1q_u16(dst + 8 * stride + 24, vextq_u16(d1[0], d1[1], 7));
+  vst1q_u16(dst + 9 * stride + 0, vextq_u16(d20[5], d20[6], 5));
+  vst1q_u16(dst + 9 * stride + 8, vextq_u16(d20[6], d20[7], 5));
+  vst1q_u16(dst + 9 * stride + 16, vextq_u16(d20[7], d1[0], 5));
+  vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1[0], d1[1], 5));
+  vst1q_u16(dst + 10 * stride + 0, vextq_u16(d20[5], d20[6], 3));
+  vst1q_u16(dst + 10 * stride + 8, vextq_u16(d20[6], d20[7], 3));
+  vst1q_u16(dst + 10 * stride + 16, vextq_u16(d20[7], d1[0], 3));
+  vst1q_u16(dst + 10 * stride + 24, vextq_u16(d1[0], d1[1], 3));
+  vst1q_u16(dst + 11 * stride + 0, vextq_u16(d20[5], d20[6], 1));
+  vst1q_u16(dst + 11 * stride + 8, vextq_u16(d20[6], d20[7], 1));
+  vst1q_u16(dst + 11 * stride + 16, vextq_u16(d20[7], d1[0], 1));
+  vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1[0], d1[1], 1));
+
+  vst1q_u16(dst + 12 * stride + 0, vextq_u16(d20[4], d20[5], 7));
+  vst1q_u16(dst + 12 * stride + 8, vextq_u16(d20[5], d20[6], 7));
+  vst1q_u16(dst + 12 * stride + 16, vextq_u16(d20[6], d20[7], 7));
+  vst1q_u16(dst + 12 * stride + 24, vextq_u16(d20[7], d1[0], 7));
+  vst1q_u16(dst + 13 * stride + 0, vextq_u16(d20[4], d20[5], 5));
+  vst1q_u16(dst + 13 * stride + 8, vextq_u16(d20[5], d20[6], 5));
+  vst1q_u16(dst + 13 * stride + 16, vextq_u16(d20[6], d20[7], 5));
+  vst1q_u16(dst + 13 * stride + 24, vextq_u16(d20[7], d1[0], 5));
+  vst1q_u16(dst + 14 * stride + 0, vextq_u16(d20[4], d20[5], 3));
+  vst1q_u16(dst + 14 * stride + 8, vextq_u16(d20[5], d20[6], 3));
+  vst1q_u16(dst + 14 * stride + 16, vextq_u16(d20[6], d20[7], 3));
+  vst1q_u16(dst + 14 * stride + 24, vextq_u16(d20[7], d1[0], 3));
+  vst1q_u16(dst + 15 * stride + 0, vextq_u16(d20[4], d20[5], 1));
+  vst1q_u16(dst + 15 * stride + 8, vextq_u16(d20[5], d20[6], 1));
+  vst1q_u16(dst + 15 * stride + 16, vextq_u16(d20[6], d20[7], 1));
+  vst1q_u16(dst + 15 * stride + 24, vextq_u16(d20[7], d1[0], 1));
+
+  vst1q_u16(dst + 16 * stride + 0, vextq_u16(d20[3], d20[4], 7));
+  vst1q_u16(dst + 16 * stride + 8, vextq_u16(d20[4], d20[5], 7));
+  vst1q_u16(dst + 16 * stride + 16, vextq_u16(d20[5], d20[6], 7));
+  vst1q_u16(dst + 16 * stride + 24, vextq_u16(d20[6], d20[7], 7));
+  vst1q_u16(dst + 17 * stride + 0, vextq_u16(d20[3], d20[4], 5));
+  vst1q_u16(dst + 17 * stride + 8, vextq_u16(d20[4], d20[5], 5));
+  vst1q_u16(dst + 17 * stride + 16, vextq_u16(d20[5], d20[6], 5));
+  vst1q_u16(dst + 17 * stride + 24, vextq_u16(d20[6], d20[7], 5));
+  vst1q_u16(dst + 18 * stride + 0, vextq_u16(d20[3], d20[4], 3));
+  vst1q_u16(dst + 18 * stride + 8, vextq_u16(d20[4], d20[5], 3));
+  vst1q_u16(dst + 18 * stride + 16, vextq_u16(d20[5], d20[6], 3));
+  vst1q_u16(dst + 18 * stride + 24, vextq_u16(d20[6], d20[7], 3));
+  vst1q_u16(dst + 19 * stride + 0, vextq_u16(d20[3], d20[4], 1));
+  vst1q_u16(dst + 19 * stride + 8, vextq_u16(d20[4], d20[5], 1));
+  vst1q_u16(dst + 19 * stride + 16, vextq_u16(d20[5], d20[6], 1));
+  vst1q_u16(dst + 19 * stride + 24, vextq_u16(d20[6], d20[7], 1));
+
+  vst1q_u16(dst + 20 * stride + 0, vextq_u16(d20[2], d20[3], 7));
+  vst1q_u16(dst + 20 * stride + 8, vextq_u16(d20[3], d20[4], 7));
+  vst1q_u16(dst + 20 * stride + 16, vextq_u16(d20[4], d20[5], 7));
+  vst1q_u16(dst + 20 * stride + 24, vextq_u16(d20[5], d20[6], 7));
+  vst1q_u16(dst + 21 * stride + 0, vextq_u16(d20[2], d20[3], 5));
+  vst1q_u16(dst + 21 * stride + 8, vextq_u16(d20[3], d20[4], 5));
+  vst1q_u16(dst + 21 * stride + 16, vextq_u16(d20[4], d20[5], 5));
+  vst1q_u16(dst + 21 * stride + 24, vextq_u16(d20[5], d20[6], 5));
+  vst1q_u16(dst + 22 * stride + 0, vextq_u16(d20[2], d20[3], 3));
+  vst1q_u16(dst + 22 * stride + 8, vextq_u16(d20[3], d20[4], 3));
+  vst1q_u16(dst + 22 * stride + 16, vextq_u16(d20[4], d20[5], 3));
+  vst1q_u16(dst + 22 * stride + 24, vextq_u16(d20[5], d20[6], 3));
+  vst1q_u16(dst + 23 * stride + 0, vextq_u16(d20[2], d20[3], 1));
+  vst1q_u16(dst + 23 * stride + 8, vextq_u16(d20[3], d20[4], 1));
+  vst1q_u16(dst + 23 * stride + 16, vextq_u16(d20[4], d20[5], 1));
+  vst1q_u16(dst + 23 * stride + 24, vextq_u16(d20[5], d20[6], 1));
+
+  vst1q_u16(dst + 24 * stride + 0, vextq_u16(d20[1], d20[2], 7));
+  vst1q_u16(dst + 24 * stride + 8, vextq_u16(d20[2], d20[3], 7));
+  vst1q_u16(dst + 24 * stride + 16, vextq_u16(d20[3], d20[4], 7));
+  vst1q_u16(dst + 24 * stride + 24, vextq_u16(d20[4], d20[5], 7));
+  vst1q_u16(dst + 25 * stride + 0, vextq_u16(d20[1], d20[2], 5));
+  vst1q_u16(dst + 25 * stride + 8, vextq_u16(d20[2], d20[3], 5));
+  vst1q_u16(dst + 25 * stride + 16, vextq_u16(d20[3], d20[4], 5));
+  vst1q_u16(dst + 25 * stride + 24, vextq_u16(d20[4], d20[5], 5));
+  vst1q_u16(dst + 26 * stride + 0, vextq_u16(d20[1], d20[2], 3));
+  vst1q_u16(dst + 26 * stride + 8, vextq_u16(d20[2], d20[3], 3));
+  vst1q_u16(dst + 26 * stride + 16, vextq_u16(d20[3], d20[4], 3));
+  vst1q_u16(dst + 26 * stride + 24, vextq_u16(d20[4], d20[5], 3));
+  vst1q_u16(dst + 27 * stride + 0, vextq_u16(d20[1], d20[2], 1));
+  vst1q_u16(dst + 27 * stride + 8, vextq_u16(d20[2], d20[3], 1));
+  vst1q_u16(dst + 27 * stride + 16, vextq_u16(d20[3], d20[4], 1));
+  vst1q_u16(dst + 27 * stride + 24, vextq_u16(d20[4], d20[5], 1));
+
+  vst1q_u16(dst + 28 * stride + 0, vextq_u16(d20[0], d20[1], 7));
+  vst1q_u16(dst + 28 * stride + 8, vextq_u16(d20[1], d20[2], 7));
+  vst1q_u16(dst + 28 * stride + 16, vextq_u16(d20[2], d20[3], 7));
+  vst1q_u16(dst + 28 * stride + 24, vextq_u16(d20[3], d20[4], 7));
+  vst1q_u16(dst + 29 * stride + 0, vextq_u16(d20[0], d20[1], 5));
+  vst1q_u16(dst + 29 * stride + 8, vextq_u16(d20[1], d20[2], 5));
+  vst1q_u16(dst + 29 * stride + 16, vextq_u16(d20[2], d20[3], 5));
+  vst1q_u16(dst + 29 * stride + 24, vextq_u16(d20[3], d20[4], 5));
+  vst1q_u16(dst + 30 * stride + 0, vextq_u16(d20[0], d20[1], 3));
+  vst1q_u16(dst + 30 * stride + 8, vextq_u16(d20[1], d20[2], 3));
+  vst1q_u16(dst + 30 * stride + 16, vextq_u16(d20[2], d20[3], 3));
+  vst1q_u16(dst + 30 * stride + 24, vextq_u16(d20[3], d20[4], 3));
+  vst1q_u16(dst + 31 * stride + 0, vextq_u16(d20[0], d20[1], 1));
+  vst1q_u16(dst + 31 * stride + 8, vextq_u16(d20[1], d20[2], 1));
+  vst1q_u16(dst + 31 * stride + 16, vextq_u16(d20[2], d20[3], 1));
+  vst1q_u16(dst + 31 * stride + 24, vextq_u16(d20[3], d20[4], 1));
 }
 
 // -----------------------------------------------------------------------------
@@ -696,6 +1821,311 @@ void vpx_highbd_d135_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
 
 //------------------------------------------------------------------------------
 
+void vpx_highbd_d207_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  uint16x4_t l0, l1, l2, l3, c0, c1, c01_lo, c01_hi;
+  (void)above;
+  (void)bd;
+
+  l0 = vld1_u16(left + 0);
+  l3 = vld1_dup_u16(left + 3);
+
+  // [ left[1], left[2], left[3], left[3] ]
+  l1 = vext_u16(l0, l3, 1);
+  // [ left[2], left[3], left[3], left[3] ]
+  l2 = vext_u16(l0, l3, 2);
+
+  c0 = vrhadd_u16(l0, l1);
+  c1 = vrhadd_u16(vhadd_u16(l0, l2), l1);
+
+  c01_lo = vzip_u16(c0, c1).val[0];
+  c01_hi = vzip_u16(c0, c1).val[1];
+
+  // stride=0 [ c0[0], c1[0],   c0[1],   c1[1] ]
+  // stride=1 [ c0[1], c1[1],   c0[2],   c1[2] ]
+  // stride=2 [ c0[2], c1[2],   c0[3],   c1[3] ]
+  // stride=3 [ c0[3], c1[3], left[3], left[3] ]
+  vst1_u16(dst + 0 * stride, c01_lo);
+  vst1_u16(dst + 1 * stride, vext_u16(c01_lo, c01_hi, 2));
+  vst1_u16(dst + 2 * stride, c01_hi);
+  vst1_u16(dst + 3 * stride, vext_u16(c01_hi, l3, 2));
+}
+
+void vpx_highbd_d207_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  uint16x8_t l0, l1, l2, l7, c0, c1, c01_lo, c01_hi;
+  (void)above;
+  (void)bd;
+
+  l0 = vld1q_u16(left + 0);
+  l7 = vld1q_dup_u16(left + 7);
+
+  // [ left[1], left[2], left[3], left[4], left[5], left[6], left[7], left[7] ]
+  l1 = vextq_u16(l0, l7, 1);
+  // [ left[2], left[3], left[4], left[5], left[6], left[7], left[7], left[7] ]
+  l2 = vextq_u16(l0, l7, 2);
+
+  c0 = vrhaddq_u16(l0, l1);
+  c1 = vrhaddq_u16(vhaddq_u16(l0, l2), l1);
+
+  c01_lo = vzipq_u16(c0, c1).val[0];
+  c01_hi = vzipq_u16(c0, c1).val[1];
+
+  vst1q_u16(dst + 0 * stride, c01_lo);
+  vst1q_u16(dst + 1 * stride, vextq_u16(c01_lo, c01_hi, 2));
+  vst1q_u16(dst + 2 * stride, vextq_u16(c01_lo, c01_hi, 4));
+  vst1q_u16(dst + 3 * stride, vextq_u16(c01_lo, c01_hi, 6));
+  vst1q_u16(dst + 4 * stride, c01_hi);
+  vst1q_u16(dst + 5 * stride, vextq_u16(c01_hi, l7, 2));
+  vst1q_u16(dst + 6 * stride, vextq_u16(c01_hi, l7, 4));
+  vst1q_u16(dst + 7 * stride, vextq_u16(c01_hi, l7, 6));
+}
+
+void vpx_highbd_d207_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  uint16x8_t l0, l1, l2, l8, l9, l10, l15, c0[2], c1[2], c01[4];
+  (void)above;
+  (void)bd;
+
+  l0 = vld1q_u16(left + 0);
+  l1 = vld1q_u16(left + 1);
+  l2 = vld1q_u16(left + 2);
+  l8 = vld1q_u16(left + 8);
+  l15 = vld1q_dup_u16(left + 15);
+
+  l9 = vextq_u16(l8, l15, 1);
+  l10 = vextq_u16(l8, l15, 2);
+
+  c0[0] = vrhaddq_u16(l0, l1);
+  c0[1] = vrhaddq_u16(l8, l9);
+  c1[0] = vrhaddq_u16(vhaddq_u16(l0, l2), l1);
+  c1[1] = vrhaddq_u16(vhaddq_u16(l8, l10), l9);
+
+  c01[0] = vzipq_u16(c0[0], c1[0]).val[0];
+  c01[1] = vzipq_u16(c0[0], c1[0]).val[1];
+  c01[2] = vzipq_u16(c0[1], c1[1]).val[0];
+  c01[3] = vzipq_u16(c0[1], c1[1]).val[1];
+
+  vst1q_u16(dst + 0 * stride + 0, c01[0]);
+  vst1q_u16(dst + 0 * stride + 8, c01[1]);
+  vst1q_u16(dst + 1 * stride + 0, vextq_u16(c01[0], c01[1], 2));
+  vst1q_u16(dst + 1 * stride + 8, vextq_u16(c01[1], c01[2], 2));
+  vst1q_u16(dst + 2 * stride + 0, vextq_u16(c01[0], c01[1], 4));
+  vst1q_u16(dst + 2 * stride + 8, vextq_u16(c01[1], c01[2], 4));
+  vst1q_u16(dst + 3 * stride + 0, vextq_u16(c01[0], c01[1], 6));
+  vst1q_u16(dst + 3 * stride + 8, vextq_u16(c01[1], c01[2], 6));
+
+  vst1q_u16(dst + 4 * stride + 0, c01[1]);
+  vst1q_u16(dst + 4 * stride + 8, c01[2]);
+  vst1q_u16(dst + 5 * stride + 0, vextq_u16(c01[1], c01[2], 2));
+  vst1q_u16(dst + 5 * stride + 8, vextq_u16(c01[2], c01[3], 2));
+  vst1q_u16(dst + 6 * stride + 0, vextq_u16(c01[1], c01[2], 4));
+  vst1q_u16(dst + 6 * stride + 8, vextq_u16(c01[2], c01[3], 4));
+  vst1q_u16(dst + 7 * stride + 0, vextq_u16(c01[1], c01[2], 6));
+  vst1q_u16(dst + 7 * stride + 8, vextq_u16(c01[2], c01[3], 6));
+
+  vst1q_u16(dst + 8 * stride + 0, c01[2]);
+  vst1q_u16(dst + 8 * stride + 8, c01[3]);
+  vst1q_u16(dst + 9 * stride + 0, vextq_u16(c01[2], c01[3], 2));
+  vst1q_u16(dst + 9 * stride + 8, vextq_u16(c01[3], l15, 2));
+  vst1q_u16(dst + 10 * stride + 0, vextq_u16(c01[2], c01[3], 4));
+  vst1q_u16(dst + 10 * stride + 8, vextq_u16(c01[3], l15, 4));
+  vst1q_u16(dst + 11 * stride + 0, vextq_u16(c01[2], c01[3], 6));
+  vst1q_u16(dst + 11 * stride + 8, vextq_u16(c01[3], l15, 6));
+
+  vst1q_u16(dst + 12 * stride + 0, c01[3]);
+  vst1q_u16(dst + 12 * stride + 8, l15);
+  vst1q_u16(dst + 13 * stride + 0, vextq_u16(c01[3], l15, 2));
+  vst1q_u16(dst + 13 * stride + 8, l15);
+  vst1q_u16(dst + 14 * stride + 0, vextq_u16(c01[3], l15, 4));
+  vst1q_u16(dst + 14 * stride + 8, l15);
+  vst1q_u16(dst + 15 * stride + 0, vextq_u16(c01[3], l15, 6));
+  vst1q_u16(dst + 15 * stride + 8, l15);
+}
+
+void vpx_highbd_d207_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  uint16x8_t l0, l1, l2, l8, l9, l10, l16, l17, l18, l24, l25, l26, l31, c0[4],
+      c1[4], c01[8];
+  (void)above;
+  (void)bd;
+
+  l0 = vld1q_u16(left + 0);
+  l1 = vld1q_u16(left + 1);
+  l2 = vld1q_u16(left + 2);
+  l8 = vld1q_u16(left + 8);
+  l9 = vld1q_u16(left + 9);
+  l10 = vld1q_u16(left + 10);
+  l16 = vld1q_u16(left + 16);
+  l17 = vld1q_u16(left + 17);
+  l18 = vld1q_u16(left + 18);
+  l24 = vld1q_u16(left + 24);
+  l31 = vld1q_dup_u16(left + 31);
+
+  l25 = vextq_u16(l24, l31, 1);
+  l26 = vextq_u16(l24, l31, 2);
+
+  c0[0] = vrhaddq_u16(l0, l1);
+  c0[1] = vrhaddq_u16(l8, l9);
+  c0[2] = vrhaddq_u16(l16, l17);
+  c0[3] = vrhaddq_u16(l24, l25);
+  c1[0] = vrhaddq_u16(vhaddq_u16(l0, l2), l1);
+  c1[1] = vrhaddq_u16(vhaddq_u16(l8, l10), l9);
+  c1[2] = vrhaddq_u16(vhaddq_u16(l16, l18), l17);
+  c1[3] = vrhaddq_u16(vhaddq_u16(l24, l26), l25);
+
+  c01[0] = vzipq_u16(c0[0], c1[0]).val[0];
+  c01[1] = vzipq_u16(c0[0], c1[0]).val[1];
+  c01[2] = vzipq_u16(c0[1], c1[1]).val[0];
+  c01[3] = vzipq_u16(c0[1], c1[1]).val[1];
+  c01[4] = vzipq_u16(c0[2], c1[2]).val[0];
+  c01[5] = vzipq_u16(c0[2], c1[2]).val[1];
+  c01[6] = vzipq_u16(c0[3], c1[3]).val[0];
+  c01[7] = vzipq_u16(c0[3], c1[3]).val[1];
+
+  vst1q_u16(dst + 0 * stride + 0, c01[0]);
+  vst1q_u16(dst + 0 * stride + 8, c01[1]);
+  vst1q_u16(dst + 0 * stride + 16, c01[2]);
+  vst1q_u16(dst + 0 * stride + 24, c01[3]);
+  vst1q_u16(dst + 1 * stride + 0, vextq_u16(c01[0], c01[1], 2));
+  vst1q_u16(dst + 1 * stride + 8, vextq_u16(c01[1], c01[2], 2));
+  vst1q_u16(dst + 1 * stride + 16, vextq_u16(c01[2], c01[3], 2));
+  vst1q_u16(dst + 1 * stride + 24, vextq_u16(c01[3], c01[4], 2));
+  vst1q_u16(dst + 2 * stride + 0, vextq_u16(c01[0], c01[1], 4));
+  vst1q_u16(dst + 2 * stride + 8, vextq_u16(c01[1], c01[2], 4));
+  vst1q_u16(dst + 2 * stride + 16, vextq_u16(c01[2], c01[3], 4));
+  vst1q_u16(dst + 2 * stride + 24, vextq_u16(c01[3], c01[4], 4));
+  vst1q_u16(dst + 3 * stride + 0, vextq_u16(c01[0], c01[1], 6));
+  vst1q_u16(dst + 3 * stride + 8, vextq_u16(c01[1], c01[2], 6));
+  vst1q_u16(dst + 3 * stride + 16, vextq_u16(c01[2], c01[3], 6));
+  vst1q_u16(dst + 3 * stride + 24, vextq_u16(c01[3], c01[4], 6));
+
+  vst1q_u16(dst + 4 * stride + 0, c01[1]);
+  vst1q_u16(dst + 4 * stride + 8, c01[2]);
+  vst1q_u16(dst + 4 * stride + 16, c01[3]);
+  vst1q_u16(dst + 4 * stride + 24, c01[4]);
+  vst1q_u16(dst + 5 * stride + 0, vextq_u16(c01[1], c01[2], 2));
+  vst1q_u16(dst + 5 * stride + 8, vextq_u16(c01[2], c01[3], 2));
+  vst1q_u16(dst + 5 * stride + 16, vextq_u16(c01[3], c01[4], 2));
+  vst1q_u16(dst + 5 * stride + 24, vextq_u16(c01[4], c01[5], 2));
+  vst1q_u16(dst + 6 * stride + 0, vextq_u16(c01[1], c01[2], 4));
+  vst1q_u16(dst + 6 * stride + 8, vextq_u16(c01[2], c01[3], 4));
+  vst1q_u16(dst + 6 * stride + 16, vextq_u16(c01[3], c01[4], 4));
+  vst1q_u16(dst + 6 * stride + 24, vextq_u16(c01[4], c01[5], 4));
+  vst1q_u16(dst + 7 * stride + 0, vextq_u16(c01[1], c01[2], 6));
+  vst1q_u16(dst + 7 * stride + 8, vextq_u16(c01[2], c01[3], 6));
+  vst1q_u16(dst + 7 * stride + 16, vextq_u16(c01[3], c01[4], 6));
+  vst1q_u16(dst + 7 * stride + 24, vextq_u16(c01[4], c01[5], 6));
+
+  vst1q_u16(dst + 8 * stride + 0, c01[2]);
+  vst1q_u16(dst + 8 * stride + 8, c01[3]);
+  vst1q_u16(dst + 8 * stride + 16, c01[4]);
+  vst1q_u16(dst + 8 * stride + 24, c01[5]);
+  vst1q_u16(dst + 9 * stride + 0, vextq_u16(c01[2], c01[3], 2));
+  vst1q_u16(dst + 9 * stride + 8, vextq_u16(c01[3], c01[4], 2));
+  vst1q_u16(dst + 9 * stride + 16, vextq_u16(c01[4], c01[5], 2));
+  vst1q_u16(dst + 9 * stride + 24, vextq_u16(c01[5], c01[6], 2));
+  vst1q_u16(dst + 10 * stride + 0, vextq_u16(c01[2], c01[3], 4));
+  vst1q_u16(dst + 10 * stride + 8, vextq_u16(c01[3], c01[4], 4));
+  vst1q_u16(dst + 10 * stride + 16, vextq_u16(c01[4], c01[5], 4));
+  vst1q_u16(dst + 10 * stride + 24, vextq_u16(c01[5], c01[6], 4));
+  vst1q_u16(dst + 11 * stride + 0, vextq_u16(c01[2], c01[3], 6));
+  vst1q_u16(dst + 11 * stride + 8, vextq_u16(c01[3], c01[4], 6));
+  vst1q_u16(dst + 11 * stride + 16, vextq_u16(c01[4], c01[5], 6));
+  vst1q_u16(dst + 11 * stride + 24, vextq_u16(c01[5], c01[6], 6));
+
+  vst1q_u16(dst + 12 * stride + 0, c01[3]);
+  vst1q_u16(dst + 12 * stride + 8, c01[4]);
+  vst1q_u16(dst + 12 * stride + 16, c01[5]);
+  vst1q_u16(dst + 12 * stride + 24, c01[6]);
+  vst1q_u16(dst + 13 * stride + 0, vextq_u16(c01[3], c01[4], 2));
+  vst1q_u16(dst + 13 * stride + 8, vextq_u16(c01[4], c01[5], 2));
+  vst1q_u16(dst + 13 * stride + 16, vextq_u16(c01[5], c01[6], 2));
+  vst1q_u16(dst + 13 * stride + 24, vextq_u16(c01[6], c01[7], 2));
+  vst1q_u16(dst + 14 * stride + 0, vextq_u16(c01[3], c01[4], 4));
+  vst1q_u16(dst + 14 * stride + 8, vextq_u16(c01[4], c01[5], 4));
+  vst1q_u16(dst + 14 * stride + 16, vextq_u16(c01[5], c01[6], 4));
+  vst1q_u16(dst + 14 * stride + 24, vextq_u16(c01[6], c01[7], 4));
+  vst1q_u16(dst + 15 * stride + 0, vextq_u16(c01[3], c01[4], 6));
+  vst1q_u16(dst + 15 * stride + 8, vextq_u16(c01[4], c01[5], 6));
+  vst1q_u16(dst + 15 * stride + 16, vextq_u16(c01[5], c01[6], 6));
+  vst1q_u16(dst + 15 * stride + 24, vextq_u16(c01[6], c01[7], 6));
+
+  vst1q_u16(dst + 16 * stride + 0, c01[4]);
+  vst1q_u16(dst + 16 * stride + 8, c01[5]);
+  vst1q_u16(dst + 16 * stride + 16, c01[6]);
+  vst1q_u16(dst + 16 * stride + 24, c01[7]);
+  vst1q_u16(dst + 17 * stride + 0, vextq_u16(c01[4], c01[5], 2));
+  vst1q_u16(dst + 17 * stride + 8, vextq_u16(c01[5], c01[6], 2));
+  vst1q_u16(dst + 17 * stride + 16, vextq_u16(c01[6], c01[7], 2));
+  vst1q_u16(dst + 17 * stride + 24, vextq_u16(c01[7], l31, 2));
+  vst1q_u16(dst + 18 * stride + 0, vextq_u16(c01[4], c01[5], 4));
+  vst1q_u16(dst + 18 * stride + 8, vextq_u16(c01[5], c01[6], 4));
+  vst1q_u16(dst + 18 * stride + 16, vextq_u16(c01[6], c01[7], 4));
+  vst1q_u16(dst + 18 * stride + 24, vextq_u16(c01[7], l31, 4));
+  vst1q_u16(dst + 19 * stride + 0, vextq_u16(c01[4], c01[5], 6));
+  vst1q_u16(dst + 19 * stride + 8, vextq_u16(c01[5], c01[6], 6));
+  vst1q_u16(dst + 19 * stride + 16, vextq_u16(c01[6], c01[7], 6));
+  vst1q_u16(dst + 19 * stride + 24, vextq_u16(c01[7], l31, 6));
+
+  vst1q_u16(dst + 20 * stride + 0, c01[5]);
+  vst1q_u16(dst + 20 * stride + 8, c01[6]);
+  vst1q_u16(dst + 20 * stride + 16, c01[7]);
+  vst1q_u16(dst + 20 * stride + 24, l31);
+  vst1q_u16(dst + 21 * stride + 0, vextq_u16(c01[5], c01[6], 2));
+  vst1q_u16(dst + 21 * stride + 8, vextq_u16(c01[6], c01[7], 2));
+  vst1q_u16(dst + 21 * stride + 16, vextq_u16(c01[7], l31, 2));
+  vst1q_u16(dst + 21 * stride + 24, vextq_u16(l31, l31, 2));
+  vst1q_u16(dst + 22 * stride + 0, vextq_u16(c01[5], c01[6], 4));
+  vst1q_u16(dst + 22 * stride + 8, vextq_u16(c01[6], c01[7], 4));
+  vst1q_u16(dst + 22 * stride + 16, vextq_u16(c01[7], l31, 4));
+  vst1q_u16(dst + 22 * stride + 24, vextq_u16(l31, l31, 4));
+  vst1q_u16(dst + 23 * stride + 0, vextq_u16(c01[5], c01[6], 6));
+  vst1q_u16(dst + 23 * stride + 8, vextq_u16(c01[6], c01[7], 6));
+  vst1q_u16(dst + 23 * stride + 16, vextq_u16(c01[7], l31, 6));
+  vst1q_u16(dst + 23 * stride + 24, vextq_u16(l31, l31, 6));
+
+  vst1q_u16(dst + 24 * stride + 0, c01[6]);
+  vst1q_u16(dst + 24 * stride + 8, c01[7]);
+  vst1q_u16(dst + 24 * stride + 16, l31);
+  vst1q_u16(dst + 24 * stride + 24, l31);
+  vst1q_u16(dst + 25 * stride + 0, vextq_u16(c01[6], c01[7], 2));
+  vst1q_u16(dst + 25 * stride + 8, vextq_u16(c01[7], l31, 2));
+  vst1q_u16(dst + 25 * stride + 16, vextq_u16(l31, l31, 2));
+  vst1q_u16(dst + 25 * stride + 24, vextq_u16(l31, l31, 2));
+  vst1q_u16(dst + 26 * stride + 0, vextq_u16(c01[6], c01[7], 4));
+  vst1q_u16(dst + 26 * stride + 8, vextq_u16(c01[7], l31, 4));
+  vst1q_u16(dst + 26 * stride + 16, vextq_u16(l31, l31, 4));
+  vst1q_u16(dst + 26 * stride + 24, vextq_u16(l31, l31, 4));
+  vst1q_u16(dst + 27 * stride + 0, vextq_u16(c01[6], c01[7], 6));
+  vst1q_u16(dst + 27 * stride + 8, vextq_u16(c01[7], l31, 6));
+  vst1q_u16(dst + 27 * stride + 16, vextq_u16(l31, l31, 6));
+  vst1q_u16(dst + 27 * stride + 24, vextq_u16(l31, l31, 6));
+
+  vst1q_u16(dst + 28 * stride + 0, c01[7]);
+  vst1q_u16(dst + 28 * stride + 8, l31);
+  vst1q_u16(dst + 28 * stride + 16, l31);
+  vst1q_u16(dst + 28 * stride + 24, l31);
+  vst1q_u16(dst + 29 * stride + 0, vextq_u16(c01[7], l31, 2));
+  vst1q_u16(dst + 29 * stride + 8, vextq_u16(l31, l31, 2));
+  vst1q_u16(dst + 29 * stride + 16, vextq_u16(l31, l31, 2));
+  vst1q_u16(dst + 29 * stride + 24, vextq_u16(l31, l31, 2));
+  vst1q_u16(dst + 30 * stride + 0, vextq_u16(c01[7], l31, 4));
+  vst1q_u16(dst + 30 * stride + 8, vextq_u16(l31, l31, 4));
+  vst1q_u16(dst + 30 * stride + 16, vextq_u16(l31, l31, 4));
+  vst1q_u16(dst + 30 * stride + 24, vextq_u16(l31, l31, 4));
+  vst1q_u16(dst + 31 * stride + 0, vextq_u16(c01[7], l31, 6));
+  vst1q_u16(dst + 31 * stride + 8, vextq_u16(l31, l31, 6));
+  vst1q_u16(dst + 31 * stride + 16, vextq_u16(l31, l31, 6));
+  vst1q_u16(dst + 31 * stride + 24, vextq_u16(l31, l31, 6));
+}
+
+//------------------------------------------------------------------------------
+
 void vpx_highbd_v_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
@@ -725,30 +2155,36 @@ void vpx_highbd_v_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
 void vpx_highbd_v_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
-  const uint16x8x2_t row = vld2q_u16(above);
+  const uint16x8_t row0 = vld1q_u16(above + 0);
+  const uint16x8_t row1 = vld1q_u16(above + 8);
   int i;
   (void)left;
   (void)bd;
 
-  for (i = 0; i < 16; i++, dst += stride) {
-    vst2q_u16(dst, row);
+  for (i = 0; i < 16; i++) {
+    vst1q_u16(dst + 0, row0);
+    vst1q_u16(dst + 8, row1);
+    dst += stride;
   }
 }
 
 void vpx_highbd_v_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
-  const uint16x8x2_t row0 = vld2q_u16(above);
-  const uint16x8x2_t row1 = vld2q_u16(above + 16);
+  const uint16x8_t row0 = vld1q_u16(above + 0);
+  const uint16x8_t row1 = vld1q_u16(above + 8);
+  const uint16x8_t row2 = vld1q_u16(above + 16);
+  const uint16x8_t row3 = vld1q_u16(above + 24);
   int i;
   (void)left;
   (void)bd;
 
   for (i = 0; i < 32; i++) {
-    vst2q_u16(dst, row0);
-    dst += 16;
-    vst2q_u16(dst, row1);
-    dst += stride - 16;
+    vst1q_u16(dst + 0, row0);
+    vst1q_u16(dst + 8, row1);
+    vst1q_u16(dst + 16, row2);
+    vst1q_u16(dst + 24, row3);
+    dst += stride;
   }
 }
 
diff --git a/vpx_dsp/arm/highbd_loopfilter_neon.c b/vpx_dsp/arm/highbd_loopfilter_neon.c
index 5530c6425b..8d6e8acc4c 100644
--- a/vpx_dsp/arm/highbd_loopfilter_neon.c
+++ b/vpx_dsp/arm/highbd_loopfilter_neon.c
@@ -661,6 +661,17 @@ void vpx_highbd_lpf_vertical_8_dual_neon(
   vpx_highbd_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, bd);
 }
 
+// Quiet warnings of the form: 'vpx_dsp/arm/highbd_loopfilter_neon.c|675 col 67|
+// warning: 'oq1' may be used uninitialized in this function
+// [-Wmaybe-uninitialized]', for oq1-op1. Without reworking the code or adding
+// an additional branch this warning cannot be silenced otherwise. The
+// loopfilter is only called when needed for a block so these output pixels
+// will be set.
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+
 static void lpf_horizontal_16_kernel(uint16_t *s, int p,
                                      const uint16x8_t blimit_vec,
                                      const uint16x8_t limit_vec,
@@ -723,6 +734,10 @@ static void lpf_vertical_16_kernel(uint16_t *s, int p,
   }
 }
 
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+
 void vpx_highbd_lpf_horizontal_16_neon(uint16_t *s, int p,
                                        const uint8_t *blimit,
                                        const uint8_t *limit,
diff --git a/vpx_dsp/arm/highbd_quantize_neon.c b/vpx_dsp/arm/highbd_quantize_neon.c
new file mode 100644
index 0000000000..c2ad34a695
--- /dev/null
+++ b/vpx_dsp/arm/highbd_quantize_neon.c
@@ -0,0 +1,300 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
+
+static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store(
+    const int32x4_t dqcoeff_0, const int32x4_t dqcoeff_1,
+    tran_low_t *dqcoeff_ptr) {
+  vst1q_s32(dqcoeff_ptr, dqcoeff_0);
+  vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1);
+}
+
+static VPX_FORCE_INLINE void highbd_quantize_8_neon(
+    const int32x4_t coeff_0, const int32x4_t coeff_1, const int32x4_t zbin,
+    const int32x4_t round, const int32x4_t quant, const int32x4_t quant_shift,
+    int32x4_t *qcoeff_0, int32x4_t *qcoeff_1) {
+  // Load coeffs as 2 vectors of 4 x 32-bit ints each, take sign and abs values
+  const int32x4_t coeff_0_sign = vshrq_n_s32(coeff_0, 31);
+  const int32x4_t coeff_1_sign = vshrq_n_s32(coeff_1, 31);
+  const int32x4_t coeff_0_abs = vabsq_s32(coeff_0);
+  const int32x4_t coeff_1_abs = vabsq_s32(coeff_1);
+
+  // Calculate 2 masks of elements outside the bin
+  const int32x4_t zbin_mask_0 =
+      vreinterpretq_s32_u32(vcgeq_s32(coeff_0_abs, zbin));
+  const int32x4_t zbin_mask_1 = vreinterpretq_s32_u32(
+      vcgeq_s32(coeff_1_abs, vdupq_lane_s32(vget_low_s32(zbin), 1)));
+
+  // Get the rounded values
+  const int32x4_t rounded_0 = vaddq_s32(coeff_0_abs, round);
+  const int32x4_t rounded_1 =
+      vaddq_s32(coeff_1_abs, vdupq_lane_s32(vget_low_s32(round), 1));
+
+  // (round * (quant << 15) * 2) >> 16 == (round * quant)
+  int32x4_t qcoeff_tmp_0 = vqdmulhq_s32(rounded_0, quant);
+  int32x4_t qcoeff_tmp_1 =
+      vqdmulhq_s32(rounded_1, vdupq_lane_s32(vget_low_s32(quant), 1));
+
+  // Add rounded values
+  qcoeff_tmp_0 = vaddq_s32(qcoeff_tmp_0, rounded_0);
+  qcoeff_tmp_1 = vaddq_s32(qcoeff_tmp_1, rounded_1);
+
+  // (round * (quant_shift << 15) * 2) >> 16 == (round * quant_shift)
+  qcoeff_tmp_0 = vqdmulhq_s32(qcoeff_tmp_0, quant_shift);
+  qcoeff_tmp_1 =
+      vqdmulhq_s32(qcoeff_tmp_1, vdupq_lane_s32(vget_low_s32(quant_shift), 1));
+
+  // Restore the sign bit.
+  qcoeff_tmp_0 = veorq_s32(qcoeff_tmp_0, coeff_0_sign);
+  qcoeff_tmp_1 = veorq_s32(qcoeff_tmp_1, coeff_1_sign);
+  qcoeff_tmp_0 = vsubq_s32(qcoeff_tmp_0, coeff_0_sign);
+  qcoeff_tmp_1 = vsubq_s32(qcoeff_tmp_1, coeff_1_sign);
+
+  // Only keep the relevant coeffs
+  *qcoeff_0 = vandq_s32(qcoeff_tmp_0, zbin_mask_0);
+  *qcoeff_1 = vandq_s32(qcoeff_tmp_1, zbin_mask_1);
+}
+
+static VPX_FORCE_INLINE int16x8_t
+highbd_quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+                       tran_low_t *dqcoeff_ptr, const int32x4_t zbin,
+                       const int32x4_t round, const int32x4_t quant,
+                       const int32x4_t quant_shift, const int32x4_t dequant) {
+  int32x4_t qcoeff_0, qcoeff_1, dqcoeff_0, dqcoeff_1;
+
+  // Load coeffs as 2 vectors of 4 x 32-bit ints each, take sign and abs values
+  const int32x4_t coeff_0 = vld1q_s32(coeff_ptr);
+  const int32x4_t coeff_1 = vld1q_s32(coeff_ptr + 4);
+  highbd_quantize_8_neon(coeff_0, coeff_1, zbin, round, quant, quant_shift,
+                         &qcoeff_0, &qcoeff_1);
+
+  // Store the 32-bit qcoeffs
+  vst1q_s32(qcoeff_ptr, qcoeff_0);
+  vst1q_s32(qcoeff_ptr + 4, qcoeff_1);
+
+  // Calculate and store the dqcoeffs
+  dqcoeff_0 = vmulq_s32(qcoeff_0, dequant);
+  dqcoeff_1 = vmulq_s32(qcoeff_1, vdupq_lane_s32(vget_low_s32(dequant), 1));
+
+  highbd_calculate_dqcoeff_and_store(dqcoeff_0, dqcoeff_1, dqcoeff_ptr);
+
+  return vcombine_s16(vmovn_s32(qcoeff_0), vmovn_s32(qcoeff_1));
+}
+
+void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                                const struct macroblock_plane *const mb_plane,
+                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                                const struct ScanOrder *const scan_order) {
+  const int16x8_t neg_one = vdupq_n_s16(-1);
+  uint16x8_t eob_max;
+  const int16_t *iscan = scan_order->iscan;
+
+  // Only the first element of each vector is DC.
+  // High half has identical elements, but we can reconstruct it from the low
+  // half by duplicating the 2nd element. So we only need to pass a 4x32-bit
+  // vector
+  int32x4_t zbin = vmovl_s16(vld1_s16(mb_plane->zbin));
+  int32x4_t round = vmovl_s16(vld1_s16(mb_plane->round));
+  // Extend the quant, quant_shift vectors to ones of 32-bit elements
+  // scale to high-half, so we can use vqdmulhq_s32
+  int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant)), 15);
+  int32x4_t quant_shift =
+      vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant_shift)), 15);
+  int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr));
+
+  // Process first 8 values which include a dc component.
+  {
+    const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+    const int16x8_t qcoeff =
+        highbd_quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
+                               quant, quant_shift, dequant);
+
+    // Set non-zero elements to -1 and use that to extract values for eob.
+    eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
+
+    __builtin_prefetch(coeff_ptr + 64);
+
+    coeff_ptr += 8;
+    iscan += 8;
+    qcoeff_ptr += 8;
+    dqcoeff_ptr += 8;
+  }
+
+  n_coeffs -= 8;
+
+  {
+    zbin = vdupq_lane_s32(vget_low_s32(zbin), 1);
+    round = vdupq_lane_s32(vget_low_s32(round), 1);
+    quant = vdupq_lane_s32(vget_low_s32(quant), 1);
+    quant_shift = vdupq_lane_s32(vget_low_s32(quant_shift), 1);
+    dequant = vdupq_lane_s32(vget_low_s32(dequant), 1);
+
+    do {
+      const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+      const int16x8_t qcoeff =
+          highbd_quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin,
+                                 round, quant, quant_shift, dequant);
+
+      // Set non-zero elements to -1 and use that to extract values for eob.
+      eob_max =
+          vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
+
+      __builtin_prefetch(coeff_ptr + 64);
+      coeff_ptr += 8;
+      iscan += 8;
+      qcoeff_ptr += 8;
+      dqcoeff_ptr += 8;
+      n_coeffs -= 8;
+    } while (n_coeffs > 0);
+  }
+
+#if VPX_ARCH_AARCH64
+  *eob_ptr = vmaxvq_u16(eob_max);
+#else
+  {
+    const uint16x4_t eob_max_0 =
+        vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
+    const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0);
+    const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
+    vst1_lane_u16(eob_ptr, eob_max_2, 0);
+  }
+#endif  // VPX_ARCH_AARCH64
+}
+
+static VPX_FORCE_INLINE int32x4_t extract_sign_bit(int32x4_t a) {
+  return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31));
+}
+
+static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store_32x32(
+    int32x4_t dqcoeff_0, int32x4_t dqcoeff_1, tran_low_t *dqcoeff_ptr) {
+  // Add 1 if negative to round towards zero because the C uses division.
+  dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
+  dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
+
+  dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1);
+  dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1);
+  vst1q_s32(dqcoeff_ptr, dqcoeff_0);
+  vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1);
+}
+
+static VPX_FORCE_INLINE int16x8_t highbd_quantize_b_32x32_neon(
+    const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int32x4_t zbin, const int32x4_t round,
+    const int32x4_t quant, const int32x4_t quant_shift,
+    const int32x4_t dequant) {
+  int32x4_t qcoeff_0, qcoeff_1, dqcoeff_0, dqcoeff_1;
+
+  // Load coeffs as 2 vectors of 4 x 32-bit ints each, take sign and abs values
+  const int32x4_t coeff_0 = vld1q_s32(coeff_ptr);
+  const int32x4_t coeff_1 = vld1q_s32(coeff_ptr + 4);
+  highbd_quantize_8_neon(coeff_0, coeff_1, zbin, round, quant, quant_shift,
+                         &qcoeff_0, &qcoeff_1);
+
+  // Store the 32-bit qcoeffs
+  vst1q_s32(qcoeff_ptr, qcoeff_0);
+  vst1q_s32(qcoeff_ptr + 4, qcoeff_1);
+
+  // Calculate and store the dqcoeffs
+  dqcoeff_0 = vmulq_s32(qcoeff_0, dequant);
+  dqcoeff_1 = vmulq_s32(qcoeff_1, vdupq_lane_s32(vget_low_s32(dequant), 1));
+
+  highbd_calculate_dqcoeff_and_store_32x32(dqcoeff_0, dqcoeff_1, dqcoeff_ptr);
+
+  return vcombine_s16(vmovn_s32(qcoeff_0), vmovn_s32(qcoeff_1));
+}
+
+void vpx_highbd_quantize_b_32x32_neon(
+    const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+    uint16_t *eob_ptr, const struct ScanOrder *const scan_order) {
+  const int16x8_t neg_one = vdupq_n_s16(-1);
+  uint16x8_t eob_max;
+  int i;
+  const int16_t *iscan = scan_order->iscan;
+
+  // Only the first element of each vector is DC.
+  // High half has identical elements, but we can reconstruct it from the low
+  // half by duplicating the 2nd element. So we only need to pass a 4x32-bit
+  // vector
+  int32x4_t zbin = vrshrq_n_s32(vmovl_s16(vld1_s16(mb_plane->zbin)), 1);
+  int32x4_t round = vrshrq_n_s32(vmovl_s16(vld1_s16(mb_plane->round)), 1);
+  // Extend the quant, quant_shift vectors to ones of 32-bit elements
+  // scale to high-half, so we can use vqdmulhq_s32
+  int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant)), 15);
+  int32x4_t quant_shift =
+      vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant_shift)), 16);
+  int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr));
+
+  // Process first 8 values which include a dc component.
+  {
+    const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+    const int16x8_t qcoeff =
+        highbd_quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin,
+                                     round, quant, quant_shift, dequant);
+
+    // Set non-zero elements to -1 and use that to extract values for eob.
+    eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
+
+    __builtin_prefetch(coeff_ptr + 64);
+    coeff_ptr += 8;
+    iscan += 8;
+    qcoeff_ptr += 8;
+    dqcoeff_ptr += 8;
+  }
+
+  {
+    zbin = vdupq_lane_s32(vget_low_s32(zbin), 1);
+    round = vdupq_lane_s32(vget_low_s32(round), 1);
+    quant = vdupq_lane_s32(vget_low_s32(quant), 1);
+    quant_shift = vdupq_lane_s32(vget_low_s32(quant_shift), 1);
+    dequant = vdupq_lane_s32(vget_low_s32(dequant), 1);
+
+    for (i = 1; i < 32 * 32 / 8; ++i) {
+      const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+      const int16x8_t qcoeff =
+          highbd_quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin,
+                                       round, quant, quant_shift, dequant);
+
+      // Set non-zero elements to -1 and use that to extract values for eob.
+      eob_max =
+          vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
+
+      __builtin_prefetch(coeff_ptr + 64);
+      coeff_ptr += 8;
+      iscan += 8;
+      qcoeff_ptr += 8;
+      dqcoeff_ptr += 8;
+    }
+  }
+
+#if VPX_ARCH_AARCH64
+  *eob_ptr = vmaxvq_u16(eob_max);
+#else
+  {
+    const uint16x4_t eob_max_0 =
+        vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
+    const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0);
+    const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
+    vst1_lane_u16(eob_ptr, eob_max_2, 0);
+  }
+#endif  // VPX_ARCH_AARCH64
+}
diff --git a/vpx_dsp/arm/highbd_sad4d_neon.c b/vpx_dsp/arm/highbd_sad4d_neon.c
new file mode 100644
index 0000000000..a6684b0534
--- /dev/null
+++ b/vpx_dsp/arm/highbd_sad4d_neon.c
@@ -0,0 +1,273 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+static INLINE void highbd_sad4xhx4d_neon(const uint8_t *src_ptr, int src_stride,
+                                         const uint8_t *const ref_ptr[4],
+                                         int ref_stride, uint32_t res[4],
+                                         int h) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+  const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                        vdupq_n_u32(0) };
+
+  int i = 0;
+  do {
+    uint16x4_t s = vld1_u16(src16_ptr + i * src_stride);
+    uint16x4_t r0 = vld1_u16(ref16_ptr0 + i * ref_stride);
+    uint16x4_t r1 = vld1_u16(ref16_ptr1 + i * ref_stride);
+    uint16x4_t r2 = vld1_u16(ref16_ptr2 + i * ref_stride);
+    uint16x4_t r3 = vld1_u16(ref16_ptr3 + i * ref_stride);
+
+    sum[0] = vabal_u16(sum[0], s, r0);
+    sum[1] = vabal_u16(sum[1], s, r1);
+    sum[2] = vabal_u16(sum[2], s, r2);
+    sum[3] = vabal_u16(sum[3], s, r3);
+
+  } while (++i < h);
+
+  vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
+}
+
+static INLINE void highbd_sad8xhx4d_neon(const uint8_t *src_ptr, int src_stride,
+                                         const uint8_t *const ref_ptr[4],
+                                         int ref_stride, uint32_t res[4],
+                                         int h) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+  const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+  uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                        vdupq_n_u16(0) };
+  uint32x4_t sum_u32[4];
+
+  int i = 0;
+  do {
+    uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);
+
+    sum[0] = vabaq_u16(sum[0], s, vld1q_u16(ref16_ptr0 + i * ref_stride));
+    sum[1] = vabaq_u16(sum[1], s, vld1q_u16(ref16_ptr1 + i * ref_stride));
+    sum[2] = vabaq_u16(sum[2], s, vld1q_u16(ref16_ptr2 + i * ref_stride));
+    sum[3] = vabaq_u16(sum[3], s, vld1q_u16(ref16_ptr3 + i * ref_stride));
+
+  } while (++i < h);
+
+  sum_u32[0] = vpaddlq_u16(sum[0]);
+  sum_u32[1] = vpaddlq_u16(sum[1]);
+  sum_u32[2] = vpaddlq_u16(sum[2]);
+  sum_u32[3] = vpaddlq_u16(sum[3]);
+  vst1q_u32(res, horizontal_add_4d_uint32x4(sum_u32));
+}
+
+static INLINE void sad8_neon(uint16x8_t src, uint16x8_t ref,
+                             uint32x4_t *const sad_sum) {
+  uint16x8_t abs_diff = vabdq_u16(src, ref);
+  *sad_sum = vpadalq_u16(*sad_sum, abs_diff);
+}
+
+static INLINE void highbd_sad16xhx4d_neon(const uint8_t *src_ptr,
+                                          int src_stride,
+                                          const uint8_t *const ref_ptr[4],
+                                          int ref_stride, uint32_t res[4],
+                                          int h) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+  const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+  uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+  uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+  uint32x4_t sum[4];
+
+  int i = 0;
+  do {
+    uint16x8_t s0, s1;
+
+    s0 = vld1q_u16(src16_ptr + i * src_stride);
+    sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum_lo[0]);
+    sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum_lo[1]);
+    sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum_lo[2]);
+    sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum_lo[3]);
+
+    s1 = vld1q_u16(src16_ptr + i * src_stride + 8);
+    sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + 8), &sum_hi[0]);
+    sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + 8), &sum_hi[1]);
+    sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + 8), &sum_hi[2]);
+    sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + 8), &sum_hi[3]);
+
+  } while (++i < h);
+
+  sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+  sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+  sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+  sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
+
+  vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
+}
+
+static INLINE void highbd_sadwxhx4d_neon(const uint8_t *src_ptr, int src_stride,
+                                         const uint8_t *const ref_ptr[4],
+                                         int ref_stride, uint32_t res[4], int w,
+                                         int h) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+  const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+  uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+  uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+  uint32x4_t sum[4];
+
+  int i = 0;
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s0, s1, s2, s3;
+
+      s0 = vld1q_u16(src16_ptr + i * src_stride + j);
+      sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride + j), &sum_lo[0]);
+      sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride + j), &sum_lo[1]);
+      sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride + j), &sum_lo[2]);
+      sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride + j), &sum_lo[3]);
+
+      s1 = vld1q_u16(src16_ptr + i * src_stride + j + 8);
+      sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 8), &sum_hi[0]);
+      sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 8), &sum_hi[1]);
+      sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 8), &sum_hi[2]);
+      sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 8), &sum_hi[3]);
+
+      s2 = vld1q_u16(src16_ptr + i * src_stride + j + 16);
+      sad8_neon(s2, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 16),
+                &sum_lo[0]);
+      sad8_neon(s2, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 16),
+                &sum_lo[1]);
+      sad8_neon(s2, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 16),
+                &sum_lo[2]);
+      sad8_neon(s2, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 16),
+                &sum_lo[3]);
+
+      s3 = vld1q_u16(src16_ptr + i * src_stride + j + 24);
+      sad8_neon(s3, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 24),
+                &sum_hi[0]);
+      sad8_neon(s3, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 24),
+                &sum_hi[1]);
+      sad8_neon(s3, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 24),
+                &sum_hi[2]);
+      sad8_neon(s3, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 24),
+                &sum_hi[3]);
+
+      j += 32;
+    } while (j < w);
+
+  } while (++i < h);
+
+  sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+  sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+  sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+  sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
+
+  vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
+}
+
+static INLINE void highbd_sad64xhx4d_neon(const uint8_t *src_ptr,
+                                          int src_stride,
+                                          const uint8_t *const ref_ptr[4],
+                                          int ref_stride, uint32_t res[4],
+                                          int h) {
+  highbd_sadwxhx4d_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 64, h);
+}
+
+static INLINE void highbd_sad32xhx4d_neon(const uint8_t *src_ptr,
+                                          int src_stride,
+                                          const uint8_t *const ref_ptr[4],
+                                          int ref_stride, uint32_t res[4],
+                                          int h) {
+  highbd_sadwxhx4d_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 32, h);
+}
+
+#define HBD_SAD_WXH_4D_NEON(w, h)                                            \
+  void vpx_highbd_sad##w##x##h##x4d_neon(                                    \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+      int ref_stride, uint32_t sad_array[4]) {                               \
+    highbd_sad##w##xhx4d_neon(src, src_stride, ref_array, ref_stride,        \
+                              sad_array, (h));                               \
+  }
+
+HBD_SAD_WXH_4D_NEON(4, 4)
+HBD_SAD_WXH_4D_NEON(4, 8)
+
+HBD_SAD_WXH_4D_NEON(8, 4)
+HBD_SAD_WXH_4D_NEON(8, 8)
+HBD_SAD_WXH_4D_NEON(8, 16)
+
+HBD_SAD_WXH_4D_NEON(16, 8)
+HBD_SAD_WXH_4D_NEON(16, 16)
+HBD_SAD_WXH_4D_NEON(16, 32)
+
+HBD_SAD_WXH_4D_NEON(32, 16)
+HBD_SAD_WXH_4D_NEON(32, 32)
+HBD_SAD_WXH_4D_NEON(32, 64)
+
+HBD_SAD_WXH_4D_NEON(64, 32)
+HBD_SAD_WXH_4D_NEON(64, 64)
+
+#undef HBD_SAD_WXH_4D_NEON
+
+#define HBD_SAD_SKIP_WXH_4D_NEON(w, h)                                        \
+  void vpx_highbd_sad_skip_##w##x##h##x4d_neon(                               \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4],  \
+      int ref_stride, uint32_t sad_array[4]) {                                \
+    highbd_sad##w##xhx4d_neon(src, 2 * src_stride, ref_array, 2 * ref_stride, \
+                              sad_array, ((h) >> 1));                         \
+    sad_array[0] <<= 1;                                                       \
+    sad_array[1] <<= 1;                                                       \
+    sad_array[2] <<= 1;                                                       \
+    sad_array[3] <<= 1;                                                       \
+  }
+
+HBD_SAD_SKIP_WXH_4D_NEON(4, 4)
+HBD_SAD_SKIP_WXH_4D_NEON(4, 8)
+
+HBD_SAD_SKIP_WXH_4D_NEON(8, 4)
+HBD_SAD_SKIP_WXH_4D_NEON(8, 8)
+HBD_SAD_SKIP_WXH_4D_NEON(8, 16)
+
+HBD_SAD_SKIP_WXH_4D_NEON(16, 8)
+HBD_SAD_SKIP_WXH_4D_NEON(16, 16)
+HBD_SAD_SKIP_WXH_4D_NEON(16, 32)
+
+HBD_SAD_SKIP_WXH_4D_NEON(32, 16)
+HBD_SAD_SKIP_WXH_4D_NEON(32, 32)
+HBD_SAD_SKIP_WXH_4D_NEON(32, 64)
+
+HBD_SAD_SKIP_WXH_4D_NEON(64, 32)
+HBD_SAD_SKIP_WXH_4D_NEON(64, 64)
+
+#undef HBD_SAD_SKIP_WXH_4D_NEON
diff --git a/vpx_dsp/arm/highbd_sad_neon.c b/vpx_dsp/arm/highbd_sad_neon.c
new file mode 100644
index 0000000000..b99bac66cd
--- /dev/null
+++ b/vpx_dsp/arm/highbd_sad_neon.c
@@ -0,0 +1,408 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+static INLINE uint32_t highbd_sad4xh_neon(const uint8_t *src_ptr,
+                                          int src_stride,
+                                          const uint8_t *ref_ptr,
+                                          int ref_stride, int h) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+  uint32x4_t sum = vdupq_n_u32(0);
+
+  int i = h;
+  do {
+    uint16x4_t s = vld1_u16(src16_ptr);
+    uint16x4_t r = vld1_u16(ref16_ptr);
+    sum = vabal_u16(sum, s, r);
+
+    src16_ptr += src_stride;
+    ref16_ptr += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(sum);
+}
+
+static INLINE uint32_t highbd_sad8xh_neon(const uint8_t *src_ptr,
+                                          int src_stride,
+                                          const uint8_t *ref_ptr,
+                                          int ref_stride, int h) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+  uint16x8_t sum = vdupq_n_u16(0);
+
+  int i = h;
+  do {
+    uint16x8_t s = vld1q_u16(src16_ptr);
+    uint16x8_t r = vld1q_u16(ref16_ptr);
+    sum = vabaq_u16(sum, s, r);
+
+    src16_ptr += src_stride;
+    ref16_ptr += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_uint16x8(sum);
+}
+
+static INLINE uint32_t highbd_sad16xh_neon(const uint8_t *src_ptr,
+                                           int src_stride,
+                                           const uint8_t *ref_ptr,
+                                           int ref_stride, int h) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h;
+  do {
+    uint16x8_t s0, s1, r0, r1;
+    uint16x8_t diff0, diff1;
+
+    s0 = vld1q_u16(src16_ptr);
+    r0 = vld1q_u16(ref16_ptr);
+    diff0 = vabdq_u16(s0, r0);
+    sum[0] = vpadalq_u16(sum[0], diff0);
+
+    s1 = vld1q_u16(src16_ptr + 8);
+    r1 = vld1q_u16(ref16_ptr + 8);
+    diff1 = vabdq_u16(s1, r1);
+    sum[1] = vpadalq_u16(sum[1], diff1);
+
+    src16_ptr += src_stride;
+    ref16_ptr += ref_stride;
+  } while (--i != 0);
+
+  sum[0] = vaddq_u32(sum[0], sum[1]);
+  return horizontal_add_uint32x4(sum[0]);
+}
+
+static INLINE uint32_t highbd_sadwxh_neon(const uint8_t *src_ptr,
+                                          int src_stride,
+                                          const uint8_t *ref_ptr,
+                                          int ref_stride, int w, int h) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                        vdupq_n_u32(0) };
+
+  int i = h;
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3;
+      uint16x8_t diff0, diff1, diff2, diff3;
+
+      s0 = vld1q_u16(src16_ptr + j);
+      r0 = vld1q_u16(ref16_ptr + j);
+      diff0 = vabdq_u16(s0, r0);
+      sum[0] = vpadalq_u16(sum[0], diff0);
+
+      s1 = vld1q_u16(src16_ptr + j + 8);
+      r1 = vld1q_u16(ref16_ptr + j + 8);
+      diff1 = vabdq_u16(s1, r1);
+      sum[1] = vpadalq_u16(sum[1], diff1);
+
+      s2 = vld1q_u16(src16_ptr + j + 16);
+      r2 = vld1q_u16(ref16_ptr + j + 16);
+      diff2 = vabdq_u16(s2, r2);
+      sum[2] = vpadalq_u16(sum[2], diff2);
+
+      s3 = vld1q_u16(src16_ptr + j + 24);
+      r3 = vld1q_u16(ref16_ptr + j + 24);
+      diff3 = vabdq_u16(s3, r3);
+      sum[3] = vpadalq_u16(sum[3], diff3);
+
+      j += 32;
+    } while (j < w);
+
+    src16_ptr += src_stride;
+    ref16_ptr += ref_stride;
+  } while (--i != 0);
+
+  sum[0] = vaddq_u32(sum[0], sum[1]);
+  sum[2] = vaddq_u32(sum[2], sum[3]);
+  sum[0] = vaddq_u32(sum[0], sum[2]);
+
+  return horizontal_add_uint32x4(sum[0]);
+}
+
+static INLINE unsigned int highbd_sad64xh_neon(const uint8_t *src_ptr,
+                                               int src_stride,
+                                               const uint8_t *ref_ptr,
+                                               int ref_stride, int h) {
+  return highbd_sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h);
+}
+
+static INLINE unsigned int highbd_sad32xh_neon(const uint8_t *src_ptr,
+                                               int src_stride,
+                                               const uint8_t *ref_ptr,
+                                               int ref_stride, int h) {
+  return highbd_sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h);
+}
+
+#define HBD_SAD_WXH_NEON(w, h)                                            \
+  unsigned int vpx_highbd_sad##w##x##h##_neon(                            \
+      const uint8_t *src, int src_stride, const uint8_t *ref,             \
+      int ref_stride) {                                                   \
+    return highbd_sad##w##xh_neon(src, src_stride, ref, ref_stride, (h)); \
+  }
+
+HBD_SAD_WXH_NEON(4, 4)
+HBD_SAD_WXH_NEON(4, 8)
+
+HBD_SAD_WXH_NEON(8, 4)
+HBD_SAD_WXH_NEON(8, 8)
+HBD_SAD_WXH_NEON(8, 16)
+
+HBD_SAD_WXH_NEON(16, 8)
+HBD_SAD_WXH_NEON(16, 16)
+HBD_SAD_WXH_NEON(16, 32)
+
+HBD_SAD_WXH_NEON(32, 16)
+HBD_SAD_WXH_NEON(32, 32)
+HBD_SAD_WXH_NEON(32, 64)
+
+HBD_SAD_WXH_NEON(64, 32)
+HBD_SAD_WXH_NEON(64, 64)
+
+#undef HBD_SAD_WXH_NEON
+
+#define HBD_SAD_SKIP_WXH_NEON(w, h)                             \
+  unsigned int vpx_highbd_sad_skip_##w##x##h##_neon(            \
+      const uint8_t *src, int src_stride, const uint8_t *ref,   \
+      int ref_stride) {                                         \
+    return 2 * highbd_sad##w##xh_neon(src, 2 * src_stride, ref, \
+                                      2 * ref_stride, (h) / 2); \
+  }
+
+HBD_SAD_SKIP_WXH_NEON(4, 4)
+HBD_SAD_SKIP_WXH_NEON(4, 8)
+
+HBD_SAD_SKIP_WXH_NEON(8, 4)
+HBD_SAD_SKIP_WXH_NEON(8, 8)
+HBD_SAD_SKIP_WXH_NEON(8, 16)
+
+HBD_SAD_SKIP_WXH_NEON(16, 8)
+HBD_SAD_SKIP_WXH_NEON(16, 16)
+HBD_SAD_SKIP_WXH_NEON(16, 32)
+
+HBD_SAD_SKIP_WXH_NEON(32, 16)
+HBD_SAD_SKIP_WXH_NEON(32, 32)
+HBD_SAD_SKIP_WXH_NEON(32, 64)
+
+HBD_SAD_SKIP_WXH_NEON(64, 32)
+HBD_SAD_SKIP_WXH_NEON(64, 64)
+
+#undef HBD_SAD_SKIP_WXH_NEON
+
+static INLINE uint32_t highbd_sad4xh_avg_neon(const uint8_t *src_ptr,
+                                              int src_stride,
+                                              const uint8_t *ref_ptr,
+                                              int ref_stride, int h,
+                                              const uint8_t *second_pred) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+  const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+  uint32x4_t sum = vdupq_n_u32(0);
+
+  int i = h;
+  do {
+    uint16x4_t s = vld1_u16(src16_ptr);
+    uint16x4_t r = vld1_u16(ref16_ptr);
+    uint16x4_t p = vld1_u16(pred16_ptr);
+
+    uint16x4_t avg = vrhadd_u16(r, p);
+    sum = vabal_u16(sum, s, avg);
+
+    src16_ptr += src_stride;
+    ref16_ptr += ref_stride;
+    pred16_ptr += 4;
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(sum);
+}
+
+static INLINE uint32_t highbd_sad8xh_avg_neon(const uint8_t *src_ptr,
+                                              int src_stride,
+                                              const uint8_t *ref_ptr,
+                                              int ref_stride, int h,
+                                              const uint8_t *second_pred) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+  const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+  uint32x4_t sum = vdupq_n_u32(0);
+
+  int i = h;
+  do {
+    uint16x8_t s = vld1q_u16(src16_ptr);
+    uint16x8_t r = vld1q_u16(ref16_ptr);
+    uint16x8_t p = vld1q_u16(pred16_ptr);
+
+    uint16x8_t avg = vrhaddq_u16(r, p);
+    uint16x8_t diff = vabdq_u16(s, avg);
+    sum = vpadalq_u16(sum, diff);
+
+    src16_ptr += src_stride;
+    ref16_ptr += ref_stride;
+    pred16_ptr += 8;
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(sum);
+}
+
+static INLINE uint32_t highbd_sad16xh_avg_neon(const uint8_t *src_ptr,
+                                               int src_stride,
+                                               const uint8_t *ref_ptr,
+                                               int ref_stride, int h,
+                                               const uint8_t *second_pred) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+  const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h;
+  do {
+    uint16x8_t s0, s1, r0, r1, p0, p1;
+    uint16x8_t avg0, avg1, diff0, diff1;
+
+    s0 = vld1q_u16(src16_ptr);
+    r0 = vld1q_u16(ref16_ptr);
+    p0 = vld1q_u16(pred16_ptr);
+    avg0 = vrhaddq_u16(r0, p0);
+    diff0 = vabdq_u16(s0, avg0);
+    sum[0] = vpadalq_u16(sum[0], diff0);
+
+    s1 = vld1q_u16(src16_ptr + 8);
+    r1 = vld1q_u16(ref16_ptr + 8);
+    p1 = vld1q_u16(pred16_ptr + 8);
+    avg1 = vrhaddq_u16(r1, p1);
+    diff1 = vabdq_u16(s1, avg1);
+    sum[1] = vpadalq_u16(sum[1], diff1);
+
+    src16_ptr += src_stride;
+    ref16_ptr += ref_stride;
+    pred16_ptr += 16;
+  } while (--i != 0);
+
+  sum[0] = vaddq_u32(sum[0], sum[1]);
+  return horizontal_add_uint32x4(sum[0]);
+}
+
+static INLINE uint32_t highbd_sadwxh_avg_neon(const uint8_t *src_ptr,
+                                              int src_stride,
+                                              const uint8_t *ref_ptr,
+                                              int ref_stride, int w, int h,
+                                              const uint8_t *second_pred) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+  const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                        vdupq_n_u32(0) };
+
+  int i = h;
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3;
+      uint16x8_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3;
+
+      s0 = vld1q_u16(src16_ptr + j);
+      r0 = vld1q_u16(ref16_ptr + j);
+      p0 = vld1q_u16(pred16_ptr + j);
+      avg0 = vrhaddq_u16(r0, p0);
+      diff0 = vabdq_u16(s0, avg0);
+      sum[0] = vpadalq_u16(sum[0], diff0);
+
+      s1 = vld1q_u16(src16_ptr + j + 8);
+      r1 = vld1q_u16(ref16_ptr + j + 8);
+      p1 = vld1q_u16(pred16_ptr + j + 8);
+      avg1 = vrhaddq_u16(r1, p1);
+      diff1 = vabdq_u16(s1, avg1);
+      sum[1] = vpadalq_u16(sum[1], diff1);
+
+      s2 = vld1q_u16(src16_ptr + j + 16);
+      r2 = vld1q_u16(ref16_ptr + j + 16);
+      p2 = vld1q_u16(pred16_ptr + j + 16);
+      avg2 = vrhaddq_u16(r2, p2);
+      diff2 = vabdq_u16(s2, avg2);
+      sum[2] = vpadalq_u16(sum[2], diff2);
+
+      s3 = vld1q_u16(src16_ptr + j + 24);
+      r3 = vld1q_u16(ref16_ptr + j + 24);
+      p3 = vld1q_u16(pred16_ptr + j + 24);
+      avg3 = vrhaddq_u16(r3, p3);
+      diff3 = vabdq_u16(s3, avg3);
+      sum[3] = vpadalq_u16(sum[3], diff3);
+
+      j += 32;
+    } while (j < w);
+
+    src16_ptr += src_stride;
+    ref16_ptr += ref_stride;
+    pred16_ptr += w;
+  } while (--i != 0);
+
+  sum[0] = vaddq_u32(sum[0], sum[1]);
+  sum[2] = vaddq_u32(sum[2], sum[3]);
+  sum[0] = vaddq_u32(sum[0], sum[2]);
+
+  return horizontal_add_uint32x4(sum[0]);
+}
+
+static INLINE unsigned int highbd_sad64xh_avg_neon(const uint8_t *src_ptr,
+                                                   int src_stride,
+                                                   const uint8_t *ref_ptr,
+                                                   int ref_stride, int h,
+                                                   const uint8_t *second_pred) {
+  return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h,
+                                second_pred);
+}
+
+static INLINE unsigned int highbd_sad32xh_avg_neon(const uint8_t *src_ptr,
+                                                   int src_stride,
+                                                   const uint8_t *ref_ptr,
+                                                   int ref_stride, int h,
+                                                   const uint8_t *second_pred) {
+  return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h,
+                                second_pred);
+}
+
+#define HBD_SAD_WXH_AVG_NEON(w, h)                                            \
+  uint32_t vpx_highbd_sad##w##x##h##_avg_neon(                                \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      const uint8_t *second_pred) {                                           \
+    return highbd_sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h),  \
+                                      second_pred);                           \
+  }
+
+HBD_SAD_WXH_AVG_NEON(4, 4)
+HBD_SAD_WXH_AVG_NEON(4, 8)
+
+HBD_SAD_WXH_AVG_NEON(8, 4)
+HBD_SAD_WXH_AVG_NEON(8, 8)
+HBD_SAD_WXH_AVG_NEON(8, 16)
+
+HBD_SAD_WXH_AVG_NEON(16, 8)
+HBD_SAD_WXH_AVG_NEON(16, 16)
+HBD_SAD_WXH_AVG_NEON(16, 32)
+
+HBD_SAD_WXH_AVG_NEON(32, 16)
+HBD_SAD_WXH_AVG_NEON(32, 32)
+HBD_SAD_WXH_AVG_NEON(32, 64)
+
+HBD_SAD_WXH_AVG_NEON(64, 32)
+HBD_SAD_WXH_AVG_NEON(64, 64)
diff --git a/vpx_dsp/arm/highbd_sse_neon.c b/vpx_dsp/arm/highbd_sse_neon.c
new file mode 100644
index 0000000000..91dfebf900
--- /dev/null
+++ b/vpx_dsp/arm/highbd_sse_neon.c
@@ -0,0 +1,238 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <stdint.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+static INLINE void highbd_sse_8x1_init_neon(const uint16_t *src,
+                                            const uint16_t *ref,
+                                            uint32x4_t *sse_acc0,
+                                            uint32x4_t *sse_acc1) {
+  uint16x8_t s = vld1q_u16(src);
+  uint16x8_t r = vld1q_u16(ref);
+
+  uint16x8_t abs_diff = vabdq_u16(s, r);
+  uint16x4_t abs_diff_lo = vget_low_u16(abs_diff);
+  uint16x4_t abs_diff_hi = vget_high_u16(abs_diff);
+
+  *sse_acc0 = vmull_u16(abs_diff_lo, abs_diff_lo);
+  *sse_acc1 = vmull_u16(abs_diff_hi, abs_diff_hi);
+}
+
+static INLINE void highbd_sse_8x1_neon(const uint16_t *src, const uint16_t *ref,
+                                       uint32x4_t *sse_acc0,
+                                       uint32x4_t *sse_acc1) {
+  uint16x8_t s = vld1q_u16(src);
+  uint16x8_t r = vld1q_u16(ref);
+
+  uint16x8_t abs_diff = vabdq_u16(s, r);
+  uint16x4_t abs_diff_lo = vget_low_u16(abs_diff);
+  uint16x4_t abs_diff_hi = vget_high_u16(abs_diff);
+
+  *sse_acc0 = vmlal_u16(*sse_acc0, abs_diff_lo, abs_diff_lo);
+  *sse_acc1 = vmlal_u16(*sse_acc1, abs_diff_hi, abs_diff_hi);
+}
+
+static INLINE int64_t highbd_sse_64xh_neon(const uint16_t *src, int src_stride,
+                                           const uint16_t *ref, int ref_stride,
+                                           int height) {
+  uint32x4_t sse[8];
+  highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+  highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+  highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+  highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+  highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0], &sse[1]);
+  highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[2], &sse[3]);
+  highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[4], &sse[5]);
+  highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[6], &sse[7]);
+
+  src += src_stride;
+  ref += ref_stride;
+
+  while (--height != 0) {
+    highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+    highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+    highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+    highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+    highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0], &sse[1]);
+    highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[2], &sse[3]);
+    highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[4], &sse[5]);
+    highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[6], &sse[7]);
+
+    src += src_stride;
+    ref += ref_stride;
+  }
+
+  return horizontal_long_add_uint32x4_x8(sse);
+}
+
+static INLINE int64_t highbd_sse_32xh_neon(const uint16_t *src, int src_stride,
+                                           const uint16_t *ref, int ref_stride,
+                                           int height) {
+  uint32x4_t sse[8];
+  highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+  highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+  highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+  highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+
+  src += src_stride;
+  ref += ref_stride;
+
+  while (--height != 0) {
+    highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+    highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+    highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+    highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+
+    src += src_stride;
+    ref += ref_stride;
+  }
+
+  return horizontal_long_add_uint32x4_x8(sse);
+}
+
+static INLINE int64_t highbd_sse_16xh_neon(const uint16_t *src, int src_stride,
+                                           const uint16_t *ref, int ref_stride,
+                                           int height) {
+  uint32x4_t sse[4];
+  highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+  highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+
+  src += src_stride;
+  ref += ref_stride;
+
+  while (--height != 0) {
+    highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+    highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+
+    src += src_stride;
+    ref += ref_stride;
+  }
+
+  return horizontal_long_add_uint32x4_x4(sse);
+}
+
+static INLINE int64_t highbd_sse_8xh_neon(const uint16_t *src, int src_stride,
+                                          const uint16_t *ref, int ref_stride,
+                                          int height) {
+  uint32x4_t sse[2];
+  highbd_sse_8x1_init_neon(src, ref, &sse[0], &sse[1]);
+
+  src += src_stride;
+  ref += ref_stride;
+
+  while (--height != 0) {
+    highbd_sse_8x1_neon(src, ref, &sse[0], &sse[1]);
+
+    src += src_stride;
+    ref += ref_stride;
+  }
+
+  return horizontal_long_add_uint32x4_x2(sse);
+}
+
+static INLINE int64_t highbd_sse_4xh_neon(const uint16_t *src, int src_stride,
+                                          const uint16_t *ref, int ref_stride,
+                                          int height) {
+  // Peel the first loop iteration.
+  uint16x4_t s = vld1_u16(src);
+  uint16x4_t r = vld1_u16(ref);
+
+  uint16x4_t abs_diff = vabd_u16(s, r);
+  uint32x4_t sse = vmull_u16(abs_diff, abs_diff);
+
+  src += src_stride;
+  ref += ref_stride;
+
+  while (--height != 0) {
+    s = vld1_u16(src);
+    r = vld1_u16(ref);
+
+    abs_diff = vabd_u16(s, r);
+    sse = vmlal_u16(sse, abs_diff, abs_diff);
+
+    src += src_stride;
+    ref += ref_stride;
+  }
+
+  return horizontal_long_add_uint32x4(sse);
+}
+
+static INLINE int64_t highbd_sse_wxh_neon(const uint16_t *src, int src_stride,
+                                          const uint16_t *ref, int ref_stride,
+                                          int width, int height) {
+  // { 0, 1, 2, 3, 4, 5, 6, 7 }
+  uint16x8_t k01234567 = vmovl_u8(vcreate_u8(0x0706050403020100));
+  uint16x8_t remainder_mask = vcltq_u16(k01234567, vdupq_n_u16(width & 7));
+  uint64_t sse = 0;
+
+  do {
+    int w = width;
+    int offset = 0;
+
+    do {
+      uint16x8_t s = vld1q_u16(src + offset);
+      uint16x8_t r = vld1q_u16(ref + offset);
+      uint16x8_t abs_diff;
+      uint16x4_t abs_diff_lo;
+      uint16x4_t abs_diff_hi;
+      uint32x4_t sse_u32;
+
+      if (w < 8) {
+        // Mask out-of-range elements.
+        s = vandq_u16(s, remainder_mask);
+        r = vandq_u16(r, remainder_mask);
+      }
+
+      abs_diff = vabdq_u16(s, r);
+      abs_diff_lo = vget_low_u16(abs_diff);
+      abs_diff_hi = vget_high_u16(abs_diff);
+
+      sse_u32 = vmull_u16(abs_diff_lo, abs_diff_lo);
+      sse_u32 = vmlal_u16(sse_u32, abs_diff_hi, abs_diff_hi);
+
+      sse += horizontal_long_add_uint32x4(sse_u32);
+
+      offset += 8;
+      w -= 8;
+    } while (w > 0);
+
+    src += src_stride;
+    ref += ref_stride;
+  } while (--height != 0);
+
+  return sse;
+}
+
+int64_t vpx_highbd_sse_neon(const uint8_t *src8, int src_stride,
+                            const uint8_t *ref8, int ref_stride, int width,
+                            int height) {
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+
+  switch (width) {
+    case 4:
+      return highbd_sse_4xh_neon(src, src_stride, ref, ref_stride, height);
+    case 8:
+      return highbd_sse_8xh_neon(src, src_stride, ref, ref_stride, height);
+    case 16:
+      return highbd_sse_16xh_neon(src, src_stride, ref, ref_stride, height);
+    case 32:
+      return highbd_sse_32xh_neon(src, src_stride, ref, ref_stride, height);
+    case 64:
+      return highbd_sse_64xh_neon(src, src_stride, ref, ref_stride, height);
+    default:
+      return highbd_sse_wxh_neon(src, src_stride, ref, ref_stride, width,
+                                 height);
+  }
+}
diff --git a/vpx_dsp/arm/highbd_subpel_variance_neon.c b/vpx_dsp/arm/highbd_subpel_variance_neon.c
new file mode 100644
index 0000000000..683df5797a
--- /dev/null
+++ b/vpx_dsp/arm/highbd_subpel_variance_neon.c
@@ -0,0 +1,586 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+
+// The bilinear filters look like this:
+//
+// {{ 128,  0 }, { 112, 16 }, { 96, 32 }, { 80,  48 },
+//  {  64, 64 }, {  48, 80 }, { 32, 96 }, { 16, 112 }}
+//
+// We can factor out the highest common multiple, such that the sum of both
+// weights will be 8 instead of 128. The benefits of this are two-fold:
+//
+// 1) We can infer the filter values from the filter_offset parameter in the
+// bilinear filter functions below - we don't have to actually load the values
+// from memory:
+// f0 = 8 - filter_offset
+// f1 = filter_offset
+//
+// 2) Scaling the pixel values by 8, instead of 128 enables us to operate on
+// 16-bit data types at all times, rather than widening out to 32-bit and
+// requiring double the number of data processing instructions. (12-bit * 8 =
+// 15-bit.)
+
+// Process a block exactly 4 wide and any height.
+static void highbd_var_filter_block2d_bil_w4(const uint16_t *src_ptr,
+                                             uint16_t *dst_ptr, int src_stride,
+                                             int pixel_step, int dst_height,
+                                             int filter_offset) {
+  const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
+  const uint16x4_t f1 = vdup_n_u16(filter_offset);
+
+  int i = dst_height;
+  do {
+    uint16x4_t s0 = load_unaligned_u16(src_ptr);
+    uint16x4_t s1 = load_unaligned_u16(src_ptr + pixel_step);
+
+    uint16x4_t blend = vmul_u16(s0, f0);
+    blend = vmla_u16(blend, s1, f1);
+    blend = vrshr_n_u16(blend, 3);
+
+    vst1_u16(dst_ptr, blend);
+
+    src_ptr += src_stride;
+    dst_ptr += 4;
+  } while (--i != 0);
+}
+
+// Process a block which is a multiple of 8 and any height.
+static void highbd_var_filter_block2d_bil_large(const uint16_t *src_ptr,
+                                                uint16_t *dst_ptr,
+                                                int src_stride, int pixel_step,
+                                                int dst_width, int dst_height,
+                                                int filter_offset) {
+  const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
+  const uint16x8_t f1 = vdupq_n_u16(filter_offset);
+
+  int i = dst_height;
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s0 = vld1q_u16(src_ptr + j);
+      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+
+      uint16x8_t blend = vmulq_u16(s0, f0);
+      blend = vmlaq_u16(blend, s1, f1);
+      blend = vrshrq_n_u16(blend, 3);
+
+      vst1q_u16(dst_ptr + j, blend);
+
+      j += 8;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
+static void highbd_var_filter_block2d_bil_w8(const uint16_t *src_ptr,
+                                             uint16_t *dst_ptr, int src_stride,
+                                             int pixel_step, int dst_height,
+                                             int filter_offset) {
+  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+                                      8, dst_height, filter_offset);
+}
+static void highbd_var_filter_block2d_bil_w16(const uint16_t *src_ptr,
+                                              uint16_t *dst_ptr, int src_stride,
+                                              int pixel_step, int dst_height,
+                                              int filter_offset) {
+  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+                                      16, dst_height, filter_offset);
+}
+static void highbd_var_filter_block2d_bil_w32(const uint16_t *src_ptr,
+                                              uint16_t *dst_ptr, int src_stride,
+                                              int pixel_step, int dst_height,
+                                              int filter_offset) {
+  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+                                      32, dst_height, filter_offset);
+}
+static void highbd_var_filter_block2d_bil_w64(const uint16_t *src_ptr,
+                                              uint16_t *dst_ptr, int src_stride,
+                                              int pixel_step, int dst_height,
+                                              int filter_offset) {
+  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+                                      64, dst_height, filter_offset);
+}
+
+static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr,
+                                          uint16_t *dst_ptr, int src_stride,
+                                          int pixel_step, int dst_width,
+                                          int dst_height) {
+  int i = dst_height;
+
+  // We only specialize on the filter values for large block sizes (>= 16x16.)
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s0 = vld1q_u16(src_ptr + j);
+      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+      uint16x8_t avg = vrhaddq_u16(s0, s1);
+      vst1q_u16(dst_ptr + j, avg);
+
+      j += 8;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
+#define HBD_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)                           \
+  unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon(     \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
+      const uint8_t *ref, int ref_stride, uint32_t *sse) {                     \
+    uint16_t tmp0[w * (h + 1)];                                                \
+    uint16_t tmp1[w * h];                                                      \
+    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                              \
+                                                                               \
+    highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1),  \
+                                       xoffset);                               \
+    highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);          \
+                                                                               \
+    return vpx_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \
+                                                     w, ref, ref_stride, sse); \
+  }
+
+#define HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)               \
+  unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon(     \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
+      const uint8_t *ref, int ref_stride, unsigned int *sse) {                 \
+    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                              \
+                                                                               \
+    if (xoffset == 0) {                                                        \
+      if (yoffset == 0) {                                                      \
+        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse);    \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp[w * h];                                                   \
+        highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride, w, \
+                                      h);                                      \
+        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
+      } else {                                                                 \
+        uint16_t tmp[w * h];                                                   \
+        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride,           \
+                                           src_stride, h, yoffset);            \
+        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
+      }                                                                        \
+    } else if (xoffset == 4) {                                                 \
+      uint16_t tmp0[w * (h + 1)];                                              \
+      if (yoffset == 0) {                                                      \
+        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h);     \
+        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp1[w * (h + 1)];                                            \
+        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w,         \
+                                      (h + 1));                                \
+        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                 \
+        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      } else {                                                                 \
+        uint16_t tmp1[w * (h + 1)];                                            \
+        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w,         \
+                                      (h + 1));                                \
+        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);      \
+        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      }                                                                        \
+    } else {                                                                   \
+      uint16_t tmp0[w * (h + 1)];                                              \
+      if (yoffset == 0) {                                                      \
+        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h,    \
+                                           xoffset);                           \
+        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp1[w * h];                                                  \
+        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1,       \
+                                           (h + 1), xoffset);                  \
+        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                 \
+        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      } else {                                                                 \
+        uint16_t tmp1[w * h];                                                  \
+        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1,       \
+                                           (h + 1), xoffset);                  \
+        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);      \
+        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      }                                                                        \
+    }                                                                          \
+  }
+
+// 8-bit
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
+
+// 10-bit
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
+
+// 12-bit
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
+
+// Combine bilinear filter with vpx_highbd_comp_avg_pred for blocks having
+// width 4.
+static void highbd_avg_pred_var_filter_block2d_bil_w4(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred) {
+  const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
+  const uint16x4_t f1 = vdup_n_u16(filter_offset);
+
+  int i = dst_height;
+  do {
+    uint16x4_t s0 = load_unaligned_u16(src_ptr);
+    uint16x4_t s1 = load_unaligned_u16(src_ptr + pixel_step);
+    uint16x4_t p = vld1_u16(second_pred);
+
+    uint16x4_t blend = vmul_u16(s0, f0);
+    blend = vmla_u16(blend, s1, f1);
+    blend = vrshr_n_u16(blend, 3);
+
+    vst1_u16(dst_ptr, vrhadd_u16(blend, p));
+
+    src_ptr += src_stride;
+    dst_ptr += 4;
+    second_pred += 4;
+  } while (--i != 0);
+}
+
+// Combine bilinear filter with vpx_highbd_comp_avg_pred for large blocks.
+static void highbd_avg_pred_var_filter_block2d_bil_large(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_width, int dst_height, int filter_offset,
+    const uint16_t *second_pred) {
+  const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
+  const uint16x8_t f1 = vdupq_n_u16(filter_offset);
+
+  int i = dst_height;
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s0 = vld1q_u16(src_ptr + j);
+      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+      uint16x8_t p = vld1q_u16(second_pred);
+
+      uint16x8_t blend = vmulq_u16(s0, f0);
+      blend = vmlaq_u16(blend, s1, f1);
+      blend = vrshrq_n_u16(blend, 3);
+
+      vst1q_u16(dst_ptr + j, vrhaddq_u16(blend, p));
+
+      j += 8;
+      second_pred += 8;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
+static void highbd_avg_pred_var_filter_block2d_bil_w8(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred) {
+  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+                                               pixel_step, 8, dst_height,
+                                               filter_offset, second_pred);
+}
+static void highbd_avg_pred_var_filter_block2d_bil_w16(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred) {
+  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+                                               pixel_step, 16, dst_height,
+                                               filter_offset, second_pred);
+}
+static void highbd_avg_pred_var_filter_block2d_bil_w32(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred) {
+  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+                                               pixel_step, 32, dst_height,
+                                               filter_offset, second_pred);
+}
+static void highbd_avg_pred_var_filter_block2d_bil_w64(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred) {
+  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+                                               pixel_step, 64, dst_height,
+                                               filter_offset, second_pred);
+}
+
+// Combine averaging subpel filter with vpx_highbd_comp_avg_pred.
+static void highbd_avg_pred_var_filter_block2d_avg(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_width, int dst_height, const uint16_t *second_pred) {
+  int i = dst_height;
+
+  // We only specialize on the filter values for large block sizes (>= 16x16.)
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s0 = vld1q_u16(src_ptr + j);
+      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+      uint16x8_t avg = vrhaddq_u16(s0, s1);
+
+      uint16x8_t p = vld1q_u16(second_pred);
+      avg = vrhaddq_u16(avg, p);
+
+      vst1q_u16(dst_ptr + j, avg);
+
+      j += 8;
+      second_pred += 8;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
+// Implementation of vpx_highbd_comp_avg_pred for blocks having width >= 16.
+static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
+                            int src_stride, int dst_width, int dst_height,
+                            const uint16_t *second_pred) {
+  int i = dst_height;
+
+  // We only specialize on the filter values for large block sizes (>= 16x16.)
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s = vld1q_u16(src_ptr + j);
+      uint16x8_t p = vld1q_u16(second_pred);
+
+      uint16x8_t avg = vrhaddq_u16(s, p);
+
+      vst1q_u16(dst_ptr + j, avg);
+
+      j += 8;
+      second_pred += 8;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
+#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h)                      \
+  uint32_t vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon(    \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
+      const uint8_t *ref, int ref_stride, uint32_t *sse,                      \
+      const uint8_t *second_pred) {                                           \
+    uint16_t tmp0[w * (h + 1)];                                               \
+    uint16_t tmp1[w * h];                                                     \
+    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                             \
+                                                                              \
+    highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \
+                                       xoffset);                              \
+    highbd_avg_pred_var_filter_block2d_bil_w##w(                              \
+        tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred));      \
+                                                                              \
+    return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(                  \
+        CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                   \
+  }
+
+#define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h)           \
+  unsigned int vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \
+      const uint8_t *src, int source_stride, int xoffset, int yoffset,         \
+      const uint8_t *ref, int ref_stride, unsigned int *sse,                   \
+      const uint8_t *second_pred) {                                            \
+    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                              \
+                                                                               \
+    if (xoffset == 0) {                                                        \
+      uint16_t tmp[w * h];                                                     \
+      if (yoffset == 0) {                                                      \
+        highbd_avg_pred(src_ptr, tmp, source_stride, w, h,                     \
+                        CONVERT_TO_SHORTPTR(second_pred));                     \
+        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
+      } else if (yoffset == 4) {                                               \
+        highbd_avg_pred_var_filter_block2d_avg(                                \
+            src_ptr, tmp, source_stride, source_stride, w, h,                  \
+            CONVERT_TO_SHORTPTR(second_pred));                                 \
+        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
+      } else {                                                                 \
+        highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
+            src_ptr, tmp, source_stride, source_stride, h, yoffset,            \
+            CONVERT_TO_SHORTPTR(second_pred));                                 \
+        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
+      }                                                                        \
+    } else if (xoffset == 4) {                                                 \
+      uint16_t tmp0[w * (h + 1)];                                              \
+      if (yoffset == 0) {                                                      \
+        highbd_avg_pred_var_filter_block2d_avg(                                \
+            src_ptr, tmp0, source_stride, 1, w, h,                             \
+            CONVERT_TO_SHORTPTR(second_pred));                                 \
+        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp1[w * (h + 1)];                                            \
+        highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w,      \
+                                      (h + 1));                                \
+        highbd_avg_pred_var_filter_block2d_avg(                                \
+            tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred));         \
+        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      } else {                                                                 \
+        uint16_t tmp1[w * (h + 1)];                                            \
+        highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w,      \
+                                      (h + 1));                                \
+        highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
+            tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred));   \
+        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      }                                                                        \
+    } else {                                                                   \
+      uint16_t tmp0[w * (h + 1)];                                              \
+      if (yoffset == 0) {                                                      \
+        highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
+            src_ptr, tmp0, source_stride, 1, h, xoffset,                       \
+            CONVERT_TO_SHORTPTR(second_pred));                                 \
+        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp1[w * h];                                                  \
+        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1,    \
+                                           (h + 1), xoffset);                  \
+        highbd_avg_pred_var_filter_block2d_avg(                                \
+            tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred));         \
+        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      } else {                                                                 \
+        uint16_t tmp1[w * h];                                                  \
+        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1,    \
+                                           (h + 1), xoffset);                  \
+        highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
+            tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred));   \
+        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      }                                                                        \
+    }                                                                          \
+  }
+
+// 8-bit
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64)
+
+// 10-bit
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64)
+
+// 12-bit
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64)
diff --git a/vpx_dsp/arm/highbd_variance_neon.c b/vpx_dsp/arm/highbd_variance_neon.c
new file mode 100644
index 0000000000..309ae7fd35
--- /dev/null
+++ b/vpx_dsp/arm/highbd_variance_neon.c
@@ -0,0 +1,436 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+#include "vpx_ports/mem.h"
+
+// Process a block of width 4 two rows at a time.
+static INLINE void highbd_variance_4xh_neon(const uint16_t *src_ptr,
+                                            int src_stride,
+                                            const uint16_t *ref_ptr,
+                                            int ref_stride, int h,
+                                            uint64_t *sse, int64_t *sum) {
+  int16x8_t sum_s16 = vdupq_n_s16(0);
+  int32x4_t sse_s32 = vdupq_n_s32(0);
+
+  int i = h;
+  do {
+    const uint16x8_t s = load_unaligned_u16q(src_ptr, src_stride);
+    const uint16x8_t r = load_unaligned_u16q(ref_ptr, ref_stride);
+
+    int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r));
+    sum_s16 = vaddq_s16(sum_s16, diff);
+
+    sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff), vget_low_s16(diff));
+    sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff), vget_high_s16(diff));
+
+    src_ptr += 2 * src_stride;
+    ref_ptr += 2 * ref_stride;
+    i -= 2;
+  } while (i != 0);
+
+  *sum = horizontal_add_int16x8(sum_s16);
+  *sse = horizontal_add_int32x4(sse_s32);
+}
+
+// For 8-bit and 10-bit data, since we're using two int32x4 accumulators, all
+// block sizes can be processed in 32-bit elements (1023*1023*64*16 = 1071645696
+// for a 64x64 block).
+static INLINE void highbd_variance_large_neon(const uint16_t *src_ptr,
+                                              int src_stride,
+                                              const uint16_t *ref_ptr,
+                                              int ref_stride, int w, int h,
+                                              uint64_t *sse, int64_t *sum) {
+  int32x4_t sum_s32 = vdupq_n_s32(0);
+  int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+  int i = h;
+  do {
+    int j = 0;
+    do {
+      const uint16x8_t s = vld1q_u16(src_ptr + j);
+      const uint16x8_t r = vld1q_u16(ref_ptr + j);
+
+      const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r));
+      sum_s32 = vpadalq_s16(sum_s32, diff);
+
+      sse_s32[0] =
+          vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff));
+      sse_s32[1] =
+          vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff));
+
+      j += 8;
+    } while (j < w);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  *sum = horizontal_add_int32x4(sum_s32);
+  *sse = horizontal_long_add_uint32x4(vaddq_u32(
+      vreinterpretq_u32_s32(sse_s32[0]), vreinterpretq_u32_s32(sse_s32[1])));
+}
+
+static INLINE void highbd_variance_8xh_neon(const uint16_t *src, int src_stride,
+                                            const uint16_t *ref, int ref_stride,
+                                            int h, uint64_t *sse,
+                                            int64_t *sum) {
+  highbd_variance_large_neon(src, src_stride, ref, ref_stride, 8, h, sse, sum);
+}
+
+static INLINE void highbd_variance_16xh_neon(const uint16_t *src,
+                                             int src_stride,
+                                             const uint16_t *ref,
+                                             int ref_stride, int h,
+                                             uint64_t *sse, int64_t *sum) {
+  highbd_variance_large_neon(src, src_stride, ref, ref_stride, 16, h, sse, sum);
+}
+
+static INLINE void highbd_variance_32xh_neon(const uint16_t *src,
+                                             int src_stride,
+                                             const uint16_t *ref,
+                                             int ref_stride, int h,
+                                             uint64_t *sse, int64_t *sum) {
+  highbd_variance_large_neon(src, src_stride, ref, ref_stride, 32, h, sse, sum);
+}
+
+static INLINE void highbd_variance_64xh_neon(const uint16_t *src,
+                                             int src_stride,
+                                             const uint16_t *ref,
+                                             int ref_stride, int h,
+                                             uint64_t *sse, int64_t *sum) {
+  highbd_variance_large_neon(src, src_stride, ref, ref_stride, 64, h, sse, sum);
+}
+
+// For 12-bit data, we can only accumulate up to 128 elements in the sum of
+// squares (4095*4095*128 = 2146435200), and because we're using two int32x4
+// accumulators, we can only process up to 32 32-element rows (32*32/8 = 128)
+// or 16 64-element rows before we have to accumulate into 64-bit elements.
+// Therefore blocks of size 32x64, 64x32 and 64x64 are processed in a different
+// helper function.
+
+// Process a block of any size where the width is divisible by 8, with
+// accumulation into 64-bit elements.
+static INLINE void highbd_variance_xlarge_neon(
+    const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr,
+    int ref_stride, int w, int h, int h_limit, uint64_t *sse, int64_t *sum) {
+  int32x4_t sum_s32 = vdupq_n_s32(0);
+  int64x2_t sse_s64 = vdupq_n_s64(0);
+
+  // 'h_limit' is the number of 'w'-width rows we can process before our 32-bit
+  // accumulator overflows. After hitting this limit we accumulate into 64-bit
+  // elements.
+  int h_tmp = h > h_limit ? h_limit : h;
+
+  int i = 0;
+  do {
+    int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+    do {
+      int j = 0;
+      do {
+        const uint16x8_t s0 = vld1q_u16(src_ptr + j);
+        const uint16x8_t r0 = vld1q_u16(ref_ptr + j);
+
+        const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s0, r0));
+        sum_s32 = vpadalq_s16(sum_s32, diff);
+
+        sse_s32[0] =
+            vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff));
+        sse_s32[1] =
+            vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff));
+
+        j += 8;
+      } while (j < w);
+
+      src_ptr += src_stride;
+      ref_ptr += ref_stride;
+      i++;
+    } while (i < h_tmp);
+
+    sse_s64 = vpadalq_s32(sse_s64, sse_s32[0]);
+    sse_s64 = vpadalq_s32(sse_s64, sse_s32[1]);
+    h_tmp += h_limit;
+  } while (i < h);
+
+  *sum = horizontal_add_int32x4(sum_s32);
+  *sse = (uint64_t)horizontal_add_int64x2(sse_s64);
+}
+
+static INLINE void highbd_variance_32xh_xlarge_neon(
+    const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride,
+    int h, uint64_t *sse, int64_t *sum) {
+  highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, 32, h, 32, sse,
+                              sum);
+}
+
+static INLINE void highbd_variance_64xh_xlarge_neon(
+    const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride,
+    int h, uint64_t *sse, int64_t *sum) {
+  highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, 64, h, 16, sse,
+                              sum);
+}
+
+#define HBD_VARIANCE_WXH_8_NEON(w, h)                                 \
+  uint32_t vpx_highbd_8_variance##w##x##h##_neon(                     \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+      int ref_stride, uint32_t *sse) {                                \
+    int sum;                                                          \
+    uint64_t sse_long = 0;                                            \
+    int64_t sum_long = 0;                                             \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                     \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                     \
+    highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \
+                                 &sse_long, &sum_long);               \
+    *sse = (uint32_t)sse_long;                                        \
+    sum = (int)sum_long;                                              \
+    return *sse - (uint32_t)(((int64_t)sum * sum) / (w * h));         \
+  }
+
+#define HBD_VARIANCE_WXH_10_NEON(w, h)                                \
+  uint32_t vpx_highbd_10_variance##w##x##h##_neon(                    \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+      int ref_stride, uint32_t *sse) {                                \
+    int sum;                                                          \
+    int64_t var;                                                      \
+    uint64_t sse_long = 0;                                            \
+    int64_t sum_long = 0;                                             \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                     \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                     \
+    highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \
+                                 &sse_long, &sum_long);               \
+    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);                 \
+    sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);                       \
+    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h));         \
+    return (var >= 0) ? (uint32_t)var : 0;                            \
+  }
+
+#define HBD_VARIANCE_WXH_12_NEON(w, h)                                \
+  uint32_t vpx_highbd_12_variance##w##x##h##_neon(                    \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+      int ref_stride, uint32_t *sse) {                                \
+    int sum;                                                          \
+    int64_t var;                                                      \
+    uint64_t sse_long = 0;                                            \
+    int64_t sum_long = 0;                                             \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                     \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                     \
+    highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \
+                                 &sse_long, &sum_long);               \
+    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);                 \
+    sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);                       \
+    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h));         \
+    return (var >= 0) ? (uint32_t)var : 0;                            \
+  }
+
+#define HBD_VARIANCE_WXH_12_XLARGE_NEON(w, h)                                \
+  uint32_t vpx_highbd_12_variance##w##x##h##_neon(                           \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,        \
+      int ref_stride, uint32_t *sse) {                                       \
+    int sum;                                                                 \
+    int64_t var;                                                             \
+    uint64_t sse_long = 0;                                                   \
+    int64_t sum_long = 0;                                                    \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                            \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                            \
+    highbd_variance_##w##xh_xlarge_neon(src, src_stride, ref, ref_stride, h, \
+                                        &sse_long, &sum_long);               \
+    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);                        \
+    sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);                              \
+    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h));                \
+    return (var >= 0) ? (uint32_t)var : 0;                                   \
+  }
+
+// 8-bit
+HBD_VARIANCE_WXH_8_NEON(4, 4)
+HBD_VARIANCE_WXH_8_NEON(4, 8)
+
+HBD_VARIANCE_WXH_8_NEON(8, 4)
+HBD_VARIANCE_WXH_8_NEON(8, 8)
+HBD_VARIANCE_WXH_8_NEON(8, 16)
+
+HBD_VARIANCE_WXH_8_NEON(16, 8)
+HBD_VARIANCE_WXH_8_NEON(16, 16)
+HBD_VARIANCE_WXH_8_NEON(16, 32)
+
+HBD_VARIANCE_WXH_8_NEON(32, 16)
+HBD_VARIANCE_WXH_8_NEON(32, 32)
+HBD_VARIANCE_WXH_8_NEON(32, 64)
+
+HBD_VARIANCE_WXH_8_NEON(64, 32)
+HBD_VARIANCE_WXH_8_NEON(64, 64)
+
+// 10-bit
+HBD_VARIANCE_WXH_10_NEON(4, 4)
+HBD_VARIANCE_WXH_10_NEON(4, 8)
+
+HBD_VARIANCE_WXH_10_NEON(8, 4)
+HBD_VARIANCE_WXH_10_NEON(8, 8)
+HBD_VARIANCE_WXH_10_NEON(8, 16)
+
+HBD_VARIANCE_WXH_10_NEON(16, 8)
+HBD_VARIANCE_WXH_10_NEON(16, 16)
+HBD_VARIANCE_WXH_10_NEON(16, 32)
+
+HBD_VARIANCE_WXH_10_NEON(32, 16)
+HBD_VARIANCE_WXH_10_NEON(32, 32)
+HBD_VARIANCE_WXH_10_NEON(32, 64)
+
+HBD_VARIANCE_WXH_10_NEON(64, 32)
+HBD_VARIANCE_WXH_10_NEON(64, 64)
+
+// 12-bit
+HBD_VARIANCE_WXH_12_NEON(4, 4)
+HBD_VARIANCE_WXH_12_NEON(4, 8)
+
+HBD_VARIANCE_WXH_12_NEON(8, 4)
+HBD_VARIANCE_WXH_12_NEON(8, 8)
+HBD_VARIANCE_WXH_12_NEON(8, 16)
+
+HBD_VARIANCE_WXH_12_NEON(16, 8)
+HBD_VARIANCE_WXH_12_NEON(16, 16)
+HBD_VARIANCE_WXH_12_NEON(16, 32)
+
+HBD_VARIANCE_WXH_12_NEON(32, 16)
+HBD_VARIANCE_WXH_12_NEON(32, 32)
+HBD_VARIANCE_WXH_12_XLARGE_NEON(32, 64)
+
+HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 32)
+HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 64)
+
+#define HIGHBD_GET_VAR(S)                                             \
+  void vpx_highbd_8_get##S##x##S##var_neon(                           \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+      int ref_stride, uint32_t *sse, int *sum) {                      \
+    uint64_t sse_long = 0;                                            \
+    int64_t sum_long = 0;                                             \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                     \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                     \
+    highbd_variance_##S##xh_neon(src, src_stride, ref, ref_stride, S, \
+                                 &sse_long, &sum_long);               \
+    *sse = (uint32_t)sse_long;                                        \
+    *sum = (int)sum_long;                                             \
+  }                                                                   \
+                                                                      \
+  void vpx_highbd_10_get##S##x##S##var_neon(                          \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+      int ref_stride, uint32_t *sse, int *sum) {                      \
+    uint64_t sse_long = 0;                                            \
+    int64_t sum_long = 0;                                             \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                     \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                     \
+    highbd_variance_##S##xh_neon(src, src_stride, ref, ref_stride, S, \
+                                 &sse_long, &sum_long);               \
+    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);                 \
+    *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);                      \
+  }                                                                   \
+                                                                      \
+  void vpx_highbd_12_get##S##x##S##var_neon(                          \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+      int ref_stride, uint32_t *sse, int *sum) {                      \
+    uint64_t sse_long = 0;                                            \
+    int64_t sum_long = 0;                                             \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                     \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                     \
+    highbd_variance_##S##xh_neon(src, src_stride, ref, ref_stride, S, \
+                                 &sse_long, &sum_long);               \
+    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);                 \
+    *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);                      \
+  }
+
+HIGHBD_GET_VAR(8)
+HIGHBD_GET_VAR(16)
+
+static INLINE uint32_t highbd_mse_wxh_neon(const uint16_t *src_ptr,
+                                           int src_stride,
+                                           const uint16_t *ref_ptr,
+                                           int ref_stride, int w, int h) {
+  uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h;
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s = vld1q_u16(src_ptr + j);
+      uint16x8_t r = vld1q_u16(ref_ptr + j);
+
+      uint16x8_t diff = vabdq_u16(s, r);
+
+      sse_u32[0] =
+          vmlal_u16(sse_u32[0], vget_low_u16(diff), vget_low_u16(diff));
+      sse_u32[1] =
+          vmlal_u16(sse_u32[1], vget_high_u16(diff), vget_high_u16(diff));
+
+      j += 8;
+    } while (j < w);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
+}
+
+static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr,
+                                            int src_stride,
+                                            const uint16_t *ref_ptr,
+                                            int ref_stride, int h) {
+  return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 8, h);
+}
+
+static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr,
+                                             int src_stride,
+                                             const uint16_t *ref_ptr,
+                                             int ref_stride, int h) {
+  return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, h);
+}
+
+#define HIGHBD_MSE_WXH_NEON(w, h)                                         \
+  uint32_t vpx_highbd_8_mse##w##x##h##_neon(                              \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,     \
+      int ref_stride, uint32_t *sse) {                                    \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                         \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                         \
+    *sse = highbd_mse8_##w##xh_neon(src, src_stride, ref, ref_stride, h); \
+    return *sse;                                                          \
+  }                                                                       \
+                                                                          \
+  uint32_t vpx_highbd_10_mse##w##x##h##_neon(                             \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,     \
+      int ref_stride, uint32_t *sse) {                                    \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                         \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                         \
+    *sse = highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h);   \
+    *sse = ROUND_POWER_OF_TWO(*sse, 4);                                   \
+    return *sse;                                                          \
+  }                                                                       \
+                                                                          \
+  uint32_t vpx_highbd_12_mse##w##x##h##_neon(                             \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,     \
+      int ref_stride, uint32_t *sse) {                                    \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                         \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                         \
+    *sse = highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h);   \
+    *sse = ROUND_POWER_OF_TWO(*sse, 8);                                   \
+    return *sse;                                                          \
+  }
+
+HIGHBD_MSE_WXH_NEON(16, 16)
+HIGHBD_MSE_WXH_NEON(16, 8)
+HIGHBD_MSE_WXH_NEON(8, 16)
+HIGHBD_MSE_WXH_NEON(8, 8)
+
+#undef HIGHBD_MSE_WXH_NEON
diff --git a/vpx_dsp/arm/highbd_variance_neon_dotprod.c b/vpx_dsp/arm/highbd_variance_neon_dotprod.c
new file mode 100644
index 0000000000..1a88720172
--- /dev/null
+++ b/vpx_dsp/arm/highbd_variance_neon_dotprod.c
@@ -0,0 +1,96 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+#include "vpx_ports/mem.h"
+
+static INLINE uint32_t highbd_mse8_8xh_neon_dotprod(const uint16_t *src_ptr,
+                                                    int src_stride,
+                                                    const uint16_t *ref_ptr,
+                                                    int ref_stride, int h) {
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  int i = h / 2;
+  do {
+    uint16x8_t s0, s1, r0, r1;
+    uint8x16_t s, r, diff;
+
+    s0 = vld1q_u16(src_ptr);
+    src_ptr += src_stride;
+    s1 = vld1q_u16(src_ptr);
+    src_ptr += src_stride;
+    r0 = vld1q_u16(ref_ptr);
+    ref_ptr += ref_stride;
+    r1 = vld1q_u16(ref_ptr);
+    ref_ptr += ref_stride;
+
+    s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
+    r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
+
+    diff = vabdq_u8(s, r);
+    sse_u32 = vdotq_u32(sse_u32, diff, diff);
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(sse_u32);
+}
+
+static INLINE uint32_t highbd_mse8_16xh_neon_dotprod(const uint16_t *src_ptr,
+                                                     int src_stride,
+                                                     const uint16_t *ref_ptr,
+                                                     int ref_stride, int h) {
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  int i = h;
+  do {
+    uint16x8_t s0, s1, r0, r1;
+    uint8x16_t s, r, diff;
+
+    s0 = vld1q_u16(src_ptr);
+    s1 = vld1q_u16(src_ptr + 8);
+    r0 = vld1q_u16(ref_ptr);
+    r1 = vld1q_u16(ref_ptr + 8);
+
+    s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
+    r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
+
+    diff = vabdq_u8(s, r);
+    sse_u32 = vdotq_u32(sse_u32, diff, diff);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(sse_u32);
+}
+
+#define HIGHBD_MSE_WXH_NEON_DOTPROD(w, h)                                      \
+  uint32_t vpx_highbd_8_mse##w##x##h##_neon_dotprod(                           \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,          \
+      int ref_stride, uint32_t *sse) {                                         \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                              \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                              \
+    *sse =                                                                     \
+        highbd_mse8_##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, h); \
+    return *sse;                                                               \
+  }
+
+HIGHBD_MSE_WXH_NEON_DOTPROD(16, 16)
+HIGHBD_MSE_WXH_NEON_DOTPROD(16, 8)
+HIGHBD_MSE_WXH_NEON_DOTPROD(8, 16)
+HIGHBD_MSE_WXH_NEON_DOTPROD(8, 8)
+
+#undef HIGHBD_MSE_WXH_NEON_DOTPROD
diff --git a/vpx_dsp/arm/highbd_vpx_convolve8_neon.c b/vpx_dsp/arm/highbd_vpx_convolve8_neon.c
index c46c016312..47684473ca 100644
--- a/vpx_dsp/arm/highbd_vpx_convolve8_neon.c
+++ b/vpx_dsp/arm/highbd_vpx_convolve8_neon.c
@@ -355,7 +355,6 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src,
   } else {
     const int16x8_t filters = vld1q_s16(filter[x0_q4]);
     const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
-    uint16x8_t t0, t1, t2, t3;
 
     assert(!((intptr_t)dst & 3));
     assert(!(dst_stride & 3));
@@ -365,6 +364,7 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src,
     if (h == 4) {
       int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
       int32x4_t d0, d1, d2, d3;
+      uint16x8_t t0, t1, t2, t3;
       uint16x8_t d01, d23, t01, t23;
 
       __builtin_prefetch(src + 0 * src_stride);
diff --git a/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c b/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c
index 9d2752e097..7751082083 100644
--- a/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c
+++ b/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c
@@ -26,76 +26,88 @@ void vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride,
   (void)bd;
 
   if (w < 8) {  // copy4
+    uint16x4_t s0, s1;
     do {
-      vst1_u16(dst, vld1_u16(src));
+      s0 = vld1_u16(src);
       src += src_stride;
-      dst += dst_stride;
-      vst1_u16(dst, vld1_u16(src));
+      s1 = vld1_u16(src);
       src += src_stride;
+
+      vst1_u16(dst, s0);
+      dst += dst_stride;
+      vst1_u16(dst, s1);
       dst += dst_stride;
       h -= 2;
-    } while (h > 0);
+    } while (h != 0);
   } else if (w == 8) {  // copy8
+    uint16x8_t s0, s1;
     do {
-      vst1q_u16(dst, vld1q_u16(src));
+      s0 = vld1q_u16(src);
       src += src_stride;
-      dst += dst_stride;
-      vst1q_u16(dst, vld1q_u16(src));
+      s1 = vld1q_u16(src);
       src += src_stride;
+
+      vst1q_u16(dst, s0);
+      dst += dst_stride;
+      vst1q_u16(dst, s1);
       dst += dst_stride;
       h -= 2;
-    } while (h > 0);
+    } while (h != 0);
   } else if (w < 32) {  // copy16
+    uint16x8_t s0, s1, s2, s3;
     do {
-      vst2q_u16(dst, vld2q_u16(src));
-      src += src_stride;
-      dst += dst_stride;
-      vst2q_u16(dst, vld2q_u16(src));
+      s0 = vld1q_u16(src);
+      s1 = vld1q_u16(src + 8);
       src += src_stride;
-      dst += dst_stride;
-      vst2q_u16(dst, vld2q_u16(src));
+      s2 = vld1q_u16(src);
+      s3 = vld1q_u16(src + 8);
       src += src_stride;
+
+      vst1q_u16(dst, s0);
+      vst1q_u16(dst + 8, s1);
       dst += dst_stride;
-      vst2q_u16(dst, vld2q_u16(src));
-      src += src_stride;
+      vst1q_u16(dst, s2);
+      vst1q_u16(dst + 8, s3);
       dst += dst_stride;
-      h -= 4;
-    } while (h > 0);
+      h -= 2;
+    } while (h != 0);
   } else if (w == 32) {  // copy32
+    uint16x8_t s0, s1, s2, s3;
     do {
-      vst4q_u16(dst, vld4q_u16(src));
-      src += src_stride;
-      dst += dst_stride;
-      vst4q_u16(dst, vld4q_u16(src));
-      src += src_stride;
-      dst += dst_stride;
-      vst4q_u16(dst, vld4q_u16(src));
-      src += src_stride;
-      dst += dst_stride;
-      vst4q_u16(dst, vld4q_u16(src));
+      s0 = vld1q_u16(src);
+      s1 = vld1q_u16(src + 8);
+      s2 = vld1q_u16(src + 16);
+      s3 = vld1q_u16(src + 24);
       src += src_stride;
+
+      vst1q_u16(dst, s0);
+      vst1q_u16(dst + 8, s1);
+      vst1q_u16(dst + 16, s2);
+      vst1q_u16(dst + 24, s3);
       dst += dst_stride;
-      h -= 4;
-    } while (h > 0);
+    } while (--h != 0);
   } else {  // copy64
+    uint16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
     do {
-      vst4q_u16(dst, vld4q_u16(src));
-      vst4q_u16(dst + 32, vld4q_u16(src + 32));
-      src += src_stride;
-      dst += dst_stride;
-      vst4q_u16(dst, vld4q_u16(src));
-      vst4q_u16(dst + 32, vld4q_u16(src + 32));
+      s0 = vld1q_u16(src);
+      s1 = vld1q_u16(src + 8);
+      s2 = vld1q_u16(src + 16);
+      s3 = vld1q_u16(src + 24);
+      s4 = vld1q_u16(src + 32);
+      s5 = vld1q_u16(src + 40);
+      s6 = vld1q_u16(src + 48);
+      s7 = vld1q_u16(src + 56);
       src += src_stride;
-      dst += dst_stride;
-      vst4q_u16(dst, vld4q_u16(src));
-      vst4q_u16(dst + 32, vld4q_u16(src + 32));
-      src += src_stride;
-      dst += dst_stride;
-      vst4q_u16(dst, vld4q_u16(src));
-      vst4q_u16(dst + 32, vld4q_u16(src + 32));
-      src += src_stride;
-      dst += dst_stride;
-      h -= 4;
-    } while (h > 0);
+
+      vst1q_u16(dst, s0);
+      vst1q_u16(dst + 8, s1);
+      vst1q_u16(dst + 16, s2);
+      vst1q_u16(dst + 24, s3);
+      vst1q_u16(dst + 32, s4);
+      vst1q_u16(dst + 40, s5);
+      vst1q_u16(dst + 48, s6);
+      vst1q_u16(dst + 56, s7);
+      dst += dst_stride;
+    } while (--h != 0);
   }
 }
diff --git a/vpx_dsp/arm/idct4x4_add_neon.asm b/vpx_dsp/arm/idct4x4_add_neon.asm
index 184d218941..175ba7fbc2 100644
--- a/vpx_dsp/arm/idct4x4_add_neon.asm
+++ b/vpx_dsp/arm/idct4x4_add_neon.asm
@@ -17,7 +17,7 @@
 
     INCLUDE vpx_dsp/arm/idct_neon.asm.S
 
-    AREA     Block, CODE, READONLY ; name this block of code
+    AREA     Block, CODE, READONLY
 ;void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int stride)
 ;
 ; r0  int16_t input
diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c
index 38e275834b..4f909e4935 100644
--- a/vpx_dsp/arm/intrapred_neon.c
+++ b/vpx_dsp/arm/intrapred_neon.c
@@ -12,51 +12,47 @@
 
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
+#include "mem_neon.h"
+#include "sum_neon.h"
 #include "vpx/vpx_integer.h"
 
 //------------------------------------------------------------------------------
 // DC 4x4
 
-static INLINE uint16x4_t dc_sum_4(const uint8_t *ref) {
-  const uint8x8_t ref_u8 = vld1_u8(ref);
-  const uint16x4_t p0 = vpaddl_u8(ref_u8);
-  return vpadd_u16(p0, p0);
+static INLINE uint16_t dc_sum_4(const uint8_t *ref) {
+  return horizontal_add_uint8x4(load_unaligned_u8_4x1(ref));
 }
 
 static INLINE void dc_store_4x4(uint8_t *dst, ptrdiff_t stride,
                                 const uint8x8_t dc) {
-  const uint8x8_t dc_dup = vdup_lane_u8(dc, 0);
   int i;
   for (i = 0; i < 4; ++i, dst += stride) {
-    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dc_dup), 0);
+    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dc), 0);
   }
 }
 
 void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
-  const uint8x8_t a = vld1_u8(above);
-  const uint8x8_t l = vld1_u8(left);
-  const uint16x8_t al = vaddl_u8(a, l);
-  uint16x4_t sum;
-  uint8x8_t dc;
-  sum = vpadd_u16(vget_low_u16(al), vget_low_u16(al));
-  sum = vpadd_u16(sum, sum);
-  dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3));
+  const uint8x8_t a = load_unaligned_u8_4x1(above);
+  const uint8x8_t l = load_unaligned_u8_4x1(left);
+  const uint16x4_t al = vget_low_u16(vaddl_u8(a, l));
+  const uint16_t sum = horizontal_add_uint16x4(al);
+  const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 3);
   dc_store_4x4(dst, stride, dc);
 }
 
 void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above, const uint8_t *left) {
-  const uint16x4_t sum = dc_sum_4(left);
-  const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 2));
+  const uint16_t sum = dc_sum_4(left);
+  const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 2);
   (void)above;
   dc_store_4x4(dst, stride, dc);
 }
 
 void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
-  const uint16x4_t sum = dc_sum_4(above);
-  const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 2));
+  const uint16_t sum = dc_sum_4(above);
+  const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 2);
   (void)left;
   dc_store_4x4(dst, stride, dc);
 }
@@ -72,19 +68,15 @@ void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
 //------------------------------------------------------------------------------
 // DC 8x8
 
-static INLINE uint16x4_t dc_sum_8(const uint8_t *ref) {
-  const uint8x8_t ref_u8 = vld1_u8(ref);
-  uint16x4_t sum = vpaddl_u8(ref_u8);
-  sum = vpadd_u16(sum, sum);
-  return vpadd_u16(sum, sum);
+static INLINE uint16_t dc_sum_8(const uint8_t *ref) {
+  return horizontal_add_uint8x8(vld1_u8(ref));
 }
 
 static INLINE void dc_store_8x8(uint8_t *dst, ptrdiff_t stride,
                                 const uint8x8_t dc) {
-  const uint8x8_t dc_dup = vdup_lane_u8(dc, 0);
   int i;
   for (i = 0; i < 8; ++i, dst += stride) {
-    vst1_u8(dst, dc_dup);
+    vst1_u8(dst, dc);
   }
 }
 
@@ -92,28 +84,24 @@ void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
   const uint8x8_t above_u8 = vld1_u8(above);
   const uint8x8_t left_u8 = vld1_u8(left);
-  const uint8x16_t above_and_left = vcombine_u8(above_u8, left_u8);
-  const uint16x8_t p0 = vpaddlq_u8(above_and_left);
-  uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
-  uint8x8_t dc;
-  sum = vpadd_u16(sum, sum);
-  sum = vpadd_u16(sum, sum);
-  dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4));
+  const uint16x8_t al = vaddl_u8(above_u8, left_u8);
+  const uint16_t sum = horizontal_add_uint16x8(al);
+  const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 4);
   dc_store_8x8(dst, stride, dc);
 }
 
 void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above, const uint8_t *left) {
-  const uint16x4_t sum = dc_sum_8(left);
-  const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3));
+  const uint16_t sum = dc_sum_8(left);
+  const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 3);
   (void)above;
   dc_store_8x8(dst, stride, dc);
 }
 
 void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
-  const uint16x4_t sum = dc_sum_8(above);
-  const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3));
+  const uint16_t sum = dc_sum_8(above);
+  const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 3);
   (void)left;
   dc_store_8x8(dst, stride, dc);
 }
@@ -129,20 +117,15 @@ void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
 //------------------------------------------------------------------------------
 // DC 16x16
 
-static INLINE uint16x4_t dc_sum_16(const uint8_t *ref) {
-  const uint8x16_t ref_u8 = vld1q_u8(ref);
-  const uint16x8_t p0 = vpaddlq_u8(ref_u8);
-  uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
-  sum = vpadd_u16(sum, sum);
-  return vpadd_u16(sum, sum);
+static INLINE uint16_t dc_sum_16(const uint8_t *ref) {
+  return horizontal_add_uint8x16(vld1q_u8(ref));
 }
 
 static INLINE void dc_store_16x16(uint8_t *dst, ptrdiff_t stride,
-                                  const uint8x8_t dc) {
-  const uint8x16_t dc_dup = vdupq_lane_u8(dc, 0);
+                                  const uint8x16_t dc) {
   int i;
   for (i = 0; i < 16; ++i, dst += stride) {
-    vst1q_u8(dst, dc_dup);
+    vst1q_u8(dst + 0, dc);
   }
 }
 
@@ -150,22 +133,19 @@ void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
   const uint8x16_t ref0 = vld1q_u8(above);
   const uint8x16_t ref1 = vld1q_u8(left);
-  const uint16x8_t p0 = vpaddlq_u8(ref0);
-  const uint16x8_t p1 = vpaddlq_u8(ref1);
-  const uint16x8_t p2 = vaddq_u16(p0, p1);
-  uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
-  uint8x8_t dc;
-  sum = vpadd_u16(sum, sum);
-  sum = vpadd_u16(sum, sum);
-  dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5));
+  const uint16x8_t a = vpaddlq_u8(ref0);
+  const uint16x8_t l = vpaddlq_u8(ref1);
+  const uint16x8_t al = vaddq_u16(a, l);
+  const uint16_t sum = horizontal_add_uint16x8(al);
+  const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 5), 0);
   dc_store_16x16(dst, stride, dc);
 }
 
 void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
-  const uint16x4_t sum = dc_sum_16(left);
-  const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4));
+  const uint16_t sum = dc_sum_16(left);
+  const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 4), 0);
   (void)above;
   dc_store_16x16(dst, stride, dc);
 }
@@ -173,8 +153,8 @@ void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
 void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
-  const uint16x4_t sum = dc_sum_16(above);
-  const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4));
+  const uint16_t sum = dc_sum_16(above);
+  const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 4), 0);
   (void)left;
   dc_store_16x16(dst, stride, dc);
 }
@@ -182,7 +162,7 @@ void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
 void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
-  const uint8x8_t dc = vdup_n_u8(0x80);
+  const uint8x16_t dc = vdupq_n_u8(0x80);
   (void)above;
   (void)left;
   dc_store_16x16(dst, stride, dc);
@@ -191,51 +171,41 @@ void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
 //------------------------------------------------------------------------------
 // DC 32x32
 
-static INLINE uint16x4_t dc_sum_32(const uint8_t *ref) {
-  const uint8x16x2_t r = vld2q_u8(ref);
-  const uint16x8_t p0 = vpaddlq_u8(r.val[0]);
-  const uint16x8_t p1 = vpaddlq_u8(r.val[1]);
-  const uint16x8_t p2 = vaddq_u16(p0, p1);
-  uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
-  sum = vpadd_u16(sum, sum);
-  return vpadd_u16(sum, sum);
+static INLINE uint16_t dc_sum_32(const uint8_t *ref) {
+  const uint8x16_t r0 = vld1q_u8(ref + 0);
+  const uint8x16_t r1 = vld1q_u8(ref + 16);
+  const uint16x8_t r01 = vaddq_u16(vpaddlq_u8(r0), vpaddlq_u8(r1));
+  return horizontal_add_uint16x8(r01);
 }
 
 static INLINE void dc_store_32x32(uint8_t *dst, ptrdiff_t stride,
-                                  const uint8x8_t dc) {
-  uint8x16x2_t dc_dup;
+                                  const uint8x16_t dc) {
   int i;
-  dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u8(dc, 0);
-
   for (i = 0; i < 32; ++i, dst += stride) {
-    vst2q_u8(dst, dc_dup);
+    vst1q_u8(dst + 0, dc);
+    vst1q_u8(dst + 16, dc);
   }
 }
 
 void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
-  const uint8x16x2_t a = vld2q_u8(above);
-  const uint8x16x2_t l = vld2q_u8(left);
-  const uint16x8_t pa0 = vpaddlq_u8(a.val[0]);
-  const uint16x8_t pl0 = vpaddlq_u8(l.val[0]);
-  const uint16x8_t pa1 = vpaddlq_u8(a.val[1]);
-  const uint16x8_t pl1 = vpaddlq_u8(l.val[1]);
-  const uint16x8_t pa = vaddq_u16(pa0, pa1);
-  const uint16x8_t pl = vaddq_u16(pl0, pl1);
-  const uint16x8_t pal = vaddq_u16(pa, pl);
-  uint16x4_t sum = vadd_u16(vget_low_u16(pal), vget_high_u16(pal));
-  uint8x8_t dc;
-  sum = vpadd_u16(sum, sum);
-  sum = vpadd_u16(sum, sum);
-  dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 6));
+  const uint8x16_t a0 = vld1q_u8(above + 0);
+  const uint8x16_t a1 = vld1q_u8(above + 16);
+  const uint8x16_t l0 = vld1q_u8(left + 0);
+  const uint8x16_t l1 = vld1q_u8(left + 16);
+  const uint16x8_t a01 = vaddq_u16(vpaddlq_u8(a0), vpaddlq_u8(a1));
+  const uint16x8_t l01 = vaddq_u16(vpaddlq_u8(l0), vpaddlq_u8(l1));
+  const uint16x8_t al = vaddq_u16(a01, l01);
+  const uint16_t sum = horizontal_add_uint16x8(al);
+  const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 6), 0);
   dc_store_32x32(dst, stride, dc);
 }
 
 void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
-  const uint16x4_t sum = dc_sum_32(left);
-  const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5));
+  const uint16_t sum = dc_sum_32(left);
+  const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 5), 0);
   (void)above;
   dc_store_32x32(dst, stride, dc);
 }
@@ -243,8 +213,8 @@ void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
 void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
-  const uint16x4_t sum = dc_sum_32(above);
-  const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5));
+  const uint16_t sum = dc_sum_32(above);
+  const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 5), 0);
   (void)left;
   dc_store_32x32(dst, stride, dc);
 }
@@ -252,7 +222,7 @@ void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
 void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
-  const uint8x8_t dc = vdup_n_u8(0x80);
+  const uint8x16_t dc = vdupq_n_u8(0x80);
   (void)above;
   (void)left;
   dc_store_32x32(dst, stride, dc);
@@ -262,123 +232,629 @@ void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
 
 void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
-  const uint8x8_t ABCDEFGH = vld1_u8(above);
-  const uint64x1_t A1 = vshr_n_u64(vreinterpret_u64_u8(ABCDEFGH), 8);
-  const uint64x1_t A2 = vshr_n_u64(vreinterpret_u64_u8(ABCDEFGH), 16);
-  const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
-  const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
-  const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00);
-  const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
-  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
-  const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
-  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
-  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
-  const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
+  uint8x8_t a0, a1, a2, d0;
+  uint8_t a7;
   (void)left;
-  vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
-  vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
-  vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
-  vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
-  vst1_lane_u8(dst + 3 * stride + 3, ABCDEFGH, 7);
-}
 
-static INLINE void d45_store_8(uint8_t **dst, const ptrdiff_t stride,
-                               const uint8x8_t above_right, uint8x8_t *row) {
-  *row = vext_u8(*row, above_right, 1);
-  vst1_u8(*dst, *row);
-  *dst += stride;
+  a0 = vld1_u8(above);
+  a7 = above[7];
+
+  // [ above[1], ..., above[6], x, x ]
+  a1 = vext_u8(a0, a0, 1);
+  // [ above[2], ..., above[7], x, x ]
+  a2 = vext_u8(a0, a0, 2);
+
+  // d0[0] = AVG3(above[0], above[1], above[2]);
+  // ...
+  // d0[5] = AVG3(above[5], above[6], above[7]);
+  // d0[6] = x (don't care)
+  // d0[7] = x (don't care)
+  d0 = vrhadd_u8(vhadd_u8(a0, a2), a1);
+
+  // We want:
+  // stride=0 [ d0[0], d0[1], d0[2],    d0[3] ]
+  // stride=1 [ d0[1], d0[2], d0[3],    d0[4] ]
+  // stride=2 [ d0[2], d0[3], d0[4],    d0[5] ]
+  // stride=2 [ d0[3], d0[4], d0[5], above[7] ]
+  store_u8_4x1(dst + 0 * stride, d0);
+  store_u8_4x1(dst + 1 * stride, vext_u8(d0, d0, 1));
+  store_u8_4x1(dst + 2 * stride, vext_u8(d0, d0, 2));
+  store_u8_4x1(dst + 3 * stride, vext_u8(d0, d0, 3));
+
+  // We stored d0[6] above, so fixup into above[7].
+  dst[3 * stride + 3] = a7;
 }
 
 void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
-  const uint8x8_t A0 = vld1_u8(above);
-  const uint8x8_t above_right = vdup_lane_u8(A0, 7);
-  const uint8x8_t A1 = vext_u8(A0, above_right, 1);
-  const uint8x8_t A2 = vext_u8(A0, above_right, 2);
-  const uint8x8_t avg1 = vhadd_u8(A0, A2);
-  uint8x8_t row = vrhadd_u8(avg1, A1);
+  uint8x8_t ax0, a0, a1, a7, d0;
   (void)left;
 
-  vst1_u8(dst, row);
-  dst += stride;
-  d45_store_8(&dst, stride, above_right, &row);
-  d45_store_8(&dst, stride, above_right, &row);
-  d45_store_8(&dst, stride, above_right, &row);
-  d45_store_8(&dst, stride, above_right, &row);
-  d45_store_8(&dst, stride, above_right, &row);
-  d45_store_8(&dst, stride, above_right, &row);
-  vst1_u8(dst, above_right);
-}
-
-static INLINE void d45_store_16(uint8_t **dst, const ptrdiff_t stride,
-                                const uint8x16_t above_right, uint8x16_t *row) {
-  *row = vextq_u8(*row, above_right, 1);
-  vst1q_u8(*dst, *row);
-  *dst += stride;
+  a0 = vld1_u8(above + 0);
+  a1 = vld1_u8(above + 1);
+  a7 = vld1_dup_u8(above + 7);
+
+  // We want to calculate the AVG3 result in lanes 1-7 inclusive so we can
+  // shift in above[7] later, so shift a0 across by one to get the right
+  // inputs:
+  // [ x, above[0], ... , above[6] ]
+  ax0 = vext_u8(a0, a0, 7);
+
+  // d0[0] = x (don't care)
+  // d0[1] = AVG3(above[0], above[1], above[2]);
+  // ...
+  // d0[7] = AVG3(above[6], above[7], above[8]);
+  d0 = vrhadd_u8(vhadd_u8(ax0, a1), a0);
+
+  // Undo the earlier ext, incrementally shift in duplicates of above[7].
+  vst1_u8(dst + 0 * stride, vext_u8(d0, a7, 1));
+  vst1_u8(dst + 1 * stride, vext_u8(d0, a7, 2));
+  vst1_u8(dst + 2 * stride, vext_u8(d0, a7, 3));
+  vst1_u8(dst + 3 * stride, vext_u8(d0, a7, 4));
+  vst1_u8(dst + 4 * stride, vext_u8(d0, a7, 5));
+  vst1_u8(dst + 5 * stride, vext_u8(d0, a7, 6));
+  vst1_u8(dst + 6 * stride, vext_u8(d0, a7, 7));
+  vst1_u8(dst + 7 * stride, a7);
 }
 
 void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
-  const uint8x16_t A0 = vld1q_u8(above);
-  const uint8x16_t above_right = vdupq_lane_u8(vget_high_u8(A0), 7);
-  const uint8x16_t A1 = vextq_u8(A0, above_right, 1);
-  const uint8x16_t A2 = vextq_u8(A0, above_right, 2);
-  const uint8x16_t avg1 = vhaddq_u8(A0, A2);
-  uint8x16_t row = vrhaddq_u8(avg1, A1);
+  uint8x16_t ax0, a0, a1, a15, d0;
   (void)left;
 
-  vst1q_u8(dst, row);
-  dst += stride;
-  d45_store_16(&dst, stride, above_right, &row);
-  d45_store_16(&dst, stride, above_right, &row);
-  d45_store_16(&dst, stride, above_right, &row);
-  d45_store_16(&dst, stride, above_right, &row);
-  d45_store_16(&dst, stride, above_right, &row);
-  d45_store_16(&dst, stride, above_right, &row);
-  d45_store_16(&dst, stride, above_right, &row);
-  d45_store_16(&dst, stride, above_right, &row);
-  d45_store_16(&dst, stride, above_right, &row);
-  d45_store_16(&dst, stride, above_right, &row);
-  d45_store_16(&dst, stride, above_right, &row);
-  d45_store_16(&dst, stride, above_right, &row);
-  d45_store_16(&dst, stride, above_right, &row);
-  d45_store_16(&dst, stride, above_right, &row);
-  vst1q_u8(dst, above_right);
+  a0 = vld1q_u8(above + 0);
+  a1 = vld1q_u8(above + 1);
+  a15 = vld1q_dup_u8(above + 15);
+
+  // We want to calculate the AVG3 result in lanes 1-15 inclusive so we can
+  // shift in above[15] later, so shift a0 across by one to get the right
+  // inputs:
+  // [ x, above[0], ... , above[14] ]
+  ax0 = vextq_u8(a0, a0, 15);
+
+  // d0[0] = x (don't care)
+  // d0[1] = AVG3(above[0], above[1], above[2]);
+  // ...
+  // d0[15] = AVG3(above[14], above[15], above[16]);
+  d0 = vrhaddq_u8(vhaddq_u8(ax0, a1), a0);
+
+  // Undo the earlier ext, incrementally shift in duplicates of above[15].
+  vst1q_u8(dst + 0 * stride, vextq_u8(d0, a15, 1));
+  vst1q_u8(dst + 1 * stride, vextq_u8(d0, a15, 2));
+  vst1q_u8(dst + 2 * stride, vextq_u8(d0, a15, 3));
+  vst1q_u8(dst + 3 * stride, vextq_u8(d0, a15, 4));
+  vst1q_u8(dst + 4 * stride, vextq_u8(d0, a15, 5));
+  vst1q_u8(dst + 5 * stride, vextq_u8(d0, a15, 6));
+  vst1q_u8(dst + 6 * stride, vextq_u8(d0, a15, 7));
+  vst1q_u8(dst + 7 * stride, vextq_u8(d0, a15, 8));
+  vst1q_u8(dst + 8 * stride, vextq_u8(d0, a15, 9));
+  vst1q_u8(dst + 9 * stride, vextq_u8(d0, a15, 10));
+  vst1q_u8(dst + 10 * stride, vextq_u8(d0, a15, 11));
+  vst1q_u8(dst + 11 * stride, vextq_u8(d0, a15, 12));
+  vst1q_u8(dst + 12 * stride, vextq_u8(d0, a15, 13));
+  vst1q_u8(dst + 13 * stride, vextq_u8(d0, a15, 14));
+  vst1q_u8(dst + 14 * stride, vextq_u8(d0, a15, 15));
+  vst1q_u8(dst + 15 * stride, a15);
 }
 
 void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
-  const uint8x16_t A0_0 = vld1q_u8(above);
-  const uint8x16_t A0_1 = vld1q_u8(above + 16);
-  const uint8x16_t above_right = vdupq_lane_u8(vget_high_u8(A0_1), 7);
-  const uint8x16_t A1_0 = vld1q_u8(above + 1);
-  const uint8x16_t A1_1 = vld1q_u8(above + 17);
-  const uint8x16_t A2_0 = vld1q_u8(above + 2);
-  const uint8x16_t A2_1 = vld1q_u8(above + 18);
-  const uint8x16_t avg_0 = vhaddq_u8(A0_0, A2_0);
-  const uint8x16_t avg_1 = vhaddq_u8(A0_1, A2_1);
-  uint8x16_t row_0 = vrhaddq_u8(avg_0, A1_0);
-  uint8x16_t row_1 = vrhaddq_u8(avg_1, A1_1);
-  int i;
+  uint8x16_t ax0, a0, a1, a15, a16, a17, a31, d0[2];
   (void)left;
 
-  vst1q_u8(dst, row_0);
-  dst += 16;
-  vst1q_u8(dst, row_1);
-  dst += stride - 16;
+  a0 = vld1q_u8(above + 0);
+  a1 = vld1q_u8(above + 1);
+  a15 = vld1q_u8(above + 15);
+  a16 = vld1q_u8(above + 16);
+  a17 = vld1q_u8(above + 17);
+  a31 = vld1q_dup_u8(above + 31);
+
+  // We want to calculate the AVG3 result in lanes 1-15 inclusive so we can
+  // shift in above[15] later, so shift a0 across by one to get the right
+  // inputs:
+  // [ x, above[0], ... , above[14] ]
+  ax0 = vextq_u8(a0, a0, 15);
+
+  // d0[0] = x (don't care)
+  // d0[1] = AVG3(above[0], above[1], above[2]);
+  // ...
+  // d0[15] = AVG3(above[14], above[15], above[16]);
+  d0[0] = vrhaddq_u8(vhaddq_u8(ax0, a1), a0);
+  d0[1] = vrhaddq_u8(vhaddq_u8(a15, a17), a16);
+
+  // Undo the earlier ext, incrementally shift in duplicates of above[15].
+  vst1q_u8(dst + 0 * stride + 0, vextq_u8(d0[0], d0[1], 1));
+  vst1q_u8(dst + 0 * stride + 16, vextq_u8(d0[1], a31, 1));
+  vst1q_u8(dst + 1 * stride + 0, vextq_u8(d0[0], d0[1], 2));
+  vst1q_u8(dst + 1 * stride + 16, vextq_u8(d0[1], a31, 2));
+  vst1q_u8(dst + 2 * stride + 0, vextq_u8(d0[0], d0[1], 3));
+  vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0[1], a31, 3));
+  vst1q_u8(dst + 3 * stride + 0, vextq_u8(d0[0], d0[1], 4));
+  vst1q_u8(dst + 3 * stride + 16, vextq_u8(d0[1], a31, 4));
+  vst1q_u8(dst + 4 * stride + 0, vextq_u8(d0[0], d0[1], 5));
+  vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0[1], a31, 5));
+  vst1q_u8(dst + 5 * stride + 0, vextq_u8(d0[0], d0[1], 6));
+  vst1q_u8(dst + 5 * stride + 16, vextq_u8(d0[1], a31, 6));
+  vst1q_u8(dst + 6 * stride + 0, vextq_u8(d0[0], d0[1], 7));
+  vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0[1], a31, 7));
+  vst1q_u8(dst + 7 * stride + 0, vextq_u8(d0[0], d0[1], 8));
+  vst1q_u8(dst + 7 * stride + 16, vextq_u8(d0[1], a31, 8));
+  vst1q_u8(dst + 8 * stride + 0, vextq_u8(d0[0], d0[1], 9));
+  vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0[1], a31, 9));
+  vst1q_u8(dst + 9 * stride + 0, vextq_u8(d0[0], d0[1], 10));
+  vst1q_u8(dst + 9 * stride + 16, vextq_u8(d0[1], a31, 10));
+  vst1q_u8(dst + 10 * stride + 0, vextq_u8(d0[0], d0[1], 11));
+  vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0[1], a31, 11));
+  vst1q_u8(dst + 11 * stride + 0, vextq_u8(d0[0], d0[1], 12));
+  vst1q_u8(dst + 11 * stride + 16, vextq_u8(d0[1], a31, 12));
+  vst1q_u8(dst + 12 * stride + 0, vextq_u8(d0[0], d0[1], 13));
+  vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0[1], a31, 13));
+  vst1q_u8(dst + 13 * stride + 0, vextq_u8(d0[0], d0[1], 14));
+  vst1q_u8(dst + 13 * stride + 16, vextq_u8(d0[1], a31, 14));
+  vst1q_u8(dst + 14 * stride + 0, vextq_u8(d0[0], d0[1], 15));
+  vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0[1], a31, 15));
+  vst1q_u8(dst + 15 * stride + 0, d0[1]);
+  vst1q_u8(dst + 15 * stride + 16, a31);
+
+  vst1q_u8(dst + 16 * stride + 0, vextq_u8(d0[1], a31, 1));
+  vst1q_u8(dst + 16 * stride + 16, a31);
+  vst1q_u8(dst + 17 * stride + 0, vextq_u8(d0[1], a31, 2));
+  vst1q_u8(dst + 17 * stride + 16, a31);
+  vst1q_u8(dst + 18 * stride + 0, vextq_u8(d0[1], a31, 3));
+  vst1q_u8(dst + 18 * stride + 16, a31);
+  vst1q_u8(dst + 19 * stride + 0, vextq_u8(d0[1], a31, 4));
+  vst1q_u8(dst + 19 * stride + 16, a31);
+  vst1q_u8(dst + 20 * stride + 0, vextq_u8(d0[1], a31, 5));
+  vst1q_u8(dst + 20 * stride + 16, a31);
+  vst1q_u8(dst + 21 * stride + 0, vextq_u8(d0[1], a31, 6));
+  vst1q_u8(dst + 21 * stride + 16, a31);
+  vst1q_u8(dst + 22 * stride + 0, vextq_u8(d0[1], a31, 7));
+  vst1q_u8(dst + 22 * stride + 16, a31);
+  vst1q_u8(dst + 23 * stride + 0, vextq_u8(d0[1], a31, 8));
+  vst1q_u8(dst + 23 * stride + 16, a31);
+  vst1q_u8(dst + 24 * stride + 0, vextq_u8(d0[1], a31, 9));
+  vst1q_u8(dst + 24 * stride + 16, a31);
+  vst1q_u8(dst + 25 * stride + 0, vextq_u8(d0[1], a31, 10));
+  vst1q_u8(dst + 25 * stride + 16, a31);
+  vst1q_u8(dst + 26 * stride + 0, vextq_u8(d0[1], a31, 11));
+  vst1q_u8(dst + 26 * stride + 16, a31);
+  vst1q_u8(dst + 27 * stride + 0, vextq_u8(d0[1], a31, 12));
+  vst1q_u8(dst + 27 * stride + 16, a31);
+  vst1q_u8(dst + 28 * stride + 0, vextq_u8(d0[1], a31, 13));
+  vst1q_u8(dst + 28 * stride + 16, a31);
+  vst1q_u8(dst + 29 * stride + 0, vextq_u8(d0[1], a31, 14));
+  vst1q_u8(dst + 29 * stride + 16, a31);
+  vst1q_u8(dst + 30 * stride + 0, vextq_u8(d0[1], a31, 15));
+  vst1q_u8(dst + 30 * stride + 16, a31);
+  vst1q_u8(dst + 31 * stride + 0, a31);
+  vst1q_u8(dst + 31 * stride + 16, a31);
+}
 
-  for (i = 0; i < 30; ++i) {
-    row_0 = vextq_u8(row_0, row_1, 1);
-    row_1 = vextq_u8(row_1, above_right, 1);
-    vst1q_u8(dst, row_0);
-    dst += 16;
-    vst1q_u8(dst, row_1);
-    dst += stride - 16;
-  }
+// -----------------------------------------------------------------------------
+
+void vpx_d63_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  uint8x8_t a0, a1, a2, a3, d0, d1, d2, d3;
+  (void)left;
+
+  a0 = load_unaligned_u8_4x1(above + 0);
+  a1 = load_unaligned_u8_4x1(above + 1);
+  a2 = load_unaligned_u8_4x1(above + 2);
+  a3 = load_unaligned_u8_4x1(above + 3);
+
+  d0 = vrhadd_u8(a0, a1);
+  d1 = vrhadd_u8(vhadd_u8(a0, a2), a1);
+  d2 = vrhadd_u8(a1, a2);
+  d3 = vrhadd_u8(vhadd_u8(a1, a3), a2);
+
+  store_u8_4x1(dst + 0 * stride, d0);
+  store_u8_4x1(dst + 1 * stride, d1);
+  store_u8_4x1(dst + 2 * stride, d2);
+  store_u8_4x1(dst + 3 * stride, d3);
+}
+
+void vpx_d63_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  uint8x8_t a0, a1, a2, a7, d0, d1;
+  (void)left;
+
+  a0 = vld1_u8(above + 0);
+  a1 = vld1_u8(above + 1);
+  a2 = vld1_u8(above + 2);
+  a7 = vld1_dup_u8(above + 7);
+
+  d0 = vrhadd_u8(a0, a1);
+  d1 = vrhadd_u8(vhadd_u8(a0, a2), a1);
+
+  vst1_u8(dst + 0 * stride, d0);
+  vst1_u8(dst + 1 * stride, d1);
+
+  d0 = vext_u8(d0, d0, 7);
+  d1 = vext_u8(d1, d1, 7);
 
-  vst1q_u8(dst, above_right);
-  dst += 16;
-  vst1q_u8(dst, row_1);
+  vst1_u8(dst + 2 * stride, vext_u8(d0, a7, 2));
+  vst1_u8(dst + 3 * stride, vext_u8(d1, a7, 2));
+  vst1_u8(dst + 4 * stride, vext_u8(d0, a7, 3));
+  vst1_u8(dst + 5 * stride, vext_u8(d1, a7, 3));
+  vst1_u8(dst + 6 * stride, vext_u8(d0, a7, 4));
+  vst1_u8(dst + 7 * stride, vext_u8(d1, a7, 4));
+}
+
+void vpx_d63_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                  const uint8_t *above, const uint8_t *left) {
+  uint8x16_t a0, a1, a2, a15, d0, d1;
+  (void)left;
+
+  a0 = vld1q_u8(above + 0);
+  a1 = vld1q_u8(above + 1);
+  a2 = vld1q_u8(above + 2);
+  a15 = vld1q_dup_u8(above + 15);
+
+  d0 = vrhaddq_u8(a0, a1);
+  d1 = vrhaddq_u8(vhaddq_u8(a0, a2), a1);
+
+  vst1q_u8(dst + 0 * stride, d0);
+  vst1q_u8(dst + 1 * stride, d1);
+
+  d0 = vextq_u8(d0, d0, 15);
+  d1 = vextq_u8(d1, d1, 15);
+
+  vst1q_u8(dst + 2 * stride, vextq_u8(d0, a15, 2));
+  vst1q_u8(dst + 3 * stride, vextq_u8(d1, a15, 2));
+  vst1q_u8(dst + 4 * stride, vextq_u8(d0, a15, 3));
+  vst1q_u8(dst + 5 * stride, vextq_u8(d1, a15, 3));
+  vst1q_u8(dst + 6 * stride, vextq_u8(d0, a15, 4));
+  vst1q_u8(dst + 7 * stride, vextq_u8(d1, a15, 4));
+  vst1q_u8(dst + 8 * stride, vextq_u8(d0, a15, 5));
+  vst1q_u8(dst + 9 * stride, vextq_u8(d1, a15, 5));
+  vst1q_u8(dst + 10 * stride, vextq_u8(d0, a15, 6));
+  vst1q_u8(dst + 11 * stride, vextq_u8(d1, a15, 6));
+  vst1q_u8(dst + 12 * stride, vextq_u8(d0, a15, 7));
+  vst1q_u8(dst + 13 * stride, vextq_u8(d1, a15, 7));
+  vst1q_u8(dst + 14 * stride, vextq_u8(d0, a15, 8));
+  vst1q_u8(dst + 15 * stride, vextq_u8(d1, a15, 8));
+}
+
+void vpx_d63_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                  const uint8_t *above, const uint8_t *left) {
+  uint8x16_t a0, a1, a2, a16, a17, a18, a31, d0_lo, d0_hi, d1_lo, d1_hi;
+  (void)left;
+
+  a0 = vld1q_u8(above + 0);
+  a1 = vld1q_u8(above + 1);
+  a2 = vld1q_u8(above + 2);
+  a16 = vld1q_u8(above + 16);
+  a17 = vld1q_u8(above + 17);
+  a18 = vld1q_u8(above + 18);
+  a31 = vld1q_dup_u8(above + 31);
+
+  d0_lo = vrhaddq_u8(a0, a1);
+  d0_hi = vrhaddq_u8(a16, a17);
+  d1_lo = vrhaddq_u8(vhaddq_u8(a0, a2), a1);
+  d1_hi = vrhaddq_u8(vhaddq_u8(a16, a18), a17);
+
+  vst1q_u8(dst + 0 * stride + 0, d0_lo);
+  vst1q_u8(dst + 0 * stride + 16, d0_hi);
+  vst1q_u8(dst + 1 * stride + 0, d1_lo);
+  vst1q_u8(dst + 1 * stride + 16, d1_hi);
+
+  d0_hi = vextq_u8(d0_lo, d0_hi, 15);
+  d0_lo = vextq_u8(d0_lo, d0_lo, 15);
+  d1_hi = vextq_u8(d1_lo, d1_hi, 15);
+  d1_lo = vextq_u8(d1_lo, d1_lo, 15);
+
+  vst1q_u8(dst + 2 * stride + 0, vextq_u8(d0_lo, d0_hi, 2));
+  vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0_hi, a31, 2));
+  vst1q_u8(dst + 3 * stride + 0, vextq_u8(d1_lo, d1_hi, 2));
+  vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_hi, a31, 2));
+  vst1q_u8(dst + 4 * stride + 0, vextq_u8(d0_lo, d0_hi, 3));
+  vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0_hi, a31, 3));
+  vst1q_u8(dst + 5 * stride + 0, vextq_u8(d1_lo, d1_hi, 3));
+  vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_hi, a31, 3));
+  vst1q_u8(dst + 6 * stride + 0, vextq_u8(d0_lo, d0_hi, 4));
+  vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0_hi, a31, 4));
+  vst1q_u8(dst + 7 * stride + 0, vextq_u8(d1_lo, d1_hi, 4));
+  vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_hi, a31, 4));
+  vst1q_u8(dst + 8 * stride + 0, vextq_u8(d0_lo, d0_hi, 5));
+  vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0_hi, a31, 5));
+  vst1q_u8(dst + 9 * stride + 0, vextq_u8(d1_lo, d1_hi, 5));
+  vst1q_u8(dst + 9 * stride + 16, vextq_u8(d1_hi, a31, 5));
+  vst1q_u8(dst + 10 * stride + 0, vextq_u8(d0_lo, d0_hi, 6));
+  vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0_hi, a31, 6));
+  vst1q_u8(dst + 11 * stride + 0, vextq_u8(d1_lo, d1_hi, 6));
+  vst1q_u8(dst + 11 * stride + 16, vextq_u8(d1_hi, a31, 6));
+  vst1q_u8(dst + 12 * stride + 0, vextq_u8(d0_lo, d0_hi, 7));
+  vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0_hi, a31, 7));
+  vst1q_u8(dst + 13 * stride + 0, vextq_u8(d1_lo, d1_hi, 7));
+  vst1q_u8(dst + 13 * stride + 16, vextq_u8(d1_hi, a31, 7));
+  vst1q_u8(dst + 14 * stride + 0, vextq_u8(d0_lo, d0_hi, 8));
+  vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0_hi, a31, 8));
+  vst1q_u8(dst + 15 * stride + 0, vextq_u8(d1_lo, d1_hi, 8));
+  vst1q_u8(dst + 15 * stride + 16, vextq_u8(d1_hi, a31, 8));
+  vst1q_u8(dst + 16 * stride + 0, vextq_u8(d0_lo, d0_hi, 9));
+  vst1q_u8(dst + 16 * stride + 16, vextq_u8(d0_hi, a31, 9));
+  vst1q_u8(dst + 17 * stride + 0, vextq_u8(d1_lo, d1_hi, 9));
+  vst1q_u8(dst + 17 * stride + 16, vextq_u8(d1_hi, a31, 9));
+  vst1q_u8(dst + 18 * stride + 0, vextq_u8(d0_lo, d0_hi, 10));
+  vst1q_u8(dst + 18 * stride + 16, vextq_u8(d0_hi, a31, 10));
+  vst1q_u8(dst + 19 * stride + 0, vextq_u8(d1_lo, d1_hi, 10));
+  vst1q_u8(dst + 19 * stride + 16, vextq_u8(d1_hi, a31, 10));
+  vst1q_u8(dst + 20 * stride + 0, vextq_u8(d0_lo, d0_hi, 11));
+  vst1q_u8(dst + 20 * stride + 16, vextq_u8(d0_hi, a31, 11));
+  vst1q_u8(dst + 21 * stride + 0, vextq_u8(d1_lo, d1_hi, 11));
+  vst1q_u8(dst + 21 * stride + 16, vextq_u8(d1_hi, a31, 11));
+  vst1q_u8(dst + 22 * stride + 0, vextq_u8(d0_lo, d0_hi, 12));
+  vst1q_u8(dst + 22 * stride + 16, vextq_u8(d0_hi, a31, 12));
+  vst1q_u8(dst + 23 * stride + 0, vextq_u8(d1_lo, d1_hi, 12));
+  vst1q_u8(dst + 23 * stride + 16, vextq_u8(d1_hi, a31, 12));
+  vst1q_u8(dst + 24 * stride + 0, vextq_u8(d0_lo, d0_hi, 13));
+  vst1q_u8(dst + 24 * stride + 16, vextq_u8(d0_hi, a31, 13));
+  vst1q_u8(dst + 25 * stride + 0, vextq_u8(d1_lo, d1_hi, 13));
+  vst1q_u8(dst + 25 * stride + 16, vextq_u8(d1_hi, a31, 13));
+  vst1q_u8(dst + 26 * stride + 0, vextq_u8(d0_lo, d0_hi, 14));
+  vst1q_u8(dst + 26 * stride + 16, vextq_u8(d0_hi, a31, 14));
+  vst1q_u8(dst + 27 * stride + 0, vextq_u8(d1_lo, d1_hi, 14));
+  vst1q_u8(dst + 27 * stride + 16, vextq_u8(d1_hi, a31, 14));
+  vst1q_u8(dst + 28 * stride + 0, vextq_u8(d0_lo, d0_hi, 15));
+  vst1q_u8(dst + 28 * stride + 16, vextq_u8(d0_hi, a31, 15));
+  vst1q_u8(dst + 29 * stride + 0, vextq_u8(d1_lo, d1_hi, 15));
+  vst1q_u8(dst + 29 * stride + 16, vextq_u8(d1_hi, a31, 15));
+  vst1q_u8(dst + 30 * stride + 0, d0_hi);
+  vst1q_u8(dst + 30 * stride + 16, a31);
+  vst1q_u8(dst + 31 * stride + 0, d1_hi);
+  vst1q_u8(dst + 31 * stride + 16, a31);
+}
+
+// -----------------------------------------------------------------------------
+
+void vpx_d117_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  // See vpx_d117_predictor_8x8_neon for more details on the implementation.
+  uint8x8_t az, a0, l0az, d0, d1, d2, d3, col0, col1;
+
+  az = load_unaligned_u8_4x1(above - 1);
+  a0 = load_unaligned_u8_4x1(above + 0);
+  // [ left[0], above[-1], above[0], above[1], x, x, x, x ]
+  l0az = vext_u8(vld1_dup_u8(left), az, 7);
+
+  col0 = vdup_n_u8((above[-1] + 2 * left[0] + left[1] + 2) >> 2);
+  col1 = vdup_n_u8((left[0] + 2 * left[1] + left[2] + 2) >> 2);
+
+  d0 = vrhadd_u8(az, a0);
+  d1 = vrhadd_u8(vhadd_u8(l0az, a0), az);
+  d2 = vext_u8(col0, d0, 7);
+  d3 = vext_u8(col1, d1, 7);
+
+  store_u8_4x1(dst + 0 * stride, d0);
+  store_u8_4x1(dst + 1 * stride, d1);
+  store_u8_4x1(dst + 2 * stride, d2);
+  store_u8_4x1(dst + 3 * stride, d3);
+}
+
+void vpx_d117_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  uint8x8_t az, a0, l0az, d0, d1, l0, l1, azl0, col0, col0_even, col0_odd;
+
+  az = vld1_u8(above - 1);
+  a0 = vld1_u8(above + 0);
+  // [ left[0], above[-1], ... , above[5] ]
+  l0az = vext_u8(vld1_dup_u8(left), az, 7);
+
+  l0 = vld1_u8(left + 0);
+  // The last lane here is unused, reading left[8] could cause a buffer
+  // over-read, so just fill with a duplicate of left[0] to avoid needing to
+  // materialize a zero:
+  // [ left[1], ... , left[7], x ]
+  l1 = vext_u8(l0, l0, 1);
+  // [ above[-1], left[0], ... , left[6] ]
+  azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7);
+
+  // d0[0] = AVG2(above[-1], above[0])
+  // d0[1] = AVG2(above[0], above[1])
+  // ...
+  // d0[7] = AVG2(above[6], above[7])
+  d0 = vrhadd_u8(az, a0);
+
+  // d1[0] = AVG3(left[0], above[-1], above[0])
+  // d1[1] = AVG3(above[-1], above[0], above[1])
+  // ...
+  // d1[7] = AVG3(above[5], above[6], above[7])
+  d1 = vrhadd_u8(vhadd_u8(l0az, a0), az);
+
+  // The ext instruction shifts elements in from the end of the vector rather
+  // than the start, so reverse the vector to put the elements to be shifted in
+  // at the end. The lowest two lanes here are unused:
+  // col0[7] = AVG3(above[-1], left[0], left[1])
+  // col0[6] = AVG3(left[0], left[1], left[2])
+  // ...
+  // col0[2] = AVG3(left[4], left[5], left[6])
+  // col0[1] = x (don't care)
+  // col0[0] = x (don't care)
+  col0 = vrev64_u8(vrhadd_u8(vhadd_u8(azl0, l1), l0));
+
+  // We don't care about the first parameter to this uzp since we only ever use
+  // the high three elements, we just use col0 again since it is already
+  // available:
+  // col0_even = [ x, x, x, x, x, col0[3], col0[5], col0[7] ]
+  // col0_odd = [ x, x, x, x, x, col0[2], col0[4], col0[6] ]
+  col0_even = vuzp_u8(col0, col0).val[1];
+  col0_odd = vuzp_u8(col0, col0).val[0];
+
+  // Incrementally shift more elements from col0 into d0/1:
+  // stride=0 [ d0[0],   d0[1],   d0[2],   d0[3], d0[4], d0[5], d0[6], d0[7] ]
+  // stride=1 [ d1[0],   d1[1],   d1[2],   d1[3], d1[4], d1[5], d1[6], d1[7] ]
+  // stride=2 [ col0[7], d0[0],   d0[1],   d0[2], d0[3], d0[4], d0[5], d0[6] ]
+  // stride=3 [ col0[6], d1[0],   d1[1],   d1[2], d1[3], d1[4], d1[5], d1[6] ]
+  // stride=4 [ col0[5], col0[7], d0[0],   d0[1], d0[2], d0[3], d0[4], d0[5] ]
+  // stride=5 [ col0[4], col0[6], d1[0],   d1[1], d1[2], d1[3], d1[4], d1[5] ]
+  // stride=6 [ col0[3], col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4] ]
+  // stride=7 [ col0[2], col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4] ]
+  vst1_u8(dst + 0 * stride, d0);
+  vst1_u8(dst + 1 * stride, d1);
+  vst1_u8(dst + 2 * stride, vext_u8(col0_even, d0, 7));
+  vst1_u8(dst + 3 * stride, vext_u8(col0_odd, d1, 7));
+  vst1_u8(dst + 4 * stride, vext_u8(col0_even, d0, 6));
+  vst1_u8(dst + 5 * stride, vext_u8(col0_odd, d1, 6));
+  vst1_u8(dst + 6 * stride, vext_u8(col0_even, d0, 5));
+  vst1_u8(dst + 7 * stride, vext_u8(col0_odd, d1, 5));
+}
+
+void vpx_d117_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  // See vpx_d117_predictor_8x8_neon for more details on the implementation.
+  uint8x16_t az, a0, l0az, d0, d1, l0, l1, azl0, col0, col0_even, col0_odd;
+
+  az = vld1q_u8(above - 1);
+  a0 = vld1q_u8(above + 0);
+  // [ left[0], above[-1], ... , above[13] ]
+  l0az = vextq_u8(vld1q_dup_u8(left), az, 15);
+
+  l0 = vld1q_u8(left + 0);
+  // The last lane here is unused, reading left[16] could cause a buffer
+  // over-read, so just fill with a duplicate of left[0] to avoid needing to
+  // materialize a zero:
+  // [ left[1], ... , left[15], x ]
+  l1 = vextq_u8(l0, l0, 1);
+  // [ above[-1], left[0], ... , left[14] ]
+  azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15);
+
+  d0 = vrhaddq_u8(az, a0);
+  d1 = vrhaddq_u8(vhaddq_u8(l0az, a0), az);
+
+  col0 = vrhaddq_u8(vhaddq_u8(azl0, l1), l0);
+  col0 = vrev64q_u8(vextq_u8(col0, col0, 8));
+
+  // The low nine lanes here are unused so the first input to the uzp is
+  // unused, so just use a duplicate of col0 since we have it already. This
+  // also means that the lowest lane of col0 here is unused.
+  col0_even = vuzpq_u8(col0, col0).val[1];
+  col0_odd = vuzpq_u8(col0, col0).val[0];
+
+  vst1q_u8(dst + 0 * stride, d0);
+  vst1q_u8(dst + 1 * stride, d1);
+  vst1q_u8(dst + 2 * stride, vextq_u8(col0_even, d0, 15));
+  vst1q_u8(dst + 3 * stride, vextq_u8(col0_odd, d1, 15));
+  vst1q_u8(dst + 4 * stride, vextq_u8(col0_even, d0, 14));
+  vst1q_u8(dst + 5 * stride, vextq_u8(col0_odd, d1, 14));
+  vst1q_u8(dst + 6 * stride, vextq_u8(col0_even, d0, 13));
+  vst1q_u8(dst + 7 * stride, vextq_u8(col0_odd, d1, 13));
+  vst1q_u8(dst + 8 * stride, vextq_u8(col0_even, d0, 12));
+  vst1q_u8(dst + 9 * stride, vextq_u8(col0_odd, d1, 12));
+  vst1q_u8(dst + 10 * stride, vextq_u8(col0_even, d0, 11));
+  vst1q_u8(dst + 11 * stride, vextq_u8(col0_odd, d1, 11));
+  vst1q_u8(dst + 12 * stride, vextq_u8(col0_even, d0, 10));
+  vst1q_u8(dst + 13 * stride, vextq_u8(col0_odd, d1, 10));
+  vst1q_u8(dst + 14 * stride, vextq_u8(col0_even, d0, 9));
+  vst1q_u8(dst + 15 * stride, vextq_u8(col0_odd, d1, 9));
+}
+
+void vpx_d117_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  // See vpx_d117_predictor_8x8_neon for more details on the implementation.
+  uint8x16_t az, a0, a14, a15, a16, l0az, d0_lo, d0_hi, d1_lo, d1_hi, l0, l1,
+      l15, l16, l17, azl0, col0_lo, col0_hi, col0_even, col0_odd;
+
+  az = vld1q_u8(above - 1);
+  a0 = vld1q_u8(above + 0);
+  a14 = vld1q_u8(above + 14);
+  a15 = vld1q_u8(above + 15);
+  a16 = vld1q_u8(above + 16);
+  // [ left[0], above[-1], ... , above[13] ]
+  l0az = vextq_u8(vld1q_dup_u8(left), az, 15);
+
+  l0 = vld1q_u8(left + 0);
+  l1 = vld1q_u8(left + 1);
+  l15 = vld1q_u8(left + 15);
+  l16 = vld1q_u8(left + 16);
+  // The last lane here is unused, reading left[32] would cause a buffer
+  // over-read (observed as an address-sanitizer failure), so just fill with a
+  // duplicate of left[16] to avoid needing to materialize a zero:
+  // [ left[17], ... , left[31], x ]
+  l17 = vextq_u8(l16, l16, 1);
+  // [ above[-1], left[0], ... , left[14] ]
+  azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15);
+
+  d0_lo = vrhaddq_u8(az, a0);
+  d0_hi = vrhaddq_u8(a15, a16);
+  d1_lo = vrhaddq_u8(vhaddq_u8(l0az, a0), az);
+  d1_hi = vrhaddq_u8(vhaddq_u8(a14, a16), a15);
+
+  // The last lane of col0_hi is unused here.
+  col0_lo = vrhaddq_u8(vhaddq_u8(azl0, l1), l0);
+  col0_hi = vrhaddq_u8(vhaddq_u8(l15, l17), l16);
+
+  col0_lo = vrev64q_u8(vextq_u8(col0_lo, col0_lo, 8));
+  col0_hi = vrev64q_u8(vextq_u8(col0_hi, col0_hi, 8));
+
+  // The first lane of these are unused since they are only ever called as
+  // ext(col0, _, i) where i >= 1.
+  col0_even = vuzpq_u8(col0_hi, col0_lo).val[1];
+  col0_odd = vuzpq_u8(col0_hi, col0_lo).val[0];
+
+  vst1q_u8(dst + 0 * stride + 0, d0_lo);
+  vst1q_u8(dst + 0 * stride + 16, d0_hi);
+  vst1q_u8(dst + 1 * stride + 0, d1_lo);
+  vst1q_u8(dst + 1 * stride + 16, d1_hi);
+  vst1q_u8(dst + 2 * stride + 0, vextq_u8(col0_even, d0_lo, 15));
+  vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0_lo, d0_hi, 15));
+  vst1q_u8(dst + 3 * stride + 0, vextq_u8(col0_odd, d1_lo, 15));
+  vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_lo, d1_hi, 15));
+  vst1q_u8(dst + 4 * stride + 0, vextq_u8(col0_even, d0_lo, 14));
+  vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0_lo, d0_hi, 14));
+  vst1q_u8(dst + 5 * stride + 0, vextq_u8(col0_odd, d1_lo, 14));
+  vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_lo, d1_hi, 14));
+  vst1q_u8(dst + 6 * stride + 0, vextq_u8(col0_even, d0_lo, 13));
+  vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0_lo, d0_hi, 13));
+  vst1q_u8(dst + 7 * stride + 0, vextq_u8(col0_odd, d1_lo, 13));
+  vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_lo, d1_hi, 13));
+  vst1q_u8(dst + 8 * stride + 0, vextq_u8(col0_even, d0_lo, 12));
+  vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0_lo, d0_hi, 12));
+  vst1q_u8(dst + 9 * stride + 0, vextq_u8(col0_odd, d1_lo, 12));
+  vst1q_u8(dst + 9 * stride + 16, vextq_u8(d1_lo, d1_hi, 12));
+  vst1q_u8(dst + 10 * stride + 0, vextq_u8(col0_even, d0_lo, 11));
+  vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0_lo, d0_hi, 11));
+  vst1q_u8(dst + 11 * stride + 0, vextq_u8(col0_odd, d1_lo, 11));
+  vst1q_u8(dst + 11 * stride + 16, vextq_u8(d1_lo, d1_hi, 11));
+  vst1q_u8(dst + 12 * stride + 0, vextq_u8(col0_even, d0_lo, 10));
+  vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0_lo, d0_hi, 10));
+  vst1q_u8(dst + 13 * stride + 0, vextq_u8(col0_odd, d1_lo, 10));
+  vst1q_u8(dst + 13 * stride + 16, vextq_u8(d1_lo, d1_hi, 10));
+  vst1q_u8(dst + 14 * stride + 0, vextq_u8(col0_even, d0_lo, 9));
+  vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0_lo, d0_hi, 9));
+  vst1q_u8(dst + 15 * stride + 0, vextq_u8(col0_odd, d1_lo, 9));
+  vst1q_u8(dst + 15 * stride + 16, vextq_u8(d1_lo, d1_hi, 9));
+  vst1q_u8(dst + 16 * stride + 0, vextq_u8(col0_even, d0_lo, 8));
+  vst1q_u8(dst + 16 * stride + 16, vextq_u8(d0_lo, d0_hi, 8));
+  vst1q_u8(dst + 17 * stride + 0, vextq_u8(col0_odd, d1_lo, 8));
+  vst1q_u8(dst + 17 * stride + 16, vextq_u8(d1_lo, d1_hi, 8));
+  vst1q_u8(dst + 18 * stride + 0, vextq_u8(col0_even, d0_lo, 7));
+  vst1q_u8(dst + 18 * stride + 16, vextq_u8(d0_lo, d0_hi, 7));
+  vst1q_u8(dst + 19 * stride + 0, vextq_u8(col0_odd, d1_lo, 7));
+  vst1q_u8(dst + 19 * stride + 16, vextq_u8(d1_lo, d1_hi, 7));
+  vst1q_u8(dst + 20 * stride + 0, vextq_u8(col0_even, d0_lo, 6));
+  vst1q_u8(dst + 20 * stride + 16, vextq_u8(d0_lo, d0_hi, 6));
+  vst1q_u8(dst + 21 * stride + 0, vextq_u8(col0_odd, d1_lo, 6));
+  vst1q_u8(dst + 21 * stride + 16, vextq_u8(d1_lo, d1_hi, 6));
+  vst1q_u8(dst + 22 * stride + 0, vextq_u8(col0_even, d0_lo, 5));
+  vst1q_u8(dst + 22 * stride + 16, vextq_u8(d0_lo, d0_hi, 5));
+  vst1q_u8(dst + 23 * stride + 0, vextq_u8(col0_odd, d1_lo, 5));
+  vst1q_u8(dst + 23 * stride + 16, vextq_u8(d1_lo, d1_hi, 5));
+  vst1q_u8(dst + 24 * stride + 0, vextq_u8(col0_even, d0_lo, 4));
+  vst1q_u8(dst + 24 * stride + 16, vextq_u8(d0_lo, d0_hi, 4));
+  vst1q_u8(dst + 25 * stride + 0, vextq_u8(col0_odd, d1_lo, 4));
+  vst1q_u8(dst + 25 * stride + 16, vextq_u8(d1_lo, d1_hi, 4));
+  vst1q_u8(dst + 26 * stride + 0, vextq_u8(col0_even, d0_lo, 3));
+  vst1q_u8(dst + 26 * stride + 16, vextq_u8(d0_lo, d0_hi, 3));
+  vst1q_u8(dst + 27 * stride + 0, vextq_u8(col0_odd, d1_lo, 3));
+  vst1q_u8(dst + 27 * stride + 16, vextq_u8(d1_lo, d1_hi, 3));
+  vst1q_u8(dst + 28 * stride + 0, vextq_u8(col0_even, d0_lo, 2));
+  vst1q_u8(dst + 28 * stride + 16, vextq_u8(d0_lo, d0_hi, 2));
+  vst1q_u8(dst + 29 * stride + 0, vextq_u8(col0_odd, d1_lo, 2));
+  vst1q_u8(dst + 29 * stride + 16, vextq_u8(d1_lo, d1_hi, 2));
+  vst1q_u8(dst + 30 * stride + 0, vextq_u8(col0_even, d0_lo, 1));
+  vst1q_u8(dst + 30 * stride + 16, vextq_u8(d0_lo, d0_hi, 1));
+  vst1q_u8(dst + 31 * stride + 0, vextq_u8(col0_odd, d1_lo, 1));
+  vst1q_u8(dst + 31 * stride + 16, vextq_u8(d1_lo, d1_hi, 1));
 }
 
 // -----------------------------------------------------------------------------
@@ -390,22 +866,14 @@ void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
   const uint8x8_t L3210 = vrev64_u8(L0123);
   const uint8x8_t L3210XA012 = vext_u8(L3210, XA0123, 4);
   const uint8x8_t L210XA0123 = vext_u8(L3210, XA0123, 5);
-  const uint8x8_t L10XA0123_ =
-      vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(L210XA0123), 8));
+  const uint8x8_t L10XA0123_ = vext_u8(L210XA0123, L210XA0123, 1);
   const uint8x8_t avg1 = vhadd_u8(L10XA0123_, L3210XA012);
   const uint8x8_t avg2 = vrhadd_u8(avg1, L210XA0123);
-  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
-  const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
-  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
-  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
-  const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
-  vst1_lane_u32((uint32_t *)dst, r0, 0);
-  dst += stride;
-  vst1_lane_u32((uint32_t *)dst, r1, 0);
-  dst += stride;
-  vst1_lane_u32((uint32_t *)dst, r2, 0);
-  dst += stride;
-  vst1_lane_u32((uint32_t *)dst, r3, 0);
+
+  store_u8_4x1(dst + 0 * stride, vext_u8(avg2, avg2, 3));
+  store_u8_4x1(dst + 1 * stride, vext_u8(avg2, avg2, 2));
+  store_u8_4x1(dst + 2 * stride, vext_u8(avg2, avg2, 1));
+  store_u8_4x1(dst + 3 * stride, avg2);
 }
 
 void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
@@ -422,31 +890,15 @@ void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
   const uint8x16_t L543210XA01234567_ = vcombine_u8(L543210XA0, A1234567_);
   const uint8x16_t avg = vhaddq_u8(L76543210XA0123456, L543210XA01234567_);
   const uint8x16_t row = vrhaddq_u8(avg, L6543210XA01234567);
-  const uint8x8_t row_0 = vget_low_u8(row);
-  const uint8x8_t row_1 = vget_high_u8(row);
-  const uint8x8_t r0 = vext_u8(row_0, row_1, 7);
-  const uint8x8_t r1 = vext_u8(row_0, row_1, 6);
-  const uint8x8_t r2 = vext_u8(row_0, row_1, 5);
-  const uint8x8_t r3 = vext_u8(row_0, row_1, 4);
-  const uint8x8_t r4 = vext_u8(row_0, row_1, 3);
-  const uint8x8_t r5 = vext_u8(row_0, row_1, 2);
-  const uint8x8_t r6 = vext_u8(row_0, row_1, 1);
-
-  vst1_u8(dst, r0);
-  dst += stride;
-  vst1_u8(dst, r1);
-  dst += stride;
-  vst1_u8(dst, r2);
-  dst += stride;
-  vst1_u8(dst, r3);
-  dst += stride;
-  vst1_u8(dst, r4);
-  dst += stride;
-  vst1_u8(dst, r5);
-  dst += stride;
-  vst1_u8(dst, r6);
-  dst += stride;
-  vst1_u8(dst, row_0);
+
+  vst1_u8(dst + 0 * stride, vget_low_u8(vextq_u8(row, row, 7)));
+  vst1_u8(dst + 1 * stride, vget_low_u8(vextq_u8(row, row, 6)));
+  vst1_u8(dst + 2 * stride, vget_low_u8(vextq_u8(row, row, 5)));
+  vst1_u8(dst + 3 * stride, vget_low_u8(vextq_u8(row, row, 4)));
+  vst1_u8(dst + 4 * stride, vget_low_u8(vextq_u8(row, row, 3)));
+  vst1_u8(dst + 5 * stride, vget_low_u8(vextq_u8(row, row, 2)));
+  vst1_u8(dst + 6 * stride, vget_low_u8(vextq_u8(row, row, 1)));
+  vst1_u8(dst + 7 * stride, vget_low_u8(row));
 }
 
 static INLINE void d135_store_16x8(
@@ -489,6 +941,7 @@ void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
   const uint8x16_t avg_1 = vhaddq_u8(XA0123456789abcde, A123456789abcdef_);
   const uint8x16_t row_0 = vrhaddq_u8(avg_0, Ledcba9876543210X);
   const uint8x16_t row_1 = vrhaddq_u8(avg_1, A0123456789abcdef);
+
   const uint8x16_t r_0 = vextq_u8(row_0, row_1, 15);
   const uint8x16_t r_1 = vextq_u8(row_0, row_1, 14);
   const uint8x16_t r_2 = vextq_u8(row_0, row_1, 13);
@@ -496,7 +949,7 @@ void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
   const uint8x16_t r_4 = vextq_u8(row_0, row_1, 11);
   const uint8x16_t r_5 = vextq_u8(row_0, row_1, 10);
   const uint8x16_t r_6 = vextq_u8(row_0, row_1, 9);
-  const uint8x16_t r_7 = vcombine_u8(vget_high_u8(row_0), vget_low_u8(row_1));
+  const uint8x16_t r_7 = vextq_u8(row_0, row_1, 8);
   const uint8x16_t r_8 = vextq_u8(row_0, row_1, 7);
   const uint8x16_t r_9 = vextq_u8(row_0, row_1, 6);
   const uint8x16_t r_a = vextq_u8(row_0, row_1, 5);
@@ -667,6 +1120,454 @@ void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
   d135_store_32x2(&dst, stride, row_0, row_1, row_2);
 }
 
+// -----------------------------------------------------------------------------
+
+void vpx_d153_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  // See vpx_d153_predictor_8x8_neon for more details on the implementation.
+  uint8x8_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d02;
+
+  az = load_unaligned_u8_4x1(above - 1);
+  a0 = load_unaligned_u8_4x1(above + 0);
+  // [ left[0], above[-1], above[0], above[1], x, x, x, x ]
+  l0az = vext_u8(vld1_dup_u8(left), az, 7);
+
+  l0 = load_unaligned_u8_4x1(left + 0);
+  l1 = load_unaligned_u8_4x1(left + 1);
+  // [ above[-1], left[0], left[1], left[2], x, x, x, x ]
+  azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7);
+
+  d0 = vrhadd_u8(azl0, l0);
+  d1 = vrhadd_u8(vhadd_u8(l0az, a0), az);
+  d2 = vrhadd_u8(vhadd_u8(azl0, l1), l0);
+
+  d02 = vrev64_u8(vzip_u8(d0, d2).val[0]);
+
+  store_u8_4x1(dst + 0 * stride, vext_u8(d02, d1, 7));
+  store_u8_4x1(dst + 1 * stride, vext_u8(d02, d1, 5));
+  store_u8_4x1(dst + 2 * stride, vext_u8(d02, d1, 3));
+  store_u8_4x1(dst + 3 * stride, vext_u8(d02, d1, 1));
+}
+
+void vpx_d153_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  uint8x8_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d02_lo, d02_hi;
+
+  az = vld1_u8(above - 1);
+  a0 = vld1_u8(above + 0);
+  // [ left[0], above[-1], ... , above[5] ]
+  l0az = vext_u8(vld1_dup_u8(left), az, 7);
+
+  l0 = vld1_u8(left);
+  // The last lane here is unused, reading left[8] could cause a buffer
+  // over-read, so just fill with a duplicate of left[0] to avoid needing to
+  // materialize a zero:
+  // [ left[1], ... , left[7], x ]
+  l1 = vext_u8(l0, l0, 1);
+  // [ above[-1], left[0], ... , left[6] ]
+  azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7);
+
+  // d0[0] = AVG2(above[-1], left[0])
+  // d0[1] = AVG2(left[0], left[1])
+  // ...
+  // d0[7] = AVG2(left[6], left[7])
+  d0 = vrhadd_u8(azl0, l0);
+
+  // d1[0] = AVG3(left[0], above[-1], above[0])
+  // d1[1] = AVG3(above[-1], above[0], above[1])
+  // ...
+  // d1[7] = AVG3(above[5], above[6], above[7])
+  d1 = vrhadd_u8(vhadd_u8(l0az, a0), az);
+
+  // d2[0] = AVG3(above[-1], left[0], left[1])
+  // d2[1] = AVG3(left[0], left[1], left[2])
+  // ...
+  // d2[6] = AVG3(left[5], left[6], left[7])
+  // d2[7] = x (don't care)
+  d2 = vrhadd_u8(vhadd_u8(azl0, l1), l0);
+
+  // The ext instruction shifts elements in from the end of the vector rather
+  // than the start, so reverse the vectors to put the elements to be shifted
+  // in at the end. The lowest lane of d02_lo is unused.
+  d02_lo = vzip_u8(vrev64_u8(d2), vrev64_u8(d0)).val[0];
+  d02_hi = vzip_u8(vrev64_u8(d2), vrev64_u8(d0)).val[1];
+
+  // Incrementally shift more elements from d0/d2 reversed into d1:
+  // stride=0 [ d0[0], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ]
+  // stride=1 [ d0[1], d2[0], d0[0], d1[0], d1[1], d1[2], d1[3], d1[4] ]
+  // stride=2 [ d0[2], d2[1], d0[1], d2[0], d0[0], d1[0], d1[1], d1[2] ]
+  // stride=3 [ d0[3], d2[2], d0[2], d2[1], d0[1], d2[0], d0[0], d1[0] ]
+  // stride=4 [ d0[4], d2[3], d0[3], d2[2], d0[2], d2[1], d0[1], d2[0] ]
+  // stride=5 [ d0[5], d2[4], d0[4], d2[3], d0[3], d2[2], d0[2], d2[1] ]
+  // stride=6 [ d0[6], d2[5], d0[5], d2[4], d0[4], d2[3], d0[3], d2[2] ]
+  // stride=7 [ d0[7], d2[6], d0[6], d2[5], d0[5], d2[4], d0[4], d2[3] ]
+  vst1_u8(dst + 0 * stride, vext_u8(d02_hi, d1, 7));
+  vst1_u8(dst + 1 * stride, vext_u8(d02_hi, d1, 5));
+  vst1_u8(dst + 2 * stride, vext_u8(d02_hi, d1, 3));
+  vst1_u8(dst + 3 * stride, vext_u8(d02_hi, d1, 1));
+  vst1_u8(dst + 4 * stride, vext_u8(d02_lo, d02_hi, 7));
+  vst1_u8(dst + 5 * stride, vext_u8(d02_lo, d02_hi, 5));
+  vst1_u8(dst + 6 * stride, vext_u8(d02_lo, d02_hi, 3));
+  vst1_u8(dst + 7 * stride, vext_u8(d02_lo, d02_hi, 1));
+}
+
+void vpx_d153_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  // See vpx_d153_predictor_8x8_neon for more details on the implementation.
+  uint8x16_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d02_lo, d02_hi;
+
+  az = vld1q_u8(above - 1);
+  a0 = vld1q_u8(above + 0);
+  // [ left[0], above[-1], ... , above[13] ]
+  l0az = vextq_u8(vld1q_dup_u8(left), az, 15);
+
+  l0 = vld1q_u8(left + 0);
+  // The last lane here is unused, reading left[16] could cause a buffer
+  // over-read, so just fill with a duplicate of left[0] to avoid needing to
+  // materialize a zero:
+  // [ left[1], ... , left[15], x ]
+  l1 = vextq_u8(l0, l0, 1);
+  // [ above[-1], left[0], ... , left[14] ]
+  azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15);
+
+  d0 = vrhaddq_u8(azl0, l0);
+  d1 = vrhaddq_u8(vhaddq_u8(l0az, a0), az);
+  d2 = vrhaddq_u8(vhaddq_u8(azl0, l1), l0);
+
+  d0 = vrev64q_u8(vextq_u8(d0, d0, 8));
+  d2 = vrev64q_u8(vextq_u8(d2, d2, 8));
+
+  // The lowest lane of d02_lo is unused.
+  d02_lo = vzipq_u8(d2, d0).val[0];
+  d02_hi = vzipq_u8(d2, d0).val[1];
+
+  vst1q_u8(dst + 0 * stride, vextq_u8(d02_hi, d1, 15));
+  vst1q_u8(dst + 1 * stride, vextq_u8(d02_hi, d1, 13));
+  vst1q_u8(dst + 2 * stride, vextq_u8(d02_hi, d1, 11));
+  vst1q_u8(dst + 3 * stride, vextq_u8(d02_hi, d1, 9));
+  vst1q_u8(dst + 4 * stride, vextq_u8(d02_hi, d1, 7));
+  vst1q_u8(dst + 5 * stride, vextq_u8(d02_hi, d1, 5));
+  vst1q_u8(dst + 6 * stride, vextq_u8(d02_hi, d1, 3));
+  vst1q_u8(dst + 7 * stride, vextq_u8(d02_hi, d1, 1));
+  vst1q_u8(dst + 8 * stride, vextq_u8(d02_lo, d02_hi, 15));
+  vst1q_u8(dst + 9 * stride, vextq_u8(d02_lo, d02_hi, 13));
+  vst1q_u8(dst + 10 * stride, vextq_u8(d02_lo, d02_hi, 11));
+  vst1q_u8(dst + 11 * stride, vextq_u8(d02_lo, d02_hi, 9));
+  vst1q_u8(dst + 12 * stride, vextq_u8(d02_lo, d02_hi, 7));
+  vst1q_u8(dst + 13 * stride, vextq_u8(d02_lo, d02_hi, 5));
+  vst1q_u8(dst + 14 * stride, vextq_u8(d02_lo, d02_hi, 3));
+  vst1q_u8(dst + 15 * stride, vextq_u8(d02_lo, d02_hi, 1));
+}
+
+void vpx_d153_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  // See vpx_d153_predictor_8x8_neon for more details on the implementation.
+  uint8x16_t az, a0, a14, a15, a16, l0az, l0, l1, l15, l16, l17, azl0, d0_lo,
+      d0_hi, d1_lo, d1_hi, d2_lo, d2_hi;
+  uint8x16x2_t d02_hi, d02_lo;
+
+  az = vld1q_u8(above - 1);
+  a0 = vld1q_u8(above + 0);
+  a14 = vld1q_u8(above + 14);
+  a15 = vld1q_u8(above + 15);
+  a16 = vld1q_u8(above + 16);
+  // [ left[0], above[-1], ... , above[13] ]
+  l0az = vextq_u8(vld1q_dup_u8(left), az, 15);
+
+  l0 = vld1q_u8(left);
+  l1 = vld1q_u8(left + 1);
+  l15 = vld1q_u8(left + 15);
+  l16 = vld1q_u8(left + 16);
+  // The last lane here is unused, reading left[32] would cause a buffer
+  // over-read (observed as an address-sanitizer failure), so just fill with a
+  // duplicate of left[16] to avoid needing to materialize a zero:
+  // [ left[17], ... , left[31], x ]
+  l17 = vextq_u8(l16, l16, 1);
+  // [ above[-1], left[0], ... , left[14] ]
+  azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15);
+
+  d0_lo = vrhaddq_u8(azl0, l0);
+  d0_hi = vrhaddq_u8(l15, l16);
+
+  d1_lo = vrhaddq_u8(vhaddq_u8(l0az, a0), az);
+  d1_hi = vrhaddq_u8(vhaddq_u8(a14, a16), a15);
+
+  // The highest lane of d2_hi is unused.
+  d2_lo = vrhaddq_u8(vhaddq_u8(azl0, l1), l0);
+  d2_hi = vrhaddq_u8(vhaddq_u8(l15, l17), l16);
+
+  d0_lo = vrev64q_u8(vextq_u8(d0_lo, d0_lo, 8));
+  d0_hi = vrev64q_u8(vextq_u8(d0_hi, d0_hi, 8));
+
+  d2_lo = vrev64q_u8(vextq_u8(d2_lo, d2_lo, 8));
+  d2_hi = vrev64q_u8(vextq_u8(d2_hi, d2_hi, 8));
+
+  // d02_hi.val[0][0] is unused here.
+  d02_hi = vzipq_u8(d2_hi, d0_hi);
+  d02_lo = vzipq_u8(d2_lo, d0_lo);
+
+  vst1q_u8(dst + 0 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 15));
+  vst1q_u8(dst + 0 * stride + 16, vextq_u8(d1_lo, d1_hi, 15));
+  vst1q_u8(dst + 1 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 13));
+  vst1q_u8(dst + 1 * stride + 16, vextq_u8(d1_lo, d1_hi, 13));
+  vst1q_u8(dst + 2 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 11));
+  vst1q_u8(dst + 2 * stride + 16, vextq_u8(d1_lo, d1_hi, 11));
+  vst1q_u8(dst + 3 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 9));
+  vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_lo, d1_hi, 9));
+  vst1q_u8(dst + 4 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 7));
+  vst1q_u8(dst + 4 * stride + 16, vextq_u8(d1_lo, d1_hi, 7));
+  vst1q_u8(dst + 5 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 5));
+  vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_lo, d1_hi, 5));
+  vst1q_u8(dst + 6 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 3));
+  vst1q_u8(dst + 6 * stride + 16, vextq_u8(d1_lo, d1_hi, 3));
+  vst1q_u8(dst + 7 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 1));
+  vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_lo, d1_hi, 1));
+  vst1q_u8(dst + 8 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 15));
+  vst1q_u8(dst + 8 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 15));
+  vst1q_u8(dst + 9 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 13));
+  vst1q_u8(dst + 9 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 13));
+  vst1q_u8(dst + 10 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 11));
+  vst1q_u8(dst + 10 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 11));
+  vst1q_u8(dst + 11 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 9));
+  vst1q_u8(dst + 11 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 9));
+  vst1q_u8(dst + 12 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 7));
+  vst1q_u8(dst + 12 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 7));
+  vst1q_u8(dst + 13 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 5));
+  vst1q_u8(dst + 13 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 5));
+  vst1q_u8(dst + 14 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 3));
+  vst1q_u8(dst + 14 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 3));
+  vst1q_u8(dst + 15 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 1));
+  vst1q_u8(dst + 15 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 1));
+  vst1q_u8(dst + 16 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 15));
+  vst1q_u8(dst + 16 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 15));
+  vst1q_u8(dst + 17 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 13));
+  vst1q_u8(dst + 17 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 13));
+  vst1q_u8(dst + 18 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 11));
+  vst1q_u8(dst + 18 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 11));
+  vst1q_u8(dst + 19 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 9));
+  vst1q_u8(dst + 19 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 9));
+  vst1q_u8(dst + 20 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 7));
+  vst1q_u8(dst + 20 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 7));
+  vst1q_u8(dst + 21 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 5));
+  vst1q_u8(dst + 21 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 5));
+  vst1q_u8(dst + 22 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 3));
+  vst1q_u8(dst + 22 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 3));
+  vst1q_u8(dst + 23 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 1));
+  vst1q_u8(dst + 23 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 1));
+  vst1q_u8(dst + 24 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 15));
+  vst1q_u8(dst + 24 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 15));
+  vst1q_u8(dst + 25 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 13));
+  vst1q_u8(dst + 25 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 13));
+  vst1q_u8(dst + 26 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 11));
+  vst1q_u8(dst + 26 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 11));
+  vst1q_u8(dst + 27 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 9));
+  vst1q_u8(dst + 27 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 9));
+  vst1q_u8(dst + 28 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 7));
+  vst1q_u8(dst + 28 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 7));
+  vst1q_u8(dst + 29 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 5));
+  vst1q_u8(dst + 29 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 5));
+  vst1q_u8(dst + 30 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 3));
+  vst1q_u8(dst + 30 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 3));
+  vst1q_u8(dst + 31 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 1));
+  vst1q_u8(dst + 31 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 1));
+}
+
+// -----------------------------------------------------------------------------
+
+void vpx_d207_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  uint8x8_t l0, l3, l1, l2, c0, c1, c01, d0, d1;
+  (void)above;
+
+  // We need the low half lanes here for the c0/c1 arithmetic but the high half
+  // lanes for the ext:
+  // [ left[0], left[1], left[2], left[3], left[0], left[1], left[2], left[3] ]
+  l0 = load_replicate_u8_4x1(left + 0);
+  l3 = vld1_dup_u8(left + 3);
+
+  // [ left[1], left[2], left[3], left[3], x, x, x, x ]
+  l1 = vext_u8(l0, l3, 5);
+  // [ left[2], left[3], left[3], left[3], x, x, x, x ]
+  l2 = vext_u8(l0, l3, 6);
+
+  c0 = vrhadd_u8(l0, l1);
+  c1 = vrhadd_u8(vhadd_u8(l0, l2), l1);
+
+  // [ c0[0], c1[0], c0[1], c1[1], c0[2], c1[2], c0[3], c1[3] ]
+  c01 = vzip_u8(c0, c1).val[0];
+
+  d0 = c01;
+  d1 = vext_u8(c01, l3, 2);
+
+  // Store the high half of the vector for stride={2,3} to avoid needing
+  // additional ext instructions:
+  // stride=0 [ c0[0], c1[0],   c0[1],   c1[1] ]
+  // stride=1 [ c0[1], c1[1],   c0[2],   c1[2] ]
+  // stride=2 [ c0[2], c1[2],   c0[3],   c1[3] ]
+  // stride=3 [ c0[3], c1[3], left[3], left[3] ]
+  store_u8_4x1(dst + 0 * stride, d0);
+  store_u8_4x1(dst + 1 * stride, d1);
+  store_u8_4x1_high(dst + 2 * stride, d0);
+  store_u8_4x1_high(dst + 3 * stride, d1);
+}
+
+void vpx_d207_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  uint8x8_t l7, l0, l1, l2, c0, c1, c01_lo, c01_hi;
+  (void)above;
+
+  l0 = vld1_u8(left + 0);
+  l7 = vld1_dup_u8(left + 7);
+
+  // [ left[1], left[2], left[3], left[4], left[5], left[6], left[7], left[7] ]
+  l1 = vext_u8(l0, l7, 1);
+  // [ left[2], left[3], left[4], left[5], left[6], left[7], left[7], left[7] ]
+  l2 = vext_u8(l0, l7, 2);
+
+  c0 = vrhadd_u8(l0, l1);
+  c1 = vrhadd_u8(vhadd_u8(l0, l2), l1);
+
+  c01_lo = vzip_u8(c0, c1).val[0];
+  c01_hi = vzip_u8(c0, c1).val[1];
+
+  vst1_u8(dst + 0 * stride, c01_lo);
+  vst1_u8(dst + 1 * stride, vext_u8(c01_lo, c01_hi, 2));
+  vst1_u8(dst + 2 * stride, vext_u8(c01_lo, c01_hi, 4));
+  vst1_u8(dst + 3 * stride, vext_u8(c01_lo, c01_hi, 6));
+  vst1_u8(dst + 4 * stride, c01_hi);
+  vst1_u8(dst + 5 * stride, vext_u8(c01_hi, l7, 2));
+  vst1_u8(dst + 6 * stride, vext_u8(c01_hi, l7, 4));
+  vst1_u8(dst + 7 * stride, vext_u8(c01_hi, l7, 6));
+}
+
+void vpx_d207_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  uint8x16_t l15, l0, l1, l2, c0, c1, c01_lo, c01_hi;
+  (void)above;
+
+  l0 = vld1q_u8(left + 0);
+  l15 = vld1q_dup_u8(left + 15);
+
+  l1 = vextq_u8(l0, l15, 1);
+  l2 = vextq_u8(l0, l15, 2);
+
+  c0 = vrhaddq_u8(l0, l1);
+  c1 = vrhaddq_u8(vhaddq_u8(l0, l2), l1);
+
+  c01_lo = vzipq_u8(c0, c1).val[0];
+  c01_hi = vzipq_u8(c0, c1).val[1];
+
+  vst1q_u8(dst + 0 * stride, c01_lo);
+  vst1q_u8(dst + 1 * stride, vextq_u8(c01_lo, c01_hi, 2));
+  vst1q_u8(dst + 2 * stride, vextq_u8(c01_lo, c01_hi, 4));
+  vst1q_u8(dst + 3 * stride, vextq_u8(c01_lo, c01_hi, 6));
+  vst1q_u8(dst + 4 * stride, vextq_u8(c01_lo, c01_hi, 8));
+  vst1q_u8(dst + 5 * stride, vextq_u8(c01_lo, c01_hi, 10));
+  vst1q_u8(dst + 6 * stride, vextq_u8(c01_lo, c01_hi, 12));
+  vst1q_u8(dst + 7 * stride, vextq_u8(c01_lo, c01_hi, 14));
+  vst1q_u8(dst + 8 * stride, c01_hi);
+  vst1q_u8(dst + 9 * stride, vextq_u8(c01_hi, l15, 2));
+  vst1q_u8(dst + 10 * stride, vextq_u8(c01_hi, l15, 4));
+  vst1q_u8(dst + 11 * stride, vextq_u8(c01_hi, l15, 6));
+  vst1q_u8(dst + 12 * stride, vextq_u8(c01_hi, l15, 8));
+  vst1q_u8(dst + 13 * stride, vextq_u8(c01_hi, l15, 10));
+  vst1q_u8(dst + 14 * stride, vextq_u8(c01_hi, l15, 12));
+  vst1q_u8(dst + 15 * stride, vextq_u8(c01_hi, l15, 14));
+}
+
+void vpx_d207_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  uint8x16_t l0_lo, l0_hi, l1_lo, l1_hi, l2_lo, l2_hi, l31, c0_lo, c0_hi, c1_lo,
+      c1_hi, c01[4];
+  (void)above;
+
+  l0_lo = vld1q_u8(left + 0);
+  l0_hi = vld1q_u8(left + 16);
+  l31 = vld1q_dup_u8(left + 31);
+
+  l1_lo = vextq_u8(l0_lo, l0_hi, 1);
+  l1_hi = vextq_u8(l0_hi, l31, 1);
+  l2_lo = vextq_u8(l0_lo, l0_hi, 2);
+  l2_hi = vextq_u8(l0_hi, l31, 2);
+
+  c0_lo = vrhaddq_u8(l0_lo, l1_lo);
+  c0_hi = vrhaddq_u8(l0_hi, l1_hi);
+  c1_lo = vrhaddq_u8(vhaddq_u8(l0_lo, l2_lo), l1_lo);
+  c1_hi = vrhaddq_u8(vhaddq_u8(l0_hi, l2_hi), l1_hi);
+
+  c01[0] = vzipq_u8(c0_lo, c1_lo).val[0];
+  c01[1] = vzipq_u8(c0_lo, c1_lo).val[1];
+  c01[2] = vzipq_u8(c0_hi, c1_hi).val[0];
+  c01[3] = vzipq_u8(c0_hi, c1_hi).val[1];
+
+  vst1q_u8(dst + 0 * stride + 0, c01[0]);
+  vst1q_u8(dst + 0 * stride + 16, c01[1]);
+  vst1q_u8(dst + 1 * stride + 0, vextq_u8(c01[0], c01[1], 2));
+  vst1q_u8(dst + 1 * stride + 16, vextq_u8(c01[1], c01[2], 2));
+  vst1q_u8(dst + 2 * stride + 0, vextq_u8(c01[0], c01[1], 4));
+  vst1q_u8(dst + 2 * stride + 16, vextq_u8(c01[1], c01[2], 4));
+  vst1q_u8(dst + 3 * stride + 0, vextq_u8(c01[0], c01[1], 6));
+  vst1q_u8(dst + 3 * stride + 16, vextq_u8(c01[1], c01[2], 6));
+  vst1q_u8(dst + 4 * stride + 0, vextq_u8(c01[0], c01[1], 8));
+  vst1q_u8(dst + 4 * stride + 16, vextq_u8(c01[1], c01[2], 8));
+  vst1q_u8(dst + 5 * stride + 0, vextq_u8(c01[0], c01[1], 10));
+  vst1q_u8(dst + 5 * stride + 16, vextq_u8(c01[1], c01[2], 10));
+  vst1q_u8(dst + 6 * stride + 0, vextq_u8(c01[0], c01[1], 12));
+  vst1q_u8(dst + 6 * stride + 16, vextq_u8(c01[1], c01[2], 12));
+  vst1q_u8(dst + 7 * stride + 0, vextq_u8(c01[0], c01[1], 14));
+  vst1q_u8(dst + 7 * stride + 16, vextq_u8(c01[1], c01[2], 14));
+  vst1q_u8(dst + 8 * stride + 0, c01[1]);
+  vst1q_u8(dst + 8 * stride + 16, c01[2]);
+  vst1q_u8(dst + 9 * stride + 0, vextq_u8(c01[1], c01[2], 2));
+  vst1q_u8(dst + 9 * stride + 16, vextq_u8(c01[2], c01[3], 2));
+  vst1q_u8(dst + 10 * stride + 0, vextq_u8(c01[1], c01[2], 4));
+  vst1q_u8(dst + 10 * stride + 16, vextq_u8(c01[2], c01[3], 4));
+  vst1q_u8(dst + 11 * stride + 0, vextq_u8(c01[1], c01[2], 6));
+  vst1q_u8(dst + 11 * stride + 16, vextq_u8(c01[2], c01[3], 6));
+  vst1q_u8(dst + 12 * stride + 0, vextq_u8(c01[1], c01[2], 8));
+  vst1q_u8(dst + 12 * stride + 16, vextq_u8(c01[2], c01[3], 8));
+  vst1q_u8(dst + 13 * stride + 0, vextq_u8(c01[1], c01[2], 10));
+  vst1q_u8(dst + 13 * stride + 16, vextq_u8(c01[2], c01[3], 10));
+  vst1q_u8(dst + 14 * stride + 0, vextq_u8(c01[1], c01[2], 12));
+  vst1q_u8(dst + 14 * stride + 16, vextq_u8(c01[2], c01[3], 12));
+  vst1q_u8(dst + 15 * stride + 0, vextq_u8(c01[1], c01[2], 14));
+  vst1q_u8(dst + 15 * stride + 16, vextq_u8(c01[2], c01[3], 14));
+  vst1q_u8(dst + 16 * stride + 0, c01[2]);
+  vst1q_u8(dst + 16 * stride + 16, c01[3]);
+  vst1q_u8(dst + 17 * stride + 0, vextq_u8(c01[2], c01[3], 2));
+  vst1q_u8(dst + 17 * stride + 16, vextq_u8(c01[3], l31, 2));
+  vst1q_u8(dst + 18 * stride + 0, vextq_u8(c01[2], c01[3], 4));
+  vst1q_u8(dst + 18 * stride + 16, vextq_u8(c01[3], l31, 4));
+  vst1q_u8(dst + 19 * stride + 0, vextq_u8(c01[2], c01[3], 6));
+  vst1q_u8(dst + 19 * stride + 16, vextq_u8(c01[3], l31, 6));
+  vst1q_u8(dst + 20 * stride + 0, vextq_u8(c01[2], c01[3], 8));
+  vst1q_u8(dst + 20 * stride + 16, vextq_u8(c01[3], l31, 8));
+  vst1q_u8(dst + 21 * stride + 0, vextq_u8(c01[2], c01[3], 10));
+  vst1q_u8(dst + 21 * stride + 16, vextq_u8(c01[3], l31, 10));
+  vst1q_u8(dst + 22 * stride + 0, vextq_u8(c01[2], c01[3], 12));
+  vst1q_u8(dst + 22 * stride + 16, vextq_u8(c01[3], l31, 12));
+  vst1q_u8(dst + 23 * stride + 0, vextq_u8(c01[2], c01[3], 14));
+  vst1q_u8(dst + 23 * stride + 16, vextq_u8(c01[3], l31, 14));
+  vst1q_u8(dst + 24 * stride + 0, c01[3]);
+  vst1q_u8(dst + 24 * stride + 16, l31);
+  vst1q_u8(dst + 25 * stride + 0, vextq_u8(c01[3], l31, 2));
+  vst1q_u8(dst + 25 * stride + 16, l31);
+  vst1q_u8(dst + 26 * stride + 0, vextq_u8(c01[3], l31, 4));
+  vst1q_u8(dst + 26 * stride + 16, l31);
+  vst1q_u8(dst + 27 * stride + 0, vextq_u8(c01[3], l31, 6));
+  vst1q_u8(dst + 27 * stride + 16, l31);
+  vst1q_u8(dst + 28 * stride + 0, vextq_u8(c01[3], l31, 8));
+  vst1q_u8(dst + 28 * stride + 16, l31);
+  vst1q_u8(dst + 29 * stride + 0, vextq_u8(c01[3], l31, 10));
+  vst1q_u8(dst + 29 * stride + 16, l31);
+  vst1q_u8(dst + 30 * stride + 0, vextq_u8(c01[3], l31, 12));
+  vst1q_u8(dst + 30 * stride + 16, l31);
+  vst1q_u8(dst + 31 * stride + 0, vextq_u8(c01[3], l31, 14));
+  vst1q_u8(dst + 31 * stride + 16, l31);
+}
+
+// -----------------------------------------------------------------------------
+
 #if !HAVE_NEON_ASM
 
 void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
diff --git a/vpx_dsp/arm/loopfilter_neon.c b/vpx_dsp/arm/loopfilter_neon.c
index 7419cea022..c54e588239 100644
--- a/vpx_dsp/arm/loopfilter_neon.c
+++ b/vpx_dsp/arm/loopfilter_neon.c
@@ -975,6 +975,17 @@ FUN_LPF_16_KERNEL(_, 8)        // lpf_16_kernel
 FUN_LPF_16_KERNEL(_dual_, 16)  // lpf_16_dual_kernel
 #undef FUN_LPF_16_KERNEL
 
+// Quiet warnings of the form: 'vpx_dsp/arm/loopfilter_neon.c|981 col 42|
+// warning: 'oq1' may be used uninitialized in this function
+// [-Wmaybe-uninitialized]', for oq1-op1. Without reworking the code or adding
+// an additional branch this warning cannot be silenced otherwise. The
+// loopfilter is only called when needed for a block so these output pixels
+// will be set.
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+
 void vpx_lpf_horizontal_16_neon(uint8_t *s, int p, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh) {
   uint8x8_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6,
@@ -1090,3 +1101,7 @@ void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
               vget_high_u8(oq0), vget_high_u8(oq1));
   }
 }
+
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h
index 943865b3c2..38b0b6c1a9 100644
--- a/vpx_dsp/arm/mem_neon.h
+++ b/vpx_dsp/arm/mem_neon.h
@@ -19,6 +19,24 @@
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 
+// Support for these xN intrinsics is lacking in older versions of GCC.
+#if defined(__GNUC__) && !defined(__clang__)
+#if __GNUC__ < 8 || defined(__arm__)
+static INLINE uint8x16x2_t vld1q_u8_x2(uint8_t const *ptr) {
+  uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
+  return res;
+}
+#endif
+
+#if __GNUC__ < 9 || defined(__arm__)
+static INLINE uint8x16x3_t vld1q_u8_x3(uint8_t const *ptr) {
+  uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
+                         vld1q_u8(ptr + 2 * 16) } };
+  return res;
+}
+#endif
+#endif
+
 static INLINE int16x4_t create_s16x4_neon(const int16_t c0, const int16_t c1,
                                           const int16_t c2, const int16_t c3) {
   return vcreate_s16((uint16_t)c0 | ((uint32_t)c1 << 16) |
@@ -84,6 +102,16 @@ static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
 #endif
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void store_s32q_to_tran_low(tran_low_t *buf, const int32x4_t a) {
+  vst1q_s32(buf, a);
+}
+
+static INLINE int32x4_t load_tran_low_to_s32q(const tran_low_t *buf) {
+  return vld1q_s32(buf);
+}
+#endif
+
 // Propagate type information to the compiler. Without this the compiler may
 // assume the required alignment of uint32_t (4 bytes) and add alignment hints
 // to the memory access.
@@ -94,21 +122,73 @@ static INLINE void uint32_to_mem(uint8_t *buf, uint32_t a) {
   memcpy(buf, &a, 4);
 }
 
+// Load 4 contiguous bytes when alignment is not guaranteed.
+static INLINE uint8x8_t load_unaligned_u8_4x1(const uint8_t *buf) {
+  uint32_t a;
+  uint32x2_t a_u32;
+  memcpy(&a, buf, 4);
+  a_u32 = vdup_n_u32(0);
+  a_u32 = vset_lane_u32(a, a_u32, 0);
+  return vreinterpret_u8_u32(a_u32);
+}
+
+// Load 4 contiguous bytes and replicate across a vector when alignment is not
+// guaranteed.
+static INLINE uint8x8_t load_replicate_u8_4x1(const uint8_t *buf) {
+  uint32_t a;
+  memcpy(&a, buf, 4);
+  return vreinterpret_u8_u32(vdup_n_u32(a));
+}
+
+// Store 4 contiguous bytes from the low half of an 8x8 vector.
+static INLINE void store_u8_4x1(uint8_t *buf, uint8x8_t a) {
+  vst1_lane_u32((uint32_t *)buf, vreinterpret_u32_u8(a), 0);
+}
+
+// Store 4 contiguous bytes from the high half of an 8x8 vector.
+static INLINE void store_u8_4x1_high(uint8_t *buf, uint8x8_t a) {
+  vst1_lane_u32((uint32_t *)buf, vreinterpret_u32_u8(a), 1);
+}
+
 // Load 2 sets of 4 bytes when alignment is not guaranteed.
-static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) {
+static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf,
+                                          ptrdiff_t stride) {
   uint32_t a;
-  uint32x2_t a_u32 = vdup_n_u32(0);
+  uint32x2_t a_u32;
   if (stride == 4) return vld1_u8(buf);
   memcpy(&a, buf, 4);
   buf += stride;
-  a_u32 = vset_lane_u32(a, a_u32, 0);
+  a_u32 = vdup_n_u32(a);
   memcpy(&a, buf, 4);
   a_u32 = vset_lane_u32(a, a_u32, 1);
   return vreinterpret_u8_u32(a_u32);
 }
 
+// Load 8 bytes when alignment is not guaranteed.
+static INLINE uint16x4_t load_unaligned_u16(const uint16_t *buf) {
+  uint64_t a;
+  uint64x1_t a_u64 = vdup_n_u64(0);
+  memcpy(&a, buf, 8);
+  a_u64 = vset_lane_u64(a, a_u64, 0);
+  return vreinterpret_u16_u64(a_u64);
+}
+
+// Load 2 sets of 8 bytes when alignment is not guaranteed.
+static INLINE uint16x8_t load_unaligned_u16q(const uint16_t *buf,
+                                             ptrdiff_t stride) {
+  uint64_t a;
+  uint64x2_t a_u64;
+  if (stride == 4) return vld1q_u16(buf);
+  memcpy(&a, buf, 8);
+  buf += stride;
+  a_u64 = vdupq_n_u64(a);
+  memcpy(&a, buf, 8);
+  a_u64 = vsetq_lane_u64(a, a_u64, 1);
+  return vreinterpretq_u16_u64(a_u64);
+}
+
 // Store 2 sets of 4 bytes when alignment is not guaranteed.
-static INLINE void store_unaligned_u8(uint8_t *buf, int stride,
+static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride,
                                       const uint8x8_t a) {
   const uint32x2_t a_u32 = vreinterpret_u32_u8(a);
   if (stride == 4) {
@@ -121,13 +201,14 @@ static INLINE void store_unaligned_u8(uint8_t *buf, int stride,
 }
 
 // Load 4 sets of 4 bytes when alignment is not guaranteed.
-static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
+static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf,
+                                            ptrdiff_t stride) {
   uint32_t a;
-  uint32x4_t a_u32 = vdupq_n_u32(0);
+  uint32x4_t a_u32;
   if (stride == 4) return vld1q_u8(buf);
   memcpy(&a, buf, 4);
   buf += stride;
-  a_u32 = vsetq_lane_u32(a, a_u32, 0);
+  a_u32 = vdupq_n_u32(a);
   memcpy(&a, buf, 4);
   buf += stride;
   a_u32 = vsetq_lane_u32(a, a_u32, 1);
@@ -141,7 +222,7 @@ static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
 }
 
 // Store 4 sets of 4 bytes when alignment is not guaranteed.
-static INLINE void store_unaligned_u8q(uint8_t *buf, int stride,
+static INLINE void store_unaligned_u8q(uint8_t *buf, ptrdiff_t stride,
                                        const uint8x16_t a) {
   const uint32x4_t a_u32 = vreinterpretq_u32_u8(a);
   if (stride == 4) {
@@ -158,7 +239,7 @@ static INLINE void store_unaligned_u8q(uint8_t *buf, int stride,
 }
 
 // Load 2 sets of 4 bytes when alignment is guaranteed.
-static INLINE uint8x8_t load_u8(const uint8_t *buf, int stride) {
+static INLINE uint8x8_t load_u8(const uint8_t *buf, ptrdiff_t stride) {
   uint32x2_t a = vdup_n_u32(0);
 
   assert(!((intptr_t)buf % sizeof(uint32_t)));
@@ -171,7 +252,7 @@ static INLINE uint8x8_t load_u8(const uint8_t *buf, int stride) {
 }
 
 // Store 2 sets of 4 bytes when alignment is guaranteed.
-static INLINE void store_u8(uint8_t *buf, int stride, const uint8x8_t a) {
+static INLINE void store_u8(uint8_t *buf, ptrdiff_t stride, const uint8x8_t a) {
   uint32x2_t a_u32 = vreinterpret_u32_u8(a);
 
   assert(!((intptr_t)buf % sizeof(uint32_t)));
@@ -181,4 +262,212 @@ static INLINE void store_u8(uint8_t *buf, int stride, const uint8x8_t a) {
   buf += stride;
   vst1_lane_u32((uint32_t *)buf, a_u32, 1);
 }
+
+static INLINE void store_u8_8x3(uint8_t *s, const ptrdiff_t p,
+                                const uint8x8_t s0, const uint8x8_t s1,
+                                const uint8x8_t s2) {
+  vst1_u8(s, s0);
+  s += p;
+  vst1_u8(s, s1);
+  s += p;
+  vst1_u8(s, s2);
+}
+
+static INLINE void load_u8_8x3(const uint8_t *s, const ptrdiff_t p,
+                               uint8x8_t *const s0, uint8x8_t *const s1,
+                               uint8x8_t *const s2) {
+  *s0 = vld1_u8(s);
+  s += p;
+  *s1 = vld1_u8(s);
+  s += p;
+  *s2 = vld1_u8(s);
+}
+
+static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
+                               uint8x8_t *const s0, uint8x8_t *const s1,
+                               uint8x8_t *const s2, uint8x8_t *const s3) {
+  *s0 = vld1_u8(s);
+  s += p;
+  *s1 = vld1_u8(s);
+  s += p;
+  *s2 = vld1_u8(s);
+  s += p;
+  *s3 = vld1_u8(s);
+}
+
+static INLINE void store_u8_8x4(uint8_t *s, const ptrdiff_t p,
+                                const uint8x8_t s0, const uint8x8_t s1,
+                                const uint8x8_t s2, const uint8x8_t s3) {
+  vst1_u8(s, s0);
+  s += p;
+  vst1_u8(s, s1);
+  s += p;
+  vst1_u8(s, s2);
+  s += p;
+  vst1_u8(s, s3);
+}
+
+static INLINE void load_u8_16x3(const uint8_t *s, const ptrdiff_t p,
+                                uint8x16_t *const s0, uint8x16_t *const s1,
+                                uint8x16_t *const s2) {
+  *s0 = vld1q_u8(s);
+  s += p;
+  *s1 = vld1q_u8(s);
+  s += p;
+  *s2 = vld1q_u8(s);
+}
+
+static INLINE void load_u8_16x4(const uint8_t *s, const ptrdiff_t p,
+                                uint8x16_t *const s0, uint8x16_t *const s1,
+                                uint8x16_t *const s2, uint8x16_t *const s3) {
+  *s0 = vld1q_u8(s);
+  s += p;
+  *s1 = vld1q_u8(s);
+  s += p;
+  *s2 = vld1q_u8(s);
+  s += p;
+  *s3 = vld1q_u8(s);
+}
+
+static INLINE void store_u8_16x4(uint8_t *s, const ptrdiff_t p,
+                                 const uint8x16_t s0, const uint8x16_t s1,
+                                 const uint8x16_t s2, const uint8x16_t s3) {
+  vst1q_u8(s, s0);
+  s += p;
+  vst1q_u8(s, s1);
+  s += p;
+  vst1q_u8(s, s2);
+  s += p;
+  vst1q_u8(s, s3);
+}
+
+static INLINE void load_u8_8x7(const uint8_t *s, const ptrdiff_t p,
+                               uint8x8_t *const s0, uint8x8_t *const s1,
+                               uint8x8_t *const s2, uint8x8_t *const s3,
+                               uint8x8_t *const s4, uint8x8_t *const s5,
+                               uint8x8_t *const s6) {
+  *s0 = vld1_u8(s);
+  s += p;
+  *s1 = vld1_u8(s);
+  s += p;
+  *s2 = vld1_u8(s);
+  s += p;
+  *s3 = vld1_u8(s);
+  s += p;
+  *s4 = vld1_u8(s);
+  s += p;
+  *s5 = vld1_u8(s);
+  s += p;
+  *s6 = vld1_u8(s);
+}
+
+static INLINE void load_u8_8x8(const uint8_t *s, const ptrdiff_t p,
+                               uint8x8_t *const s0, uint8x8_t *const s1,
+                               uint8x8_t *const s2, uint8x8_t *const s3,
+                               uint8x8_t *const s4, uint8x8_t *const s5,
+                               uint8x8_t *const s6, uint8x8_t *const s7) {
+  *s0 = vld1_u8(s);
+  s += p;
+  *s1 = vld1_u8(s);
+  s += p;
+  *s2 = vld1_u8(s);
+  s += p;
+  *s3 = vld1_u8(s);
+  s += p;
+  *s4 = vld1_u8(s);
+  s += p;
+  *s5 = vld1_u8(s);
+  s += p;
+  *s6 = vld1_u8(s);
+  s += p;
+  *s7 = vld1_u8(s);
+}
+
+static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p,
+                                const uint8x8_t s0, const uint8x8_t s1,
+                                const uint8x8_t s2, const uint8x8_t s3,
+                                const uint8x8_t s4, const uint8x8_t s5,
+                                const uint8x8_t s6, const uint8x8_t s7) {
+  vst1_u8(s, s0);
+  s += p;
+  vst1_u8(s, s1);
+  s += p;
+  vst1_u8(s, s2);
+  s += p;
+  vst1_u8(s, s3);
+  s += p;
+  vst1_u8(s, s4);
+  s += p;
+  vst1_u8(s, s5);
+  s += p;
+  vst1_u8(s, s6);
+  s += p;
+  vst1_u8(s, s7);
+}
+
+static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p,
+                                uint8x16_t *const s0, uint8x16_t *const s1,
+                                uint8x16_t *const s2, uint8x16_t *const s3,
+                                uint8x16_t *const s4, uint8x16_t *const s5,
+                                uint8x16_t *const s6, uint8x16_t *const s7) {
+  *s0 = vld1q_u8(s);
+  s += p;
+  *s1 = vld1q_u8(s);
+  s += p;
+  *s2 = vld1q_u8(s);
+  s += p;
+  *s3 = vld1q_u8(s);
+  s += p;
+  *s4 = vld1q_u8(s);
+  s += p;
+  *s5 = vld1q_u8(s);
+  s += p;
+  *s6 = vld1q_u8(s);
+  s += p;
+  *s7 = vld1q_u8(s);
+}
+
+static INLINE void store_u8_16x8(uint8_t *s, const ptrdiff_t p,
+                                 const uint8x16_t s0, const uint8x16_t s1,
+                                 const uint8x16_t s2, const uint8x16_t s3,
+                                 const uint8x16_t s4, const uint8x16_t s5,
+                                 const uint8x16_t s6, const uint8x16_t s7) {
+  vst1q_u8(s, s0);
+  s += p;
+  vst1q_u8(s, s1);
+  s += p;
+  vst1q_u8(s, s2);
+  s += p;
+  vst1q_u8(s, s3);
+  s += p;
+  vst1q_u8(s, s4);
+  s += p;
+  vst1q_u8(s, s5);
+  s += p;
+  vst1q_u8(s, s6);
+  s += p;
+  vst1q_u8(s, s7);
+}
+
+static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p,
+                                uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2,
+                                uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5,
+                                uint16x8_t *s6, uint16x8_t *s7) {
+  *s0 = vld1q_u16(s);
+  s += p;
+  *s1 = vld1q_u16(s);
+  s += p;
+  *s2 = vld1q_u16(s);
+  s += p;
+  *s3 = vld1q_u16(s);
+  s += p;
+  *s4 = vld1q_u16(s);
+  s += p;
+  *s5 = vld1q_u16(s);
+  s += p;
+  *s6 = vld1q_u16(s);
+  s += p;
+  *s7 = vld1q_u16(s);
+}
+
 #endif  // VPX_VPX_DSP_ARM_MEM_NEON_H_
diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c
index adef5f6e15..e2351fa2cc 100644
--- a/vpx_dsp/arm/quantize_neon.c
+++ b/vpx_dsp/arm/quantize_neon.c
@@ -14,140 +14,126 @@
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/arm/mem_neon.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
 
 static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff,
                                                const int16x8_t dequant,
-                                               tran_low_t *dqcoeff) {
+                                               tran_low_t *dqcoeff_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
   const int32x4_t dqcoeff_0 =
       vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
   const int32x4_t dqcoeff_1 =
       vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
 
-#if CONFIG_VP9_HIGHBITDEPTH
-  vst1q_s32(dqcoeff, dqcoeff_0);
-  vst1q_s32(dqcoeff + 4, dqcoeff_1);
+  vst1q_s32(dqcoeff_ptr, dqcoeff_0);
+  vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1);
 #else
-  vst1q_s16(dqcoeff, vcombine_s16(vmovn_s32(dqcoeff_0), vmovn_s32(dqcoeff_1)));
+  vst1q_s16(dqcoeff_ptr, vmulq_s16(qcoeff, dequant));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 }
 
-void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                         int skip_block, const int16_t *zbin_ptr,
-                         const int16_t *round_ptr, const int16_t *quant_ptr,
-                         const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                         uint16_t *eob_ptr, const int16_t *scan,
-                         const int16_t *iscan) {
-  const int16x8_t one = vdupq_n_s16(1);
-  const int16x8_t neg_one = vdupq_n_s16(-1);
-  uint16x8_t eob_max;
-  (void)scan;
-  (void)skip_block;
-  assert(!skip_block);
+static INLINE int16x8_t
+quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+                tran_low_t *dqcoeff_ptr, const int16x8_t zbin,
+                const int16x8_t round, const int16x8_t quant,
+                const int16x8_t quant_shift, const int16x8_t dequant) {
+  // Load coeffs as 8 x 16-bit ints, take sign and abs values
+  const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
+  const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
+  const int16x8_t coeff_abs = vabsq_s16(coeff);
 
-  // Process first 8 values which include a dc component.
-  {
-    // Only the first element of each vector is DC.
-    const int16x8_t zbin = vld1q_s16(zbin_ptr);
-    const int16x8_t round = vld1q_s16(round_ptr);
-    const int16x8_t quant = vld1q_s16(quant_ptr);
-    const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr);
-    const int16x8_t dequant = vld1q_s16(dequant_ptr);
-    // Add one because the eob does not index from 0.
-    const uint16x8_t v_iscan =
-        vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
+  // Calculate mask of elements outside the bin
+  const int16x8_t zbin_mask = vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
+
+  // Get the rounded values
+  const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
 
-    const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
-    const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
-    const int16x8_t coeff_abs = vabsq_s16(coeff);
+  // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
+  int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
 
-    const int16x8_t zbin_mask =
-        vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
+  qcoeff = vaddq_s16(qcoeff, rounded);
 
-    const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
+  // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16
+  qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1);
 
-    // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
-    int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
+  // Restore the sign bit.
+  qcoeff = veorq_s16(qcoeff, coeff_sign);
+  qcoeff = vsubq_s16(qcoeff, coeff_sign);
 
-    qcoeff = vaddq_s16(qcoeff, rounded);
+  // Only keep the relevant coeffs
+  qcoeff = vandq_s16(qcoeff, zbin_mask);
+  store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
+
+  calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr);
+
+  return qcoeff;
+}
+
+void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                         const struct macroblock_plane *const mb_plane,
+                         tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                         const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                         const struct ScanOrder *const scan_order) {
+  const int16x8_t neg_one = vdupq_n_s16(-1);
+  uint16x8_t eob_max;
+  int16_t const *iscan = scan_order->iscan;
 
-    // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16
-    qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1);
+  // Only the first element of each vector is DC.
+  int16x8_t zbin = vld1q_s16(mb_plane->zbin);
+  int16x8_t round = vld1q_s16(mb_plane->round);
+  int16x8_t quant = vld1q_s16(mb_plane->quant);
+  int16x8_t quant_shift = vld1q_s16(mb_plane->quant_shift);
+  int16x8_t dequant = vld1q_s16(dequant_ptr);
 
-    // Restore the sign bit.
-    qcoeff = veorq_s16(qcoeff, coeff_sign);
-    qcoeff = vsubq_s16(qcoeff, coeff_sign);
+  // Process first 8 values which include a dc component.
+  {
+    const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
 
-    qcoeff = vandq_s16(qcoeff, zbin_mask);
+    const int16x8_t qcoeff =
+        quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, quant,
+                        quant_shift, dequant);
 
     // Set non-zero elements to -1 and use that to extract values for eob.
     eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
 
+    __builtin_prefetch(coeff_ptr + 64);
     coeff_ptr += 8;
     iscan += 8;
-
-    store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
     qcoeff_ptr += 8;
-
-    calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr);
     dqcoeff_ptr += 8;
   }
 
   n_coeffs -= 8;
 
   {
-    const int16x8_t zbin = vdupq_n_s16(zbin_ptr[1]);
-    const int16x8_t round = vdupq_n_s16(round_ptr[1]);
-    const int16x8_t quant = vdupq_n_s16(quant_ptr[1]);
-    const int16x8_t quant_shift = vdupq_n_s16(quant_shift_ptr[1]);
-    const int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]);
+    zbin = vdupq_lane_s16(vget_low_s16(zbin), 1);
+    round = vdupq_lane_s16(vget_low_s16(round), 1);
+    quant = vdupq_lane_s16(vget_low_s16(quant), 1);
+    quant_shift = vdupq_lane_s16(vget_low_s16(quant_shift), 1);
+    dequant = vdupq_lane_s16(vget_low_s16(dequant), 1);
 
     do {
-      // Add one because the eob is not its index.
-      const uint16x8_t v_iscan =
-          vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
-
-      const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
-      const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
-      const int16x8_t coeff_abs = vabsq_s16(coeff);
-
-      const int16x8_t zbin_mask =
-          vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
-
-      const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
-
-      // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
-      int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
+      const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
 
-      qcoeff = vaddq_s16(qcoeff, rounded);
-
-      // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16
-      qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1);
-
-      // Restore the sign bit.
-      qcoeff = veorq_s16(qcoeff, coeff_sign);
-      qcoeff = vsubq_s16(qcoeff, coeff_sign);
-
-      qcoeff = vandq_s16(qcoeff, zbin_mask);
+      const int16x8_t qcoeff =
+          quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
+                          quant, quant_shift, dequant);
 
       // Set non-zero elements to -1 and use that to extract values for eob.
       eob_max =
           vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
 
+      __builtin_prefetch(coeff_ptr + 64);
       coeff_ptr += 8;
       iscan += 8;
-
-      store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
       qcoeff_ptr += 8;
-
-      calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr);
       dqcoeff_ptr += 8;
-
       n_coeffs -= 8;
     } while (n_coeffs > 0);
   }
 
-#ifdef __aarch64__
+#if VPX_ARCH_AARCH64
   *eob_ptr = vmaxvq_u16(eob_max);
 #else
   {
@@ -157,7 +143,7 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
     vst1_lane_u16(eob_ptr, eob_max_2, 0);
   }
-#endif  // __aarch64__
+#endif  // VPX_ARCH_AARCH64
 }
 
 static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
@@ -166,7 +152,7 @@ static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
 
 static INLINE void calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff,
                                                      const int16x8_t dequant,
-                                                     tran_low_t *dqcoeff) {
+                                                     tran_low_t *dqcoeff_ptr) {
   int32x4_t dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
   int32x4_t dqcoeff_1 =
       vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
@@ -178,132 +164,115 @@ static INLINE void calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff,
 #if CONFIG_VP9_HIGHBITDEPTH
   dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1);
   dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1);
-  vst1q_s32(dqcoeff, dqcoeff_0);
-  vst1q_s32(dqcoeff + 4, dqcoeff_1);
+  vst1q_s32(dqcoeff_ptr, dqcoeff_0);
+  vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1);
 #else
-  vst1q_s16(dqcoeff,
+  vst1q_s16(dqcoeff_ptr,
             vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 }
 
+static INLINE int16x8_t
+quantize_b_32x32_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+                      tran_low_t *dqcoeff_ptr, const int16x8_t zbin,
+                      const int16x8_t round, const int16x8_t quant,
+                      const int16x8_t quant_shift, const int16x8_t dequant) {
+  // Load coeffs as 8 x 16-bit ints, take sign and abs values
+  const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
+  const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
+  const int16x8_t coeff_abs = vabsq_s16(coeff);
+
+  // Calculate mask of elements outside the bin
+  const int16x8_t zbin_mask = vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
+
+  // Get the rounded values
+  const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
+
+  // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
+  int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
+
+  qcoeff = vaddq_s16(qcoeff, rounded);
+
+  // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15
+  qcoeff = vqdmulhq_s16(qcoeff, quant_shift);
+
+  // Restore the sign bit.
+  qcoeff = veorq_s16(qcoeff, coeff_sign);
+  qcoeff = vsubq_s16(qcoeff, coeff_sign);
+
+  // Only keep the relevant coeffs
+  qcoeff = vandq_s16(qcoeff, zbin_mask);
+  store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
+
+  calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr);
+
+  return qcoeff;
+}
+
 // Main difference is that zbin values are halved before comparison and dqcoeff
 // values are divided by 2. zbin is rounded but dqcoeff is not.
-void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                               int skip_block, const int16_t *zbin_ptr,
-                               const int16_t *round_ptr,
-                               const int16_t *quant_ptr,
-                               const int16_t *quant_shift_ptr,
+void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr,
+                               const struct macroblock_plane *mb_plane,
                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                               const int16_t *scan, const int16_t *iscan) {
-  const int16x8_t one = vdupq_n_s16(1);
+                               const struct ScanOrder *const scan_order) {
   const int16x8_t neg_one = vdupq_n_s16(-1);
   uint16x8_t eob_max;
   int i;
-  (void)scan;
-  (void)n_coeffs;  // Because we will always calculate 32*32.
-  (void)skip_block;
-  assert(!skip_block);
+  const int16_t *iscan = scan_order->iscan;
+
+  // Only the first element of each vector is DC.
+  int16x8_t zbin = vrshrq_n_s16(vld1q_s16(mb_plane->zbin), 1);
+  int16x8_t round = vrshrq_n_s16(vld1q_s16(mb_plane->round), 1);
+  int16x8_t quant = vld1q_s16(mb_plane->quant);
+  int16x8_t quant_shift = vld1q_s16(mb_plane->quant_shift);
+  int16x8_t dequant = vld1q_s16(dequant_ptr);
 
   // Process first 8 values which include a dc component.
   {
-    // Only the first element of each vector is DC.
-    const int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1);
-    const int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1);
-    const int16x8_t quant = vld1q_s16(quant_ptr);
-    const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr);
-    const int16x8_t dequant = vld1q_s16(dequant_ptr);
-    // Add one because the eob does not index from 0.
-    const uint16x8_t v_iscan =
-        vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
-
-    const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
-    const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
-    const int16x8_t coeff_abs = vabsq_s16(coeff);
-
-    const int16x8_t zbin_mask =
-        vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
-
-    const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
-
-    // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
-    int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
+    const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
 
-    qcoeff = vaddq_s16(qcoeff, rounded);
-
-    // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15
-    qcoeff = vqdmulhq_s16(qcoeff, quant_shift);
-
-    // Restore the sign bit.
-    qcoeff = veorq_s16(qcoeff, coeff_sign);
-    qcoeff = vsubq_s16(qcoeff, coeff_sign);
-
-    qcoeff = vandq_s16(qcoeff, zbin_mask);
+    const int16x8_t qcoeff =
+        quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
+                              quant, quant_shift, dequant);
 
     // Set non-zero elements to -1 and use that to extract values for eob.
     eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
 
+    __builtin_prefetch(coeff_ptr + 64);
     coeff_ptr += 8;
     iscan += 8;
-
-    store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
     qcoeff_ptr += 8;
-
-    calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr);
     dqcoeff_ptr += 8;
   }
 
   {
-    const int16x8_t zbin = vrshrq_n_s16(vdupq_n_s16(zbin_ptr[1]), 1);
-    const int16x8_t round = vrshrq_n_s16(vdupq_n_s16(round_ptr[1]), 1);
-    const int16x8_t quant = vdupq_n_s16(quant_ptr[1]);
-    const int16x8_t quant_shift = vdupq_n_s16(quant_shift_ptr[1]);
-    const int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]);
+    zbin = vdupq_lane_s16(vget_low_s16(zbin), 1);
+    round = vdupq_lane_s16(vget_low_s16(round), 1);
+    quant = vdupq_lane_s16(vget_low_s16(quant), 1);
+    quant_shift = vdupq_lane_s16(vget_low_s16(quant_shift), 1);
+    dequant = vdupq_lane_s16(vget_low_s16(dequant), 1);
 
     for (i = 1; i < 32 * 32 / 8; ++i) {
-      // Add one because the eob is not its index.
-      const uint16x8_t v_iscan =
-          vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
-
-      const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
-      const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
-      const int16x8_t coeff_abs = vabsq_s16(coeff);
-
-      const int16x8_t zbin_mask =
-          vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
+      const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
 
-      const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
-
-      // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
-      int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
-
-      qcoeff = vaddq_s16(qcoeff, rounded);
-
-      // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15
-      qcoeff = vqdmulhq_s16(qcoeff, quant_shift);
-
-      // Restore the sign bit.
-      qcoeff = veorq_s16(qcoeff, coeff_sign);
-      qcoeff = vsubq_s16(qcoeff, coeff_sign);
-
-      qcoeff = vandq_s16(qcoeff, zbin_mask);
+      const int16x8_t qcoeff =
+          quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
+                                quant, quant_shift, dequant);
 
       // Set non-zero elements to -1 and use that to extract values for eob.
       eob_max =
           vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
 
+      __builtin_prefetch(coeff_ptr + 64);
       coeff_ptr += 8;
       iscan += 8;
-
-      store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
       qcoeff_ptr += 8;
-
-      calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr);
       dqcoeff_ptr += 8;
     }
   }
 
-#ifdef __aarch64__
+#if VPX_ARCH_AARCH64
   *eob_ptr = vmaxvq_u16(eob_max);
 #else
   {
@@ -313,5 +282,5 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
     vst1_lane_u16(eob_ptr, eob_max_2, 0);
   }
-#endif  // __aarch64__
+#endif  // VPX_ARCH_AARCH64
 }
diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c
index 06443c6995..713eec7a92 100644
--- a/vpx_dsp/arm/sad4d_neon.c
+++ b/vpx_dsp/arm/sad4d_neon.c
@@ -17,364 +17,212 @@
 #include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/sum_neon.h"
 
-static INLINE uint8x8_t load_unaligned_2_buffers(const void *const buf0,
-                                                 const void *const buf1) {
-  uint32_t a;
-  uint32x2_t aa = vdup_n_u32(0);
-  memcpy(&a, buf0, 4);
-  aa = vset_lane_u32(a, aa, 0);
-  memcpy(&a, buf1, 4);
-  aa = vset_lane_u32(a, aa, 1);
-  return vreinterpret_u8_u32(aa);
-}
-
-static INLINE void sad4x_4d(const uint8_t *const src_ptr, const int src_stride,
-                            const uint8_t *const ref_array[4],
-                            const int ref_stride, const int height,
-                            uint32_t *const res) {
-  int i;
-  uint16x8_t abs[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
-  uint16x4_t a[2];
-  uint32x4_t r;
-
-  assert(!((intptr_t)src_ptr % sizeof(uint32_t)));
-  assert(!(src_stride % sizeof(uint32_t)));
-
-  for (i = 0; i < height; ++i) {
-    const uint8x8_t s = vreinterpret_u8_u32(
-        vld1_dup_u32((const uint32_t *)(src_ptr + i * src_stride)));
-    const uint8x8_t ref01 = load_unaligned_2_buffers(
-        ref_array[0] + i * ref_stride, ref_array[1] + i * ref_stride);
-    const uint8x8_t ref23 = load_unaligned_2_buffers(
-        ref_array[2] + i * ref_stride, ref_array[3] + i * ref_stride);
-    abs[0] = vabal_u8(abs[0], s, ref01);
-    abs[1] = vabal_u8(abs[1], s, ref23);
-  }
-
-  a[0] = vpadd_u16(vget_low_u16(abs[0]), vget_high_u16(abs[0]));
-  a[1] = vpadd_u16(vget_low_u16(abs[1]), vget_high_u16(abs[1]));
-  r = vpaddlq_u16(vcombine_u16(a[0], a[1]));
-  vst1q_u32(res, r);
-}
-
-void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride,
-                        const uint8_t *const ref_array[4], int ref_stride,
-                        uint32_t *res) {
-  sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 4, res);
-}
-
-void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride,
-                        const uint8_t *const ref_array[4], int ref_stride,
-                        uint32_t *res) {
-  sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 8, res);
-}
+static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
+                              uint16x8_t *const sad_sum) {
+  uint8x16_t abs_diff = vabdq_u8(src, ref);
+  *sad_sum = vpadalq_u8(*sad_sum, abs_diff);
+}
+
+static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride,
+                                   const uint8_t *const ref[4], int ref_stride,
+                                   uint32_t res[4], int h) {
+  uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                           vdupq_n_u16(0) };
+  uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                           vdupq_n_u16(0) };
+
+  int i = 0;
+  do {
+    uint8x16_t s0, s1, s2, s3;
+
+    s0 = vld1q_u8(src + i * src_stride);
+    sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
+    sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
+    sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
+    sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
+
+    s1 = vld1q_u8(src + i * src_stride + 16);
+    sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
+    sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
+    sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
+    sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
+
+    s2 = vld1q_u8(src + i * src_stride + 32);
+    sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]);
+    sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]);
+    sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]);
+    sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]);
+
+    s3 = vld1q_u8(src + i * src_stride + 48);
+    sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]);
+    sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]);
+    sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]);
+    sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]);
+
+    i++;
+  } while (i < h);
+
+  vst1q_u32(res, horizontal_long_add_4d_uint16x8(sum_lo, sum_hi));
+}
+
+static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride,
+                                   const uint8_t *const ref[4], int ref_stride,
+                                   uint32_t res[4], int h) {
+  uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                           vdupq_n_u16(0) };
+  uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                           vdupq_n_u16(0) };
+
+  int i = 0;
+  do {
+    uint8x16_t s0, s1;
+
+    s0 = vld1q_u8(src + i * src_stride);
+    sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
+    sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
+    sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
+    sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
+
+    s1 = vld1q_u8(src + i * src_stride + 16);
+    sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
+    sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
+    sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
+    sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
+
+    i++;
+  } while (i < h);
+
+  vst1q_u32(res, horizontal_long_add_4d_uint16x8(sum_lo, sum_hi));
+}
+
+static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride,
+                                   const uint8_t *const ref[4], int ref_stride,
+                                   uint32_t res[4], int h) {
+  uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                        vdupq_n_u16(0) };
 
-////////////////////////////////////////////////////////////////////////////////
-
-// Can handle 512 pixels' sad sum (such as 16x32 or 32x16)
-static INLINE void sad_512_pel_final_neon(const uint16x8_t *sum /*[4]*/,
-                                          uint32_t *const res) {
-  const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));
-  const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
-  const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));
-  const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3]));
-  const uint16x4_t b0 = vpadd_u16(a0, a1);
-  const uint16x4_t b1 = vpadd_u16(a2, a3);
-  const uint32x4_t r = vpaddlq_u16(vcombine_u16(b0, b1));
-  vst1q_u32(res, r);
-}
+  int i = 0;
+  do {
+    const uint8x16_t s = vld1q_u8(src + i * src_stride);
+    sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum[0]);
+    sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum[1]);
+    sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum[2]);
+    sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum[3]);
 
-// Can handle 1024 pixels' sad sum (such as 32x32)
-static INLINE void sad_1024_pel_final_neon(const uint16x8_t *sum /*[4]*/,
-                                           uint32_t *const res) {
-  const uint16x4_t a0 = vpadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));
-  const uint16x4_t a1 = vpadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
-  const uint16x4_t a2 = vpadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));
-  const uint16x4_t a3 = vpadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3]));
-  const uint32x4_t b0 = vpaddlq_u16(vcombine_u16(a0, a1));
-  const uint32x4_t b1 = vpaddlq_u16(vcombine_u16(a2, a3));
-  const uint32x2_t c0 = vpadd_u32(vget_low_u32(b0), vget_high_u32(b0));
-  const uint32x2_t c1 = vpadd_u32(vget_low_u32(b1), vget_high_u32(b1));
-  vst1q_u32(res, vcombine_u32(c0, c1));
-}
+    i++;
+  } while (i < h);
 
-// Can handle 2048 pixels' sad sum (such as 32x64 or 64x32)
-static INLINE void sad_2048_pel_final_neon(const uint16x8_t *sum /*[4]*/,
-                                           uint32_t *const res) {
-  const uint32x4_t a0 = vpaddlq_u16(sum[0]);
-  const uint32x4_t a1 = vpaddlq_u16(sum[1]);
-  const uint32x4_t a2 = vpaddlq_u16(sum[2]);
-  const uint32x4_t a3 = vpaddlq_u16(sum[3]);
-  const uint32x2_t b0 = vadd_u32(vget_low_u32(a0), vget_high_u32(a0));
-  const uint32x2_t b1 = vadd_u32(vget_low_u32(a1), vget_high_u32(a1));
-  const uint32x2_t b2 = vadd_u32(vget_low_u32(a2), vget_high_u32(a2));
-  const uint32x2_t b3 = vadd_u32(vget_low_u32(a3), vget_high_u32(a3));
-  const uint32x2_t c0 = vpadd_u32(b0, b1);
-  const uint32x2_t c1 = vpadd_u32(b2, b3);
-  vst1q_u32(res, vcombine_u32(c0, c1));
+  vst1q_u32(res, horizontal_add_4d_uint16x8(sum));
 }
 
-// Can handle 4096 pixels' sad sum (such as 64x64)
-static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/,
-                                           uint32_t *const res) {
-  const uint32x4_t a0 = vpaddlq_u16(sum[0]);
-  const uint32x4_t a1 = vpaddlq_u16(sum[1]);
-  const uint32x4_t a2 = vpaddlq_u16(sum[2]);
-  const uint32x4_t a3 = vpaddlq_u16(sum[3]);
-  const uint32x4_t a4 = vpaddlq_u16(sum[4]);
-  const uint32x4_t a5 = vpaddlq_u16(sum[5]);
-  const uint32x4_t a6 = vpaddlq_u16(sum[6]);
-  const uint32x4_t a7 = vpaddlq_u16(sum[7]);
-  const uint32x4_t b0 = vaddq_u32(a0, a1);
-  const uint32x4_t b1 = vaddq_u32(a2, a3);
-  const uint32x4_t b2 = vaddq_u32(a4, a5);
-  const uint32x4_t b3 = vaddq_u32(a6, a7);
-  const uint32x2_t c0 = vadd_u32(vget_low_u32(b0), vget_high_u32(b0));
-  const uint32x2_t c1 = vadd_u32(vget_low_u32(b1), vget_high_u32(b1));
-  const uint32x2_t c2 = vadd_u32(vget_low_u32(b2), vget_high_u32(b2));
-  const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3));
-  const uint32x2_t d0 = vpadd_u32(c0, c1);
-  const uint32x2_t d1 = vpadd_u32(c2, c3);
-  vst1q_u32(res, vcombine_u32(d0, d1));
+static INLINE void sad8_neon(uint8x8_t src, uint8x8_t ref,
+                             uint16x8_t *const sad_sum) {
+  uint8x8_t abs_diff = vabd_u8(src, ref);
+  *sad_sum = vaddw_u8(*sad_sum, abs_diff);
 }
 
-static INLINE void sad8x_4d(const uint8_t *src_ptr, int src_stride,
-                            const uint8_t *const ref_array[4], int ref_stride,
-                            uint32_t *res, const int height) {
-  int i, j;
-  const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
-                                 ref_array[3] };
+static INLINE void sad8xhx4d_neon(const uint8_t *src, int src_stride,
+                                  const uint8_t *const ref[4], int ref_stride,
+                                  uint32_t res[4], int h) {
   uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                         vdupq_n_u16(0) };
 
-  for (i = 0; i < height; ++i) {
-    const uint8x8_t s = vld1_u8(src_ptr);
-    src_ptr += src_stride;
-    for (j = 0; j < 4; ++j) {
-      const uint8x8_t b_u8 = vld1_u8(ref_loop[j]);
-      ref_loop[j] += ref_stride;
-      sum[j] = vabal_u8(sum[j], s, b_u8);
-    }
-  }
-
-  sad_512_pel_final_neon(sum, res);
-}
-
-void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride,
-                        const uint8_t *const ref_array[4], int ref_stride,
-                        uint32_t *res) {
-  sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 4);
-}
-
-void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride,
-                        const uint8_t *const ref_array[4], int ref_stride,
-                        uint32_t *res) {
-  sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 8);
-}
-
-void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride,
-                         const uint8_t *const ref_array[4], int ref_stride,
-                         uint32_t *res) {
-  sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 16);
-}
+  int i = 0;
+  do {
+    const uint8x8_t s = vld1_u8(src + i * src_stride);
+    sad8_neon(s, vld1_u8(ref[0] + i * ref_stride), &sum[0]);
+    sad8_neon(s, vld1_u8(ref[1] + i * ref_stride), &sum[1]);
+    sad8_neon(s, vld1_u8(ref[2] + i * ref_stride), &sum[2]);
+    sad8_neon(s, vld1_u8(ref[3] + i * ref_stride), &sum[3]);
 
-////////////////////////////////////////////////////////////////////////////////
+    i++;
+  } while (i < h);
 
-static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr,
-                              uint16x8_t *const sum) {
-  const uint8x16_t r = vld1q_u8(ref_ptr);
-  *sum = vabal_u8(*sum, vget_low_u8(src_ptr), vget_low_u8(r));
-  *sum = vabal_u8(*sum, vget_high_u8(src_ptr), vget_high_u8(r));
+  vst1q_u32(res, horizontal_add_4d_uint16x8(sum));
 }
 
-static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride,
-                             const uint8_t *const ref_array[4], int ref_stride,
-                             uint32_t *res, const int height) {
-  int i, j;
-  const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
-                                 ref_array[3] };
+static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride,
+                                  const uint8_t *const ref[4], int ref_stride,
+                                  uint32_t res[4], int h) {
   uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                         vdupq_n_u16(0) };
 
-  for (i = 0; i < height; ++i) {
-    const uint8x16_t s = vld1q_u8(src_ptr);
-    src_ptr += src_stride;
-    for (j = 0; j < 4; ++j) {
-      sad16_neon(ref_loop[j], s, &sum[j]);
-      ref_loop[j] += ref_stride;
-    }
-  }
-
-  sad_512_pel_final_neon(sum, res);
-}
+  int i = 0;
+  do {
+    uint8x8_t s = load_unaligned_u8(src + i * src_stride, src_stride);
+    uint8x8_t r0 = load_unaligned_u8(ref[0] + i * ref_stride, ref_stride);
+    uint8x8_t r1 = load_unaligned_u8(ref[1] + i * ref_stride, ref_stride);
+    uint8x8_t r2 = load_unaligned_u8(ref[2] + i * ref_stride, ref_stride);
+    uint8x8_t r3 = load_unaligned_u8(ref[3] + i * ref_stride, ref_stride);
 
-void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride,
-                         const uint8_t *const ref_array[4], int ref_stride,
-                         uint32_t *res) {
-  sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 8);
-}
+    sad8_neon(s, r0, &sum[0]);
+    sad8_neon(s, r1, &sum[1]);
+    sad8_neon(s, r2, &sum[2]);
+    sad8_neon(s, r3, &sum[3]);
 
-void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride,
-                          const uint8_t *const ref_array[4], int ref_stride,
-                          uint32_t *res) {
-  sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 16);
-}
+    i += 2;
+  } while (i < h);
 
-void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride,
-                          const uint8_t *const ref_array[4], int ref_stride,
-                          uint32_t *res) {
-  sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 32);
+  vst1q_u32(res, horizontal_add_4d_uint16x8(sum));
 }
 
-////////////////////////////////////////////////////////////////////////////////
-
-static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride,
-                             const uint8_t *const ref_array[4], int ref_stride,
-                             const int height, uint16x8_t *const sum) {
-  int i;
-  const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
-                                 ref_array[3] };
-
-  sum[0] = sum[1] = sum[2] = sum[3] = vdupq_n_u16(0);
-
-  for (i = 0; i < height; ++i) {
-    uint8x16_t s;
-
-    s = vld1q_u8(src_ptr + 0 * 16);
-    sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
-    sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]);
-    sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]);
-    sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]);
-
-    s = vld1q_u8(src_ptr + 1 * 16);
-    sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
-    sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]);
-    sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]);
-    sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]);
-
-    src_ptr += src_stride;
-    ref_loop[0] += ref_stride;
-    ref_loop[1] += ref_stride;
-    ref_loop[2] += ref_stride;
-    ref_loop[3] += ref_stride;
+#define SAD_WXH_4D_NEON(w, h)                                                 \
+  void vpx_sad##w##x##h##x4d_neon(const uint8_t *src_ptr, int src_stride,     \
+                                  const uint8_t *const ref_array[4],          \
+                                  int ref_stride, uint32_t sad_array[4]) {    \
+    sad##w##xhx4d_neon(src_ptr, src_stride, ref_array, ref_stride, sad_array, \
+                       (h));                                                  \
   }
-}
-
-void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride,
-                          const uint8_t *const ref_array[4], int ref_stride,
-                          uint32_t *res) {
-  uint16x8_t sum[4];
-  sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 16, sum);
-  sad_512_pel_final_neon(sum, res);
-}
 
-void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride,
-                          const uint8_t *const ref_array[4], int ref_stride,
-                          uint32_t *res) {
-  uint16x8_t sum[4];
-  sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 32, sum);
-  sad_1024_pel_final_neon(sum, res);
-}
-
-void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride,
-                          const uint8_t *const ref_array[4], int ref_stride,
-                          uint32_t *res) {
-  uint16x8_t sum[4];
-  sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 64, sum);
-  sad_2048_pel_final_neon(sum, res);
-}
+SAD_WXH_4D_NEON(4, 4)
+SAD_WXH_4D_NEON(4, 8)
+
+SAD_WXH_4D_NEON(8, 4)
+SAD_WXH_4D_NEON(8, 8)
+SAD_WXH_4D_NEON(8, 16)
+
+SAD_WXH_4D_NEON(16, 8)
+SAD_WXH_4D_NEON(16, 16)
+SAD_WXH_4D_NEON(16, 32)
+
+SAD_WXH_4D_NEON(32, 16)
+SAD_WXH_4D_NEON(32, 32)
+SAD_WXH_4D_NEON(32, 64)
+
+SAD_WXH_4D_NEON(64, 32)
+SAD_WXH_4D_NEON(64, 64)
+
+#undef SAD_WXH_4D_NEON
+
+#define SAD_SKIP_WXH_4D_NEON(w, h)                                         \
+  void vpx_sad_skip_##w##x##h##x4d_neon(                                   \
+      const uint8_t *src_ptr, int src_stride,                              \
+      const uint8_t *const ref_array[4], int ref_stride,                   \
+      uint32_t sad_array[4]) {                                             \
+    sad##w##xhx4d_neon(src_ptr, 2 * src_stride, ref_array, 2 * ref_stride, \
+                       sad_array, ((h) >> 1));                             \
+    sad_array[0] <<= 1;                                                    \
+    sad_array[1] <<= 1;                                                    \
+    sad_array[2] <<= 1;                                                    \
+    sad_array[3] <<= 1;                                                    \
+  }
 
-////////////////////////////////////////////////////////////////////////////////
+SAD_SKIP_WXH_4D_NEON(4, 4)
+SAD_SKIP_WXH_4D_NEON(4, 8)
 
-void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride,
-                          const uint8_t *const ref_array[4], int ref_stride,
-                          uint32_t *res) {
-  int i;
-  const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
-                                 ref_array[3] };
-  uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
-                        vdupq_n_u16(0) };
+SAD_SKIP_WXH_4D_NEON(8, 4)
+SAD_SKIP_WXH_4D_NEON(8, 8)
+SAD_SKIP_WXH_4D_NEON(8, 16)
 
-  for (i = 0; i < 32; ++i) {
-    uint8x16_t s;
-
-    s = vld1q_u8(src_ptr + 0 * 16);
-    sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
-    sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]);
-    sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]);
-    sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]);
-
-    s = vld1q_u8(src_ptr + 1 * 16);
-    sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
-    sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]);
-    sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]);
-    sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]);
-
-    s = vld1q_u8(src_ptr + 2 * 16);
-    sad16_neon(ref_loop[0] + 2 * 16, s, &sum[0]);
-    sad16_neon(ref_loop[1] + 2 * 16, s, &sum[1]);
-    sad16_neon(ref_loop[2] + 2 * 16, s, &sum[2]);
-    sad16_neon(ref_loop[3] + 2 * 16, s, &sum[3]);
-
-    s = vld1q_u8(src_ptr + 3 * 16);
-    sad16_neon(ref_loop[0] + 3 * 16, s, &sum[0]);
-    sad16_neon(ref_loop[1] + 3 * 16, s, &sum[1]);
-    sad16_neon(ref_loop[2] + 3 * 16, s, &sum[2]);
-    sad16_neon(ref_loop[3] + 3 * 16, s, &sum[3]);
-
-    src_ptr += src_stride;
-    ref_loop[0] += ref_stride;
-    ref_loop[1] += ref_stride;
-    ref_loop[2] += ref_stride;
-    ref_loop[3] += ref_stride;
-  }
+SAD_SKIP_WXH_4D_NEON(16, 8)
+SAD_SKIP_WXH_4D_NEON(16, 16)
+SAD_SKIP_WXH_4D_NEON(16, 32)
 
-  sad_2048_pel_final_neon(sum, res);
-}
+SAD_SKIP_WXH_4D_NEON(32, 16)
+SAD_SKIP_WXH_4D_NEON(32, 32)
+SAD_SKIP_WXH_4D_NEON(32, 64)
 
-void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride,
-                          const uint8_t *const ref_array[4], int ref_stride,
-                          uint32_t *res) {
-  int i;
-  const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
-                                 ref_array[3] };
-  uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
-                        vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
-                        vdupq_n_u16(0), vdupq_n_u16(0) };
-
-  for (i = 0; i < 64; ++i) {
-    uint8x16_t s;
-
-    s = vld1q_u8(src_ptr + 0 * 16);
-    sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
-    sad16_neon(ref_loop[1] + 0 * 16, s, &sum[2]);
-    sad16_neon(ref_loop[2] + 0 * 16, s, &sum[4]);
-    sad16_neon(ref_loop[3] + 0 * 16, s, &sum[6]);
-
-    s = vld1q_u8(src_ptr + 1 * 16);
-    sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
-    sad16_neon(ref_loop[1] + 1 * 16, s, &sum[2]);
-    sad16_neon(ref_loop[2] + 1 * 16, s, &sum[4]);
-    sad16_neon(ref_loop[3] + 1 * 16, s, &sum[6]);
-
-    s = vld1q_u8(src_ptr + 2 * 16);
-    sad16_neon(ref_loop[0] + 2 * 16, s, &sum[1]);
-    sad16_neon(ref_loop[1] + 2 * 16, s, &sum[3]);
-    sad16_neon(ref_loop[2] + 2 * 16, s, &sum[5]);
-    sad16_neon(ref_loop[3] + 2 * 16, s, &sum[7]);
-
-    s = vld1q_u8(src_ptr + 3 * 16);
-    sad16_neon(ref_loop[0] + 3 * 16, s, &sum[1]);
-    sad16_neon(ref_loop[1] + 3 * 16, s, &sum[3]);
-    sad16_neon(ref_loop[2] + 3 * 16, s, &sum[5]);
-    sad16_neon(ref_loop[3] + 3 * 16, s, &sum[7]);
-
-    src_ptr += src_stride;
-    ref_loop[0] += ref_stride;
-    ref_loop[1] += ref_stride;
-    ref_loop[2] += ref_stride;
-    ref_loop[3] += ref_stride;
-  }
+SAD_SKIP_WXH_4D_NEON(64, 32)
+SAD_SKIP_WXH_4D_NEON(64, 64)
 
-  sad_4096_pel_final_neon(sum, res);
-}
+#undef SAD_SKIP_WXH_4D_NEON
diff --git a/vpx_dsp/arm/sad4d_neon_dotprod.c b/vpx_dsp/arm/sad4d_neon_dotprod.c
new file mode 100644
index 0000000000..933fc48b8c
--- /dev/null
+++ b/vpx_dsp/arm/sad4d_neon_dotprod.c
@@ -0,0 +1,176 @@
+/*
+ *  Copyright (c) 2021 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
+                              uint32x4_t *const sad_sum) {
+  uint8x16_t abs_diff = vabdq_u8(src, ref);
+  *sad_sum = vdotq_u32(*sad_sum, abs_diff, vdupq_n_u8(1));
+}
+
+static INLINE void sad64xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
+                                           const uint8_t *const ref[4],
+                                           int ref_stride, uint32_t res[4],
+                                           int h) {
+  uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+  uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+  uint32x4_t sum[4];
+
+  int i = 0;
+  do {
+    uint8x16_t s0, s1, s2, s3;
+
+    s0 = vld1q_u8(src + i * src_stride);
+    sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
+    sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
+    sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
+    sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
+
+    s1 = vld1q_u8(src + i * src_stride + 16);
+    sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
+    sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
+    sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
+    sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
+
+    s2 = vld1q_u8(src + i * src_stride + 32);
+    sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]);
+    sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]);
+    sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]);
+    sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]);
+
+    s3 = vld1q_u8(src + i * src_stride + 48);
+    sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]);
+    sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]);
+    sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]);
+    sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]);
+
+  } while (++i < h);
+
+  sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+  sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+  sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+  sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
+
+  vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
+}
+
+static INLINE void sad32xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
+                                           const uint8_t *const ref[4],
+                                           int ref_stride, uint32_t res[4],
+                                           int h) {
+  uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+  uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+  uint32x4_t sum[4];
+
+  int i = 0;
+  do {
+    uint8x16_t s0, s1;
+
+    s0 = vld1q_u8(src + i * src_stride);
+    sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
+    sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
+    sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
+    sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
+
+    s1 = vld1q_u8(src + i * src_stride + 16);
+    sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
+    sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
+    sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
+    sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
+
+  } while (++i < h);
+
+  sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+  sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+  sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+  sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
+
+  vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
+}
+
+static INLINE void sad16xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
+                                           const uint8_t *const ref[4],
+                                           int ref_stride, uint32_t res[4],
+                                           int h) {
+  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                        vdupq_n_u32(0) };
+
+  int i = 0;
+  do {
+    const uint8x16_t s = vld1q_u8(src + i * src_stride);
+    sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum[0]);
+    sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum[1]);
+    sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum[2]);
+    sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum[3]);
+
+  } while (++i < h);
+
+  vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
+}
+
+#define SAD_WXH_4D_NEON_DOTPROD(w, h)                                      \
+  void vpx_sad##w##x##h##x4d_neon_dotprod(                                 \
+      const uint8_t *src_ptr, int src_stride,                              \
+      const uint8_t *const ref_array[4], int ref_stride,                   \
+      uint32_t sad_array[4]) {                                             \
+    sad##w##xhx4d_neon_dotprod(src_ptr, src_stride, ref_array, ref_stride, \
+                               sad_array, (h));                            \
+  }
+
+SAD_WXH_4D_NEON_DOTPROD(16, 8)
+SAD_WXH_4D_NEON_DOTPROD(16, 16)
+SAD_WXH_4D_NEON_DOTPROD(16, 32)
+
+SAD_WXH_4D_NEON_DOTPROD(32, 16)
+SAD_WXH_4D_NEON_DOTPROD(32, 32)
+SAD_WXH_4D_NEON_DOTPROD(32, 64)
+
+SAD_WXH_4D_NEON_DOTPROD(64, 32)
+SAD_WXH_4D_NEON_DOTPROD(64, 64)
+
+#undef SAD_WXH_4D_NEON_DOTPROD
+
+#define SAD_SKIP_WXH_4D_NEON_DOTPROD(w, h)                             \
+  void vpx_sad_skip_##w##x##h##x4d_neon_dotprod(                       \
+      const uint8_t *src_ptr, int src_stride,                          \
+      const uint8_t *const ref_array[4], int ref_stride,               \
+      uint32_t sad_array[4]) {                                         \
+    sad##w##xhx4d_neon_dotprod(src_ptr, 2 * src_stride, ref_array,     \
+                               2 * ref_stride, sad_array, ((h) >> 1)); \
+    sad_array[0] <<= 1;                                                \
+    sad_array[1] <<= 1;                                                \
+    sad_array[2] <<= 1;                                                \
+    sad_array[3] <<= 1;                                                \
+  }
+
+SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 8)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 16)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 32)
+
+SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 16)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 32)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 64)
+
+SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 32)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 64)
+
+#undef SAD_SKIP_WXH_4D_NEON_DOTPROD
diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c
index c4a49e366d..4dd87ddc0f 100644
--- a/vpx_dsp/arm/sad_neon.c
+++ b/vpx_dsp/arm/sad_neon.c
@@ -17,337 +17,375 @@
 #include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/sum_neon.h"
 
-uint32_t vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride,
-                         const uint8_t *ref_ptr, int ref_stride) {
-  const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride);
-  const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
-  uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8));
-  abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8));
-  return vget_lane_u32(horizontal_add_uint16x8(abs), 0);
-}
+static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride,
+                                        const uint8_t *ref_ptr, int ref_stride,
+                                        int h) {
+  uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                        vdupq_n_u16(0) };
+  uint32x4_t sum_u32;
+
+  int i = h;
+  do {
+    uint8x16_t s0, s1, s2, s3, r0, r1, r2, r3;
+    uint8x16_t diff0, diff1, diff2, diff3;
+
+    s0 = vld1q_u8(src_ptr);
+    r0 = vld1q_u8(ref_ptr);
+    diff0 = vabdq_u8(s0, r0);
+    sum[0] = vpadalq_u8(sum[0], diff0);
+
+    s1 = vld1q_u8(src_ptr + 16);
+    r1 = vld1q_u8(ref_ptr + 16);
+    diff1 = vabdq_u8(s1, r1);
+    sum[1] = vpadalq_u8(sum[1], diff1);
+
+    s2 = vld1q_u8(src_ptr + 32);
+    r2 = vld1q_u8(ref_ptr + 32);
+    diff2 = vabdq_u8(s2, r2);
+    sum[2] = vpadalq_u8(sum[2], diff2);
+
+    s3 = vld1q_u8(src_ptr + 48);
+    r3 = vld1q_u8(ref_ptr + 48);
+    diff3 = vabdq_u8(s3, r3);
+    sum[3] = vpadalq_u8(sum[3], diff3);
 
-uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride,
-                             const uint8_t *ref_ptr, int ref_stride,
-                             const uint8_t *second_pred) {
-  const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride);
-  const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
-  const uint8x16_t second_pred_u8 = vld1q_u8(second_pred);
-  const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8);
-  uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(avg));
-  abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg));
-  return vget_lane_u32(horizontal_add_uint16x8(abs), 0);
-}
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
 
-uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride,
-                         const uint8_t *ref_ptr, int ref_stride) {
-  int i;
-  uint16x8_t abs = vdupq_n_u16(0);
-  for (i = 0; i < 8; i += 4) {
-    const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride);
-    const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
-    src_ptr += 4 * src_stride;
-    ref_ptr += 4 * ref_stride;
-    abs = vabal_u8(abs, vget_low_u8(src_u8), vget_low_u8(ref_u8));
-    abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8));
-  }
+  sum_u32 = vpaddlq_u16(sum[0]);
+  sum_u32 = vpadalq_u16(sum_u32, sum[1]);
+  sum_u32 = vpadalq_u16(sum_u32, sum[2]);
+  sum_u32 = vpadalq_u16(sum_u32, sum[3]);
 
-  return vget_lane_u32(horizontal_add_uint16x8(abs), 0);
+  return horizontal_add_uint32x4(sum_u32);
 }
 
-uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride,
-                             const uint8_t *ref_ptr, int ref_stride,
-                             const uint8_t *second_pred) {
-  int i;
-  uint16x8_t abs = vdupq_n_u16(0);
-  for (i = 0; i < 8; i += 4) {
-    const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride);
-    const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
-    const uint8x16_t second_pred_u8 = vld1q_u8(second_pred);
-    const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8);
-    src_ptr += 4 * src_stride;
-    ref_ptr += 4 * ref_stride;
-    second_pred += 16;
-    abs = vabal_u8(abs, vget_low_u8(src_u8), vget_low_u8(avg));
-    abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg));
-  }
+static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride,
+                                        const uint8_t *ref_ptr, int ref_stride,
+                                        int h) {
+  uint32x4_t sum = vdupq_n_u32(0);
 
-  return vget_lane_u32(horizontal_add_uint16x8(abs), 0);
-}
+  int i = h;
+  do {
+    uint8x16_t s0 = vld1q_u8(src_ptr);
+    uint8x16_t r0 = vld1q_u8(ref_ptr);
+    uint8x16_t diff0 = vabdq_u8(s0, r0);
+    uint16x8_t sum0 = vpaddlq_u8(diff0);
+
+    uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+    uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+    uint8x16_t diff1 = vabdq_u8(s1, r1);
+    uint16x8_t sum1 = vpaddlq_u8(diff1);
 
-static INLINE uint16x8_t sad8x(const uint8_t *src_ptr, int src_stride,
-                               const uint8_t *ref_ptr, int ref_stride,
-                               const int height) {
-  int i;
-  uint16x8_t abs = vdupq_n_u16(0);
+    sum = vpadalq_u16(sum, sum0);
+    sum = vpadalq_u16(sum, sum1);
 
-  for (i = 0; i < height; ++i) {
-    const uint8x8_t a_u8 = vld1_u8(src_ptr);
-    const uint8x8_t b_u8 = vld1_u8(ref_ptr);
     src_ptr += src_stride;
     ref_ptr += ref_stride;
-    abs = vabal_u8(abs, a_u8, b_u8);
-  }
-  return abs;
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(sum);
 }
 
-static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride,
-                                   const uint8_t *ref_ptr, int ref_stride,
-                                   const uint8_t *second_pred,
-                                   const int height) {
-  int i;
-  uint16x8_t abs = vdupq_n_u16(0);
-
-  for (i = 0; i < height; ++i) {
-    const uint8x8_t a_u8 = vld1_u8(src_ptr);
-    const uint8x8_t b_u8 = vld1_u8(ref_ptr);
-    const uint8x8_t c_u8 = vld1_u8(second_pred);
-    const uint8x8_t avg = vrhadd_u8(b_u8, c_u8);
+static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride,
+                                        const uint8_t *ref_ptr, int ref_stride,
+                                        int h) {
+  uint16x8_t sum = vdupq_n_u16(0);
+
+  int i = h;
+  do {
+    uint8x16_t s = vld1q_u8(src_ptr);
+    uint8x16_t r = vld1q_u8(ref_ptr);
+
+    uint8x16_t diff = vabdq_u8(s, r);
+    sum = vpadalq_u8(sum, diff);
+
     src_ptr += src_stride;
     ref_ptr += ref_stride;
-    second_pred += 8;
-    abs = vabal_u8(abs, a_u8, avg);
-  }
-  return abs;
+  } while (--i != 0);
+
+  return horizontal_add_uint16x8(sum);
 }
 
-#define sad8xN(n)                                                              \
-  uint32_t vpx_sad8x##n##_neon(const uint8_t *src_ptr, int src_stride,         \
-                               const uint8_t *ref_ptr, int ref_stride) {       \
-    const uint16x8_t abs = sad8x(src_ptr, src_stride, ref_ptr, ref_stride, n); \
-    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                     \
-  }                                                                            \
-                                                                               \
-  uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src_ptr, int src_stride,     \
-                                   const uint8_t *ref_ptr, int ref_stride,     \
-                                   const uint8_t *second_pred) {               \
-    const uint16x8_t abs =                                                     \
-        sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n);   \
-    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                     \
-  }
+static INLINE unsigned int sad8xh_neon(const uint8_t *src_ptr, int src_stride,
+                                       const uint8_t *ref_ptr, int ref_stride,
+                                       int h) {
+  uint16x8_t sum = vdupq_n_u16(0);
 
-sad8xN(4);
-sad8xN(8);
-sad8xN(16);
+  int i = h;
+  do {
+    uint8x8_t s = vld1_u8(src_ptr);
+    uint8x8_t r = vld1_u8(ref_ptr);
 
-static INLINE uint16x8_t sad16x(const uint8_t *src_ptr, int src_stride,
-                                const uint8_t *ref_ptr, int ref_stride,
-                                const int height) {
-  int i;
-  uint16x8_t abs = vdupq_n_u16(0);
+    sum = vabal_u8(sum, s, r);
 
-  for (i = 0; i < height; ++i) {
-    const uint8x16_t a_u8 = vld1q_u8(src_ptr);
-    const uint8x16_t b_u8 = vld1q_u8(ref_ptr);
     src_ptr += src_stride;
     ref_ptr += ref_stride;
-    abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(b_u8));
-    abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(b_u8));
-  }
-  return abs;
+  } while (--i != 0);
+
+  return horizontal_add_uint16x8(sum);
 }
 
-static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride,
-                                    const uint8_t *ref_ptr, int ref_stride,
-                                    const uint8_t *second_pred,
-                                    const int height) {
-  int i;
-  uint16x8_t abs = vdupq_n_u16(0);
-
-  for (i = 0; i < height; ++i) {
-    const uint8x16_t a_u8 = vld1q_u8(src_ptr);
-    const uint8x16_t b_u8 = vld1q_u8(ref_ptr);
-    const uint8x16_t c_u8 = vld1q_u8(second_pred);
-    const uint8x16_t avg = vrhaddq_u8(b_u8, c_u8);
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-    second_pred += 16;
-    abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(avg));
-    abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(avg));
-  }
-  return abs;
+static INLINE unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride,
+                                       const uint8_t *ref_ptr, int ref_stride,
+                                       int h) {
+  uint16x8_t sum = vdupq_n_u16(0);
+
+  int i = h / 2;
+  do {
+    uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
+    uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
+
+    sum = vabal_u8(sum, s, r);
+
+    src_ptr += 2 * src_stride;
+    ref_ptr += 2 * ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_uint16x8(sum);
 }
 
-#define sad16xN(n)                                                            \
-  uint32_t vpx_sad16x##n##_neon(const uint8_t *src_ptr, int src_stride,       \
-                                const uint8_t *ref_ptr, int ref_stride) {     \
-    const uint16x8_t abs =                                                    \
-        sad16x(src_ptr, src_stride, ref_ptr, ref_stride, n);                  \
-    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                    \
-  }                                                                           \
-                                                                              \
-  uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src_ptr, int src_stride,   \
-                                    const uint8_t *ref_ptr, int ref_stride,   \
-                                    const uint8_t *second_pred) {             \
-    const uint16x8_t abs =                                                    \
-        sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
-    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                    \
+#define SAD_WXH_NEON(w, h)                                                   \
+  unsigned int vpx_sad##w##x##h##_neon(const uint8_t *src, int src_stride,   \
+                                       const uint8_t *ref, int ref_stride) { \
+    return sad##w##xh_neon(src, src_stride, ref, ref_stride, (h));           \
   }
 
-sad16xN(8);
-sad16xN(16);
-sad16xN(32);
-
-static INLINE uint16x8_t sad32x(const uint8_t *src_ptr, int src_stride,
-                                const uint8_t *ref_ptr, int ref_stride,
-                                const int height) {
-  int i;
-  uint16x8_t abs = vdupq_n_u16(0);
-
-  for (i = 0; i < height; ++i) {
-    const uint8x16_t a_lo = vld1q_u8(src_ptr);
-    const uint8x16_t a_hi = vld1q_u8(src_ptr + 16);
-    const uint8x16_t b_lo = vld1q_u8(ref_ptr);
-    const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16);
+SAD_WXH_NEON(4, 4)
+SAD_WXH_NEON(4, 8)
+
+SAD_WXH_NEON(8, 4)
+SAD_WXH_NEON(8, 8)
+SAD_WXH_NEON(8, 16)
+
+SAD_WXH_NEON(16, 8)
+SAD_WXH_NEON(16, 16)
+SAD_WXH_NEON(16, 32)
+
+SAD_WXH_NEON(32, 16)
+SAD_WXH_NEON(32, 32)
+SAD_WXH_NEON(32, 64)
+
+SAD_WXH_NEON(64, 32)
+SAD_WXH_NEON(64, 64)
+
+#undef SAD_WXH_NEON
+
+#define SAD_SKIP_WXH_NEON(w, h)                                                \
+  unsigned int vpx_sad_skip_##w##x##h##_neon(                                  \
+      const uint8_t *src, int src_stride, const uint8_t *ref,                  \
+      int ref_stride) {                                                        \
+    return 2 *                                                                 \
+           sad##w##xh_neon(src, 2 * src_stride, ref, 2 * ref_stride, (h) / 2); \
+  }
+
+SAD_SKIP_WXH_NEON(4, 4)
+SAD_SKIP_WXH_NEON(4, 8)
+
+SAD_SKIP_WXH_NEON(8, 4)
+SAD_SKIP_WXH_NEON(8, 8)
+SAD_SKIP_WXH_NEON(8, 16)
+
+SAD_SKIP_WXH_NEON(16, 8)
+SAD_SKIP_WXH_NEON(16, 16)
+SAD_SKIP_WXH_NEON(16, 32)
+
+SAD_SKIP_WXH_NEON(32, 16)
+SAD_SKIP_WXH_NEON(32, 32)
+SAD_SKIP_WXH_NEON(32, 64)
+
+SAD_SKIP_WXH_NEON(64, 32)
+SAD_SKIP_WXH_NEON(64, 64)
+
+#undef SAD_SKIP_WXH_NEON
+
+static INLINE unsigned int sad64xh_avg_neon(const uint8_t *src_ptr,
+                                            int src_stride,
+                                            const uint8_t *ref_ptr,
+                                            int ref_stride, int h,
+                                            const uint8_t *second_pred) {
+  uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                        vdupq_n_u16(0) };
+  uint32x4_t sum_u32;
+
+  int i = h;
+  do {
+    uint8x16_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3;
+    uint8x16_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3;
+
+    s0 = vld1q_u8(src_ptr);
+    r0 = vld1q_u8(ref_ptr);
+    p0 = vld1q_u8(second_pred);
+    avg0 = vrhaddq_u8(r0, p0);
+    diff0 = vabdq_u8(s0, avg0);
+    sum[0] = vpadalq_u8(sum[0], diff0);
+
+    s1 = vld1q_u8(src_ptr + 16);
+    r1 = vld1q_u8(ref_ptr + 16);
+    p1 = vld1q_u8(second_pred + 16);
+    avg1 = vrhaddq_u8(r1, p1);
+    diff1 = vabdq_u8(s1, avg1);
+    sum[1] = vpadalq_u8(sum[1], diff1);
+
+    s2 = vld1q_u8(src_ptr + 32);
+    r2 = vld1q_u8(ref_ptr + 32);
+    p2 = vld1q_u8(second_pred + 32);
+    avg2 = vrhaddq_u8(r2, p2);
+    diff2 = vabdq_u8(s2, avg2);
+    sum[2] = vpadalq_u8(sum[2], diff2);
+
+    s3 = vld1q_u8(src_ptr + 48);
+    r3 = vld1q_u8(ref_ptr + 48);
+    p3 = vld1q_u8(second_pred + 48);
+    avg3 = vrhaddq_u8(r3, p3);
+    diff3 = vabdq_u8(s3, avg3);
+    sum[3] = vpadalq_u8(sum[3], diff3);
+
     src_ptr += src_stride;
     ref_ptr += ref_stride;
-    abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(b_lo));
-    abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(b_lo));
-    abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(b_hi));
-    abs = vabal_u8(abs, vget_high_u8(a_hi), vget_high_u8(b_hi));
-  }
-  return abs;
+    second_pred += 64;
+  } while (--i != 0);
+
+  sum_u32 = vpaddlq_u16(sum[0]);
+  sum_u32 = vpadalq_u16(sum_u32, sum[1]);
+  sum_u32 = vpadalq_u16(sum_u32, sum[2]);
+  sum_u32 = vpadalq_u16(sum_u32, sum[3]);
+
+  return horizontal_add_uint32x4(sum_u32);
 }
 
-static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride,
-                                    const uint8_t *ref_ptr, int ref_stride,
-                                    const uint8_t *second_pred,
-                                    const int height) {
-  int i;
-  uint16x8_t abs = vdupq_n_u16(0);
-
-  for (i = 0; i < height; ++i) {
-    const uint8x16_t a_lo = vld1q_u8(src_ptr);
-    const uint8x16_t a_hi = vld1q_u8(src_ptr + 16);
-    const uint8x16_t b_lo = vld1q_u8(ref_ptr);
-    const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16);
-    const uint8x16_t c_lo = vld1q_u8(second_pred);
-    const uint8x16_t c_hi = vld1q_u8(second_pred + 16);
-    const uint8x16_t avg_lo = vrhaddq_u8(b_lo, c_lo);
-    const uint8x16_t avg_hi = vrhaddq_u8(b_hi, c_hi);
+static INLINE unsigned int sad32xh_avg_neon(const uint8_t *src_ptr,
+                                            int src_stride,
+                                            const uint8_t *ref_ptr,
+                                            int ref_stride, int h,
+                                            const uint8_t *second_pred) {
+  uint32x4_t sum = vdupq_n_u32(0);
+
+  int i = h;
+  do {
+    uint8x16_t s0 = vld1q_u8(src_ptr);
+    uint8x16_t r0 = vld1q_u8(ref_ptr);
+    uint8x16_t p0 = vld1q_u8(second_pred);
+    uint8x16_t avg0 = vrhaddq_u8(r0, p0);
+    uint8x16_t diff0 = vabdq_u8(s0, avg0);
+    uint16x8_t sum0 = vpaddlq_u8(diff0);
+
+    uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+    uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+    uint8x16_t p1 = vld1q_u8(second_pred + 16);
+    uint8x16_t avg1 = vrhaddq_u8(r1, p1);
+    uint8x16_t diff1 = vabdq_u8(s1, avg1);
+    uint16x8_t sum1 = vpaddlq_u8(diff1);
+
+    sum = vpadalq_u16(sum, sum0);
+    sum = vpadalq_u16(sum, sum1);
+
     src_ptr += src_stride;
     ref_ptr += ref_stride;
     second_pred += 32;
-    abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(avg_lo));
-    abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(avg_lo));
-    abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(avg_hi));
-    abs = vabal_u8(abs, vget_high_u8(a_hi), vget_high_u8(avg_hi));
-  }
-  return abs;
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(sum);
 }
 
-#define sad32xN(n)                                                            \
-  uint32_t vpx_sad32x##n##_neon(const uint8_t *src_ptr, int src_stride,       \
-                                const uint8_t *ref_ptr, int ref_stride) {     \
-    const uint16x8_t abs =                                                    \
-        sad32x(src_ptr, src_stride, ref_ptr, ref_stride, n);                  \
-    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                    \
-  }                                                                           \
-                                                                              \
-  uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src_ptr, int src_stride,   \
-                                    const uint8_t *ref_ptr, int ref_stride,   \
-                                    const uint8_t *second_pred) {             \
-    const uint16x8_t abs =                                                    \
-        sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
-    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                    \
-  }
+static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr,
+                                            int src_stride,
+                                            const uint8_t *ref_ptr,
+                                            int ref_stride, int h,
+                                            const uint8_t *second_pred) {
+  uint16x8_t sum = vdupq_n_u16(0);
+
+  int i = h;
+  do {
+    uint8x16_t s = vld1q_u8(src_ptr);
+    uint8x16_t r = vld1q_u8(ref_ptr);
+    uint8x16_t p = vld1q_u8(second_pred);
+
+    uint8x16_t avg = vrhaddq_u8(r, p);
+    uint8x16_t diff = vabdq_u8(s, avg);
+    sum = vpadalq_u8(sum, diff);
 
-sad32xN(16);
-sad32xN(32);
-sad32xN(64);
-
-static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride,
-                                const uint8_t *ref_ptr, int ref_stride,
-                                const int height) {
-  int i;
-  uint16x8_t abs_0 = vdupq_n_u16(0);
-  uint16x8_t abs_1 = vdupq_n_u16(0);
-
-  for (i = 0; i < height; ++i) {
-    const uint8x16_t a_0 = vld1q_u8(src_ptr);
-    const uint8x16_t a_1 = vld1q_u8(src_ptr + 16);
-    const uint8x16_t a_2 = vld1q_u8(src_ptr + 32);
-    const uint8x16_t a_3 = vld1q_u8(src_ptr + 48);
-    const uint8x16_t b_0 = vld1q_u8(ref_ptr);
-    const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16);
-    const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32);
-    const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48);
     src_ptr += src_stride;
     ref_ptr += ref_stride;
-    abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(b_0));
-    abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(b_0));
-    abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(b_1));
-    abs_0 = vabal_u8(abs_0, vget_high_u8(a_1), vget_high_u8(b_1));
-    abs_1 = vabal_u8(abs_1, vget_low_u8(a_2), vget_low_u8(b_2));
-    abs_1 = vabal_u8(abs_1, vget_high_u8(a_2), vget_high_u8(b_2));
-    abs_1 = vabal_u8(abs_1, vget_low_u8(a_3), vget_low_u8(b_3));
-    abs_1 = vabal_u8(abs_1, vget_high_u8(a_3), vget_high_u8(b_3));
-  }
+    second_pred += 16;
+  } while (--i != 0);
 
-  {
-    const uint32x4_t sum = vpaddlq_u16(abs_0);
-    return vpadalq_u16(sum, abs_1);
-  }
+  return horizontal_add_uint16x8(sum);
 }
 
-static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride,
-                                    const uint8_t *ref_ptr, int ref_stride,
-                                    const uint8_t *second_pred,
-                                    const int height) {
-  int i;
-  uint16x8_t abs_0 = vdupq_n_u16(0);
-  uint16x8_t abs_1 = vdupq_n_u16(0);
-
-  for (i = 0; i < height; ++i) {
-    const uint8x16_t a_0 = vld1q_u8(src_ptr);
-    const uint8x16_t a_1 = vld1q_u8(src_ptr + 16);
-    const uint8x16_t a_2 = vld1q_u8(src_ptr + 32);
-    const uint8x16_t a_3 = vld1q_u8(src_ptr + 48);
-    const uint8x16_t b_0 = vld1q_u8(ref_ptr);
-    const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16);
-    const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32);
-    const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48);
-    const uint8x16_t c_0 = vld1q_u8(second_pred);
-    const uint8x16_t c_1 = vld1q_u8(second_pred + 16);
-    const uint8x16_t c_2 = vld1q_u8(second_pred + 32);
-    const uint8x16_t c_3 = vld1q_u8(second_pred + 48);
-    const uint8x16_t avg_0 = vrhaddq_u8(b_0, c_0);
-    const uint8x16_t avg_1 = vrhaddq_u8(b_1, c_1);
-    const uint8x16_t avg_2 = vrhaddq_u8(b_2, c_2);
-    const uint8x16_t avg_3 = vrhaddq_u8(b_3, c_3);
+static INLINE unsigned int sad8xh_avg_neon(const uint8_t *src_ptr,
+                                           int src_stride,
+                                           const uint8_t *ref_ptr,
+                                           int ref_stride, int h,
+                                           const uint8_t *second_pred) {
+  uint16x8_t sum = vdupq_n_u16(0);
+
+  int i = h;
+  do {
+    uint8x8_t s = vld1_u8(src_ptr);
+    uint8x8_t r = vld1_u8(ref_ptr);
+    uint8x8_t p = vld1_u8(second_pred);
+
+    uint8x8_t avg = vrhadd_u8(r, p);
+    sum = vabal_u8(sum, s, avg);
+
     src_ptr += src_stride;
     ref_ptr += ref_stride;
-    second_pred += 64;
-    abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(avg_0));
-    abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(avg_0));
-    abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(avg_1));
-    abs_0 = vabal_u8(abs_0, vget_high_u8(a_1), vget_high_u8(avg_1));
-    abs_1 = vabal_u8(abs_1, vget_low_u8(a_2), vget_low_u8(avg_2));
-    abs_1 = vabal_u8(abs_1, vget_high_u8(a_2), vget_high_u8(avg_2));
-    abs_1 = vabal_u8(abs_1, vget_low_u8(a_3), vget_low_u8(avg_3));
-    abs_1 = vabal_u8(abs_1, vget_high_u8(a_3), vget_high_u8(avg_3));
-  }
+    second_pred += 8;
+  } while (--i != 0);
 
-  {
-    const uint32x4_t sum = vpaddlq_u16(abs_0);
-    return vpadalq_u16(sum, abs_1);
-  }
+  return horizontal_add_uint16x8(sum);
 }
 
-#define sad64xN(n)                                                            \
-  uint32_t vpx_sad64x##n##_neon(const uint8_t *src_ptr, int src_stride,       \
-                                const uint8_t *ref_ptr, int ref_stride) {     \
-    const uint32x4_t abs =                                                    \
-        sad64x(src_ptr, src_stride, ref_ptr, ref_stride, n);                  \
-    return vget_lane_u32(horizontal_add_uint32x4(abs), 0);                    \
-  }                                                                           \
-                                                                              \
-  uint32_t vpx_sad64x##n##_avg_neon(const uint8_t *src_ptr, int src_stride,   \
-                                    const uint8_t *ref_ptr, int ref_stride,   \
-                                    const uint8_t *second_pred) {             \
-    const uint32x4_t abs =                                                    \
-        sad64x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
-    return vget_lane_u32(horizontal_add_uint32x4(abs), 0);                    \
+static INLINE unsigned int sad4xh_avg_neon(const uint8_t *src_ptr,
+                                           int src_stride,
+                                           const uint8_t *ref_ptr,
+                                           int ref_stride, int h,
+                                           const uint8_t *second_pred) {
+  uint16x8_t sum = vdupq_n_u16(0);
+
+  int i = h / 2;
+  do {
+    uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
+    uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
+    uint8x8_t p = vld1_u8(second_pred);
+
+    uint8x8_t avg = vrhadd_u8(r, p);
+    sum = vabal_u8(sum, s, avg);
+
+    src_ptr += 2 * src_stride;
+    ref_ptr += 2 * ref_stride;
+    second_pred += 8;
+  } while (--i != 0);
+
+  return horizontal_add_uint16x8(sum);
+}
+
+#define SAD_WXH_AVG_NEON(w, h)                                             \
+  uint32_t vpx_sad##w##x##h##_avg_neon(const uint8_t *src, int src_stride, \
+                                       const uint8_t *ref, int ref_stride, \
+                                       const uint8_t *second_pred) {       \
+    return sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h),      \
+                               second_pred);                               \
   }
 
-sad64xN(32);
-sad64xN(64);
+SAD_WXH_AVG_NEON(4, 4)
+SAD_WXH_AVG_NEON(4, 8)
+
+SAD_WXH_AVG_NEON(8, 4)
+SAD_WXH_AVG_NEON(8, 8)
+SAD_WXH_AVG_NEON(8, 16)
+
+SAD_WXH_AVG_NEON(16, 8)
+SAD_WXH_AVG_NEON(16, 16)
+SAD_WXH_AVG_NEON(16, 32)
+
+SAD_WXH_AVG_NEON(32, 16)
+SAD_WXH_AVG_NEON(32, 32)
+SAD_WXH_AVG_NEON(32, 64)
+
+SAD_WXH_AVG_NEON(64, 32)
+SAD_WXH_AVG_NEON(64, 64)
+
+#undef SAD_WXH_AVG_NEON
diff --git a/vpx_dsp/arm/sad_neon_dotprod.c b/vpx_dsp/arm/sad_neon_dotprod.c
new file mode 100644
index 0000000000..fbc0b8d75f
--- /dev/null
+++ b/vpx_dsp/arm/sad_neon_dotprod.c
@@ -0,0 +1,247 @@
+/*
+ *  Copyright (c) 2021 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+static INLINE unsigned int sadwxh_neon_dotprod(const uint8_t *src_ptr,
+                                               int src_stride,
+                                               const uint8_t *ref_ptr,
+                                               int ref_stride, int w, int h) {
+  // Only two accumulators are required for optimal instruction throughput of
+  // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
+  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h;
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s0, s1, r0, r1, diff0, diff1;
+
+      s0 = vld1q_u8(src_ptr + j);
+      r0 = vld1q_u8(ref_ptr + j);
+      diff0 = vabdq_u8(s0, r0);
+      sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+      s1 = vld1q_u8(src_ptr + j + 16);
+      r1 = vld1q_u8(ref_ptr + j + 16);
+      diff1 = vabdq_u8(s1, r1);
+      sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+      j += 32;
+    } while (j < w);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+static INLINE unsigned int sad64xh_neon_dotprod(const uint8_t *src_ptr,
+                                                int src_stride,
+                                                const uint8_t *ref_ptr,
+                                                int ref_stride, int h) {
+  return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64, h);
+}
+
+static INLINE unsigned int sad32xh_neon_dotprod(const uint8_t *src_ptr,
+                                                int src_stride,
+                                                const uint8_t *ref_ptr,
+                                                int ref_stride, int h) {
+  return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32, h);
+}
+
+static INLINE unsigned int sad16xh_neon_dotprod(const uint8_t *src_ptr,
+                                                int src_stride,
+                                                const uint8_t *ref_ptr,
+                                                int ref_stride, int h) {
+  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h / 2;
+  do {
+    uint8x16_t s0, s1, r0, r1, diff0, diff1;
+
+    s0 = vld1q_u8(src_ptr);
+    r0 = vld1q_u8(ref_ptr);
+    diff0 = vabdq_u8(s0, r0);
+    sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+
+    s1 = vld1q_u8(src_ptr);
+    r1 = vld1q_u8(ref_ptr);
+    diff1 = vabdq_u8(s1, r1);
+    sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+#define SAD_WXH_NEON_DOTPROD(w, h)                                         \
+  unsigned int vpx_sad##w##x##h##_neon_dotprod(                            \
+      const uint8_t *src, int src_stride, const uint8_t *ref,              \
+      int ref_stride) {                                                    \
+    return sad##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, (h)); \
+  }
+
+SAD_WXH_NEON_DOTPROD(16, 8)
+SAD_WXH_NEON_DOTPROD(16, 16)
+SAD_WXH_NEON_DOTPROD(16, 32)
+
+SAD_WXH_NEON_DOTPROD(32, 16)
+SAD_WXH_NEON_DOTPROD(32, 32)
+SAD_WXH_NEON_DOTPROD(32, 64)
+
+SAD_WXH_NEON_DOTPROD(64, 32)
+SAD_WXH_NEON_DOTPROD(64, 64)
+
+#undef SAD_WXH_NEON_DOTPROD
+
+#define SAD_SKIP_WXH_NEON_DOTPROD(w, h)                          \
+  unsigned int vpx_sad_skip_##w##x##h##_neon_dotprod(            \
+      const uint8_t *src, int src_stride, const uint8_t *ref,    \
+      int ref_stride) {                                          \
+    return 2 * sad##w##xh_neon_dotprod(src, 2 * src_stride, ref, \
+                                       2 * ref_stride, (h) / 2); \
+  }
+
+SAD_SKIP_WXH_NEON_DOTPROD(16, 8)
+SAD_SKIP_WXH_NEON_DOTPROD(16, 16)
+SAD_SKIP_WXH_NEON_DOTPROD(16, 32)
+
+SAD_SKIP_WXH_NEON_DOTPROD(32, 16)
+SAD_SKIP_WXH_NEON_DOTPROD(32, 32)
+SAD_SKIP_WXH_NEON_DOTPROD(32, 64)
+
+SAD_SKIP_WXH_NEON_DOTPROD(64, 32)
+SAD_SKIP_WXH_NEON_DOTPROD(64, 64)
+
+#undef SAD_SKIP_WXH_NEON_DOTPROD
+
+static INLINE unsigned int sadwxh_avg_neon_dotprod(const uint8_t *src_ptr,
+                                                   int src_stride,
+                                                   const uint8_t *ref_ptr,
+                                                   int ref_stride, int w, int h,
+                                                   const uint8_t *second_pred) {
+  // Only two accumulators are required for optimal instruction throughput of
+  // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
+  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h;
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
+
+      s0 = vld1q_u8(src_ptr + j);
+      r0 = vld1q_u8(ref_ptr + j);
+      p0 = vld1q_u8(second_pred);
+      avg0 = vrhaddq_u8(r0, p0);
+      diff0 = vabdq_u8(s0, avg0);
+      sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+      s1 = vld1q_u8(src_ptr + j + 16);
+      r1 = vld1q_u8(ref_ptr + j + 16);
+      p1 = vld1q_u8(second_pred + 16);
+      avg1 = vrhaddq_u8(r1, p1);
+      diff1 = vabdq_u8(s1, avg1);
+      sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+      j += 32;
+      second_pred += 32;
+    } while (j < w);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+static INLINE unsigned int sad64xh_avg_neon_dotprod(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred) {
+  return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64,
+                                 h, second_pred);
+}
+
+static INLINE unsigned int sad32xh_avg_neon_dotprod(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred) {
+  return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32,
+                                 h, second_pred);
+}
+
+static INLINE unsigned int sad16xh_avg_neon_dotprod(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred) {
+  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h / 2;
+  do {
+    uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
+
+    s0 = vld1q_u8(src_ptr);
+    r0 = vld1q_u8(ref_ptr);
+    p0 = vld1q_u8(second_pred);
+    avg0 = vrhaddq_u8(r0, p0);
+    diff0 = vabdq_u8(s0, avg0);
+    sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 16;
+
+    s1 = vld1q_u8(src_ptr);
+    r1 = vld1q_u8(ref_ptr);
+    p1 = vld1q_u8(second_pred);
+    avg1 = vrhaddq_u8(r1, p1);
+    diff1 = vabdq_u8(s1, avg1);
+    sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 16;
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+#define SAD_WXH_AVG_NEON_DOTPROD(w, h)                                        \
+  uint32_t vpx_sad##w##x##h##_avg_neon_dotprod(                               \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      const uint8_t *second_pred) {                                           \
+    return sad##w##xh_avg_neon_dotprod(src, src_stride, ref, ref_stride, (h), \
+                                       second_pred);                          \
+  }
+
+SAD_WXH_AVG_NEON_DOTPROD(16, 8)
+SAD_WXH_AVG_NEON_DOTPROD(16, 16)
+SAD_WXH_AVG_NEON_DOTPROD(16, 32)
+
+SAD_WXH_AVG_NEON_DOTPROD(32, 16)
+SAD_WXH_AVG_NEON_DOTPROD(32, 32)
+SAD_WXH_AVG_NEON_DOTPROD(32, 64)
+
+SAD_WXH_AVG_NEON_DOTPROD(64, 32)
+SAD_WXH_AVG_NEON_DOTPROD(64, 64)
+
+#undef SAD_WXH_AVG_NEON_DOTPROD
diff --git a/vpx_dsp/arm/sse_neon.c b/vpx_dsp/arm/sse_neon.c
new file mode 100644
index 0000000000..2dd57e596c
--- /dev/null
+++ b/vpx_dsp/arm/sse_neon.c
@@ -0,0 +1,188 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <stdint.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+static INLINE void sse_16x1_neon(const uint8_t *src, const uint8_t *ref,
+                                 uint32x4_t *sse) {
+  uint8x16_t s = vld1q_u8(src);
+  uint8x16_t r = vld1q_u8(ref);
+
+  uint8x16_t abs_diff = vabdq_u8(s, r);
+  uint8x8_t abs_diff_lo = vget_low_u8(abs_diff);
+  uint8x8_t abs_diff_hi = vget_high_u8(abs_diff);
+
+  *sse = vpadalq_u16(*sse, vmull_u8(abs_diff_lo, abs_diff_lo));
+  *sse = vpadalq_u16(*sse, vmull_u8(abs_diff_hi, abs_diff_hi));
+}
+
+static INLINE void sse_8x1_neon(const uint8_t *src, const uint8_t *ref,
+                                uint32x4_t *sse) {
+  uint8x8_t s = vld1_u8(src);
+  uint8x8_t r = vld1_u8(ref);
+
+  uint8x8_t abs_diff = vabd_u8(s, r);
+
+  *sse = vpadalq_u16(*sse, vmull_u8(abs_diff, abs_diff));
+}
+
+static INLINE void sse_4x2_neon(const uint8_t *src, int src_stride,
+                                const uint8_t *ref, int ref_stride,
+                                uint32x4_t *sse) {
+  uint8x8_t s = load_unaligned_u8(src, src_stride);
+  uint8x8_t r = load_unaligned_u8(ref, ref_stride);
+
+  uint8x8_t abs_diff = vabd_u8(s, r);
+
+  *sse = vpadalq_u16(*sse, vmull_u8(abs_diff, abs_diff));
+}
+
+static INLINE uint32_t sse_wxh_neon(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    int width, int height) {
+  uint32x4_t sse = vdupq_n_u32(0);
+
+  if ((width & 0x07) && ((width & 0x07) < 5)) {
+    int i = height;
+    do {
+      int j = 0;
+      do {
+        sse_8x1_neon(src + j, ref + j, &sse);
+        sse_8x1_neon(src + j + src_stride, ref + j + ref_stride, &sse);
+        j += 8;
+      } while (j + 4 < width);
+
+      sse_4x2_neon(src + j, src_stride, ref + j, ref_stride, &sse);
+      src += 2 * src_stride;
+      ref += 2 * ref_stride;
+      i -= 2;
+    } while (i != 0);
+  } else {
+    int i = height;
+    do {
+      int j = 0;
+      do {
+        sse_8x1_neon(src + j, ref + j, &sse);
+        j += 8;
+      } while (j < width);
+
+      src += src_stride;
+      ref += ref_stride;
+    } while (--i != 0);
+  }
+  return horizontal_add_uint32x4(sse);
+}
+
+static INLINE uint32_t sse_64xh_neon(const uint8_t *src, int src_stride,
+                                     const uint8_t *ref, int ref_stride,
+                                     int height) {
+  uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = height;
+  do {
+    sse_16x1_neon(src, ref, &sse[0]);
+    sse_16x1_neon(src + 16, ref + 16, &sse[1]);
+    sse_16x1_neon(src + 32, ref + 32, &sse[0]);
+    sse_16x1_neon(src + 48, ref + 48, &sse[1]);
+
+    src += src_stride;
+    ref += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_32xh_neon(const uint8_t *src, int src_stride,
+                                     const uint8_t *ref, int ref_stride,
+                                     int height) {
+  uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = height;
+  do {
+    sse_16x1_neon(src, ref, &sse[0]);
+    sse_16x1_neon(src + 16, ref + 16, &sse[1]);
+
+    src += src_stride;
+    ref += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_16xh_neon(const uint8_t *src, int src_stride,
+                                     const uint8_t *ref, int ref_stride,
+                                     int height) {
+  uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = height;
+  do {
+    sse_16x1_neon(src, ref, &sse[0]);
+    src += src_stride;
+    ref += ref_stride;
+    sse_16x1_neon(src, ref, &sse[1]);
+    src += src_stride;
+    ref += ref_stride;
+    i -= 2;
+  } while (i != 0);
+
+  return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_8xh_neon(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    int height) {
+  uint32x4_t sse = vdupq_n_u32(0);
+
+  int i = height;
+  do {
+    sse_8x1_neon(src, ref, &sse);
+
+    src += src_stride;
+    ref += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(sse);
+}
+
+static INLINE uint32_t sse_4xh_neon(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    int height) {
+  uint32x4_t sse = vdupq_n_u32(0);
+
+  int i = height;
+  do {
+    sse_4x2_neon(src, src_stride, ref, ref_stride, &sse);
+
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
+    i -= 2;
+  } while (i != 0);
+
+  return horizontal_add_uint32x4(sse);
+}
+
+int64_t vpx_sse_neon(const uint8_t *src, int src_stride, const uint8_t *ref,
+                     int ref_stride, int width, int height) {
+  switch (width) {
+    case 4: return sse_4xh_neon(src, src_stride, ref, ref_stride, height);
+    case 8: return sse_8xh_neon(src, src_stride, ref, ref_stride, height);
+    case 16: return sse_16xh_neon(src, src_stride, ref, ref_stride, height);
+    case 32: return sse_32xh_neon(src, src_stride, ref, ref_stride, height);
+    case 64: return sse_64xh_neon(src, src_stride, ref, ref_stride, height);
+    default:
+      return sse_wxh_neon(src, src_stride, ref, ref_stride, width, height);
+  }
+}
diff --git a/vpx_dsp/arm/sse_neon_dotprod.c b/vpx_dsp/arm/sse_neon_dotprod.c
new file mode 100644
index 0000000000..8777773918
--- /dev/null
+++ b/vpx_dsp/arm/sse_neon_dotprod.c
@@ -0,0 +1,197 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+static INLINE void sse_16x1_neon_dotprod(const uint8_t *src, const uint8_t *ref,
+                                         uint32x4_t *sse) {
+  uint8x16_t s = vld1q_u8(src);
+  uint8x16_t r = vld1q_u8(ref);
+
+  uint8x16_t abs_diff = vabdq_u8(s, r);
+
+  *sse = vdotq_u32(*sse, abs_diff, abs_diff);
+}
+
+static INLINE void sse_8x1_neon_dotprod(const uint8_t *src, const uint8_t *ref,
+                                        uint32x2_t *sse) {
+  uint8x8_t s = vld1_u8(src);
+  uint8x8_t r = vld1_u8(ref);
+
+  uint8x8_t abs_diff = vabd_u8(s, r);
+
+  *sse = vdot_u32(*sse, abs_diff, abs_diff);
+}
+
+static INLINE void sse_4x2_neon_dotprod(const uint8_t *src, int src_stride,
+                                        const uint8_t *ref, int ref_stride,
+                                        uint32x2_t *sse) {
+  uint8x8_t s = load_unaligned_u8(src, src_stride);
+  uint8x8_t r = load_unaligned_u8(ref, ref_stride);
+
+  uint8x8_t abs_diff = vabd_u8(s, r);
+
+  *sse = vdot_u32(*sse, abs_diff, abs_diff);
+}
+
+static INLINE uint32_t sse_wxh_neon_dotprod(const uint8_t *src, int src_stride,
+                                            const uint8_t *ref, int ref_stride,
+                                            int width, int height) {
+  uint32x2_t sse[2] = { vdup_n_u32(0), vdup_n_u32(0) };
+
+  if ((width & 0x07) && ((width & 0x07) < 5)) {
+    int i = height;
+    do {
+      int j = 0;
+      do {
+        sse_8x1_neon_dotprod(src + j, ref + j, &sse[0]);
+        sse_8x1_neon_dotprod(src + j + src_stride, ref + j + ref_stride,
+                             &sse[1]);
+        j += 8;
+      } while (j + 4 < width);
+
+      sse_4x2_neon_dotprod(src + j, src_stride, ref + j, ref_stride, &sse[0]);
+      src += 2 * src_stride;
+      ref += 2 * ref_stride;
+      i -= 2;
+    } while (i != 0);
+  } else {
+    int i = height;
+    do {
+      int j = 0;
+      do {
+        sse_8x1_neon_dotprod(src + j, ref + j, &sse[0]);
+        sse_8x1_neon_dotprod(src + j + src_stride, ref + j + ref_stride,
+                             &sse[1]);
+        j += 8;
+      } while (j < width);
+
+      src += 2 * src_stride;
+      ref += 2 * ref_stride;
+      i -= 2;
+    } while (i != 0);
+  }
+  return horizontal_add_uint32x4(vcombine_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_64xh_neon_dotprod(const uint8_t *src, int src_stride,
+                                             const uint8_t *ref, int ref_stride,
+                                             int height) {
+  uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = height;
+  do {
+    sse_16x1_neon_dotprod(src, ref, &sse[0]);
+    sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]);
+    sse_16x1_neon_dotprod(src + 32, ref + 32, &sse[0]);
+    sse_16x1_neon_dotprod(src + 48, ref + 48, &sse[1]);
+
+    src += src_stride;
+    ref += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_32xh_neon_dotprod(const uint8_t *src, int src_stride,
+                                             const uint8_t *ref, int ref_stride,
+                                             int height) {
+  uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = height;
+  do {
+    sse_16x1_neon_dotprod(src, ref, &sse[0]);
+    sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]);
+
+    src += src_stride;
+    ref += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_16xh_neon_dotprod(const uint8_t *src, int src_stride,
+                                             const uint8_t *ref, int ref_stride,
+                                             int height) {
+  uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = height;
+  do {
+    sse_16x1_neon_dotprod(src, ref, &sse[0]);
+    src += src_stride;
+    ref += ref_stride;
+    sse_16x1_neon_dotprod(src, ref, &sse[1]);
+    src += src_stride;
+    ref += ref_stride;
+    i -= 2;
+  } while (i != 0);
+
+  return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_8xh_neon_dotprod(const uint8_t *src, int src_stride,
+                                            const uint8_t *ref, int ref_stride,
+                                            int height) {
+  uint32x2_t sse[2] = { vdup_n_u32(0), vdup_n_u32(0) };
+
+  int i = height;
+  do {
+    sse_8x1_neon_dotprod(src, ref, &sse[0]);
+    src += src_stride;
+    ref += ref_stride;
+    sse_8x1_neon_dotprod(src, ref, &sse[1]);
+    src += src_stride;
+    ref += ref_stride;
+    i -= 2;
+  } while (i != 0);
+
+  return horizontal_add_uint32x4(vcombine_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_4xh_neon_dotprod(const uint8_t *src, int src_stride,
+                                            const uint8_t *ref, int ref_stride,
+                                            int height) {
+  uint32x2_t sse = vdup_n_u32(0);
+
+  int i = height;
+  do {
+    sse_4x2_neon_dotprod(src, src_stride, ref, ref_stride, &sse);
+
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
+    i -= 2;
+  } while (i != 0);
+
+  return horizontal_add_uint32x2(sse);
+}
+
+int64_t vpx_sse_neon_dotprod(const uint8_t *src, int src_stride,
+                             const uint8_t *ref, int ref_stride, int width,
+                             int height) {
+  switch (width) {
+    case 4:
+      return sse_4xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+    case 8:
+      return sse_8xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+    case 16:
+      return sse_16xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+    case 32:
+      return sse_32xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+    case 64:
+      return sse_64xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+    default:
+      return sse_wxh_neon_dotprod(src, src_stride, ref, ref_stride, width,
+                                  height);
+  }
+}
diff --git a/vpx_dsp/arm/subpel_variance_neon.c b/vpx_dsp/arm/subpel_variance_neon.c
index 37bfd1cd1f..d92f1615d7 100644
--- a/vpx_dsp/arm/subpel_variance_neon.c
+++ b/vpx_dsp/arm/subpel_variance_neon.c
@@ -17,168 +17,473 @@
 #include "vpx_dsp/variance.h"
 #include "vpx_dsp/arm/mem_neon.h"
 
-static const uint8_t bilinear_filters[8][2] = {
-  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
-  { 64, 64 }, { 48, 80 },  { 32, 96 }, { 16, 112 },
-};
-
 // Process a block exactly 4 wide and a multiple of 2 high.
-static void var_filter_block2d_bil_w4(const uint8_t *src_ptr,
-                                      uint8_t *output_ptr,
-                                      unsigned int src_pixels_per_line,
-                                      int pixel_step,
-                                      unsigned int output_height,
-                                      const uint8_t *filter) {
-  const uint8x8_t f0 = vdup_n_u8(filter[0]);
-  const uint8x8_t f1 = vdup_n_u8(filter[1]);
-  unsigned int i;
-  for (i = 0; i < output_height; i += 2) {
-    const uint8x8_t src_0 = load_unaligned_u8(src_ptr, src_pixels_per_line);
-    const uint8x8_t src_1 =
-        load_unaligned_u8(src_ptr + pixel_step, src_pixels_per_line);
-    const uint16x8_t a = vmull_u8(src_0, f0);
-    const uint16x8_t b = vmlal_u8(a, src_1, f1);
-    const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
-    vst1_u8(output_ptr, out);
-    src_ptr += 2 * src_pixels_per_line;
-    output_ptr += 8;
-  }
+static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, uint8_t *dst_ptr,
+                                      int src_stride, int pixel_step,
+                                      int dst_height, int filter_offset) {
+  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+  int i = dst_height;
+  do {
+    uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
+    uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
+    uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1);
+    uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+    vst1_u8(dst_ptr, blend_u8);
+
+    src_ptr += 2 * src_stride;
+    dst_ptr += 2 * 4;
+    i -= 2;
+  } while (i != 0);
 }
 
 // Process a block exactly 8 wide and any height.
-static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
-                                      uint8_t *output_ptr,
-                                      unsigned int src_pixels_per_line,
-                                      int pixel_step,
-                                      unsigned int output_height,
-                                      const uint8_t *filter) {
-  const uint8x8_t f0 = vdup_n_u8(filter[0]);
-  const uint8x8_t f1 = vdup_n_u8(filter[1]);
-  unsigned int i;
-  for (i = 0; i < output_height; ++i) {
-    const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
-    const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);
-    const uint16x8_t a = vmull_u8(src_0, f0);
-    const uint16x8_t b = vmlal_u8(a, src_1, f1);
-    const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
-    vst1_u8(output_ptr, out);
-    src_ptr += src_pixels_per_line;
-    output_ptr += 8;
-  }
+static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *dst_ptr,
+                                      int src_stride, int pixel_step,
+                                      int dst_height, int filter_offset) {
+  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+  int i = dst_height;
+  do {
+    uint8x8_t s0 = vld1_u8(src_ptr);
+    uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
+    uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1);
+    uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+    vst1_u8(dst_ptr, blend_u8);
+
+    src_ptr += src_stride;
+    dst_ptr += 8;
+  } while (--i != 0);
 }
 
 // Process a block which is a mutiple of 16 wide and any height.
-static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
-                                       uint8_t *output_ptr,
-                                       unsigned int src_pixels_per_line,
-                                       int pixel_step,
-                                       unsigned int output_height,
-                                       unsigned int output_width,
-                                       const uint8_t *filter) {
-  const uint8x8_t f0 = vdup_n_u8(filter[0]);
-  const uint8x8_t f1 = vdup_n_u8(filter[1]);
-  unsigned int i, j;
-  for (i = 0; i < output_height; ++i) {
-    for (j = 0; j < output_width; j += 16) {
-      const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]);
-      const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]);
-      const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0);
-      const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1);
-      const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS);
-      const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);
-      const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);
-      const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);
-      vst1q_u8(output_ptr + j, vcombine_u8(out_lo, out_hi));
-    }
-    src_ptr += src_pixels_per_line;
-    output_ptr += output_width;
-  }
+static void var_filter_block2d_bil_large(const uint8_t *src_ptr,
+                                         uint8_t *dst_ptr, int src_stride,
+                                         int pixel_step, int dst_width,
+                                         int dst_height, int filter_offset) {
+  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+  int i = dst_height;
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s0 = vld1q_u8(src_ptr + j);
+      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+      uint16x8_t blend_l =
+          vmlal_u8(vmull_u8(vget_low_u8(s0), f0), vget_low_u8(s1), f1);
+      uint16x8_t blend_h =
+          vmlal_u8(vmull_u8(vget_high_u8(s0), f0), vget_high_u8(s1), f1);
+      uint8x8_t out_lo = vrshrn_n_u16(blend_l, 3);
+      uint8x8_t out_hi = vrshrn_n_u16(blend_h, 3);
+      vst1q_u8(dst_ptr + j, vcombine_u8(out_lo, out_hi));
+
+      j += 16;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
+static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, uint8_t *dst_ptr,
+                                       int src_stride, int pixel_step,
+                                       int dst_height, int filter_offset) {
+  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 16,
+                               dst_height, filter_offset);
+}
+static void var_filter_block2d_bil_w32(const uint8_t *src_ptr, uint8_t *dst_ptr,
+                                       int src_stride, int pixel_step,
+                                       int dst_height, int filter_offset) {
+  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 32,
+                               dst_height, filter_offset);
+}
+static void var_filter_block2d_bil_w64(const uint8_t *src_ptr, uint8_t *dst_ptr,
+                                       int src_stride, int pixel_step,
+                                       int dst_height, int filter_offset) {
+  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 64,
+                               dst_height, filter_offset);
+}
+
+static void var_filter_block2d_avg(const uint8_t *src_ptr, uint8_t *dst_ptr,
+                                   int src_stride, int pixel_step,
+                                   int dst_width, int dst_height) {
+  int i = dst_height;
+
+  // We only specialize on the filter values for large block sizes (>= 16x16.)
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s0 = vld1q_u8(src_ptr + j);
+      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+      uint8x16_t avg = vrhaddq_u8(s0, s1);
+      vst1q_u8(dst_ptr + j, avg);
+
+      j += 16;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
 }
 
-// 4xM filter writes an extra row to fdata because it processes two rows at a
-// time.
-#define sub_pixel_varianceNxM(n, m)                                         \
-  uint32_t vpx_sub_pixel_variance##n##x##m##_neon(                          \
-      const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,   \
-      const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) {              \
-    uint8_t temp0[n * (m + (n == 4 ? 2 : 1))];                              \
-    uint8_t temp1[n * m];                                                   \
-                                                                            \
-    if (n == 4) {                                                           \
-      var_filter_block2d_bil_w4(src_ptr, temp0, src_stride, 1, (m + 2),     \
-                                bilinear_filters[x_offset]);                \
-      var_filter_block2d_bil_w4(temp0, temp1, n, n, m,                      \
-                                bilinear_filters[y_offset]);                \
-    } else if (n == 8) {                                                    \
-      var_filter_block2d_bil_w8(src_ptr, temp0, src_stride, 1, (m + 1),     \
-                                bilinear_filters[x_offset]);                \
-      var_filter_block2d_bil_w8(temp0, temp1, n, n, m,                      \
-                                bilinear_filters[y_offset]);                \
-    } else {                                                                \
-      var_filter_block2d_bil_w16(src_ptr, temp0, src_stride, 1, (m + 1), n, \
-                                 bilinear_filters[x_offset]);               \
-      var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n,                  \
-                                 bilinear_filters[y_offset]);               \
-    }                                                                       \
-    return vpx_variance##n##x##m(temp1, n, ref_ptr, ref_stride, sse);       \
+#define SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                          \
+  unsigned int vpx_sub_pixel_variance##w##x##h##_neon(                   \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,      \
+      const uint8_t *ref, int ref_stride, uint32_t *sse) {               \
+    uint8_t tmp0[w * (h + padding)];                                     \
+    uint8_t tmp1[w * h];                                                 \
+    var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+                                xoffset);                                \
+    var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);           \
+    return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
   }
 
-sub_pixel_varianceNxM(4, 4);
-sub_pixel_varianceNxM(4, 8);
-sub_pixel_varianceNxM(8, 4);
-sub_pixel_varianceNxM(8, 8);
-sub_pixel_varianceNxM(8, 16);
-sub_pixel_varianceNxM(16, 8);
-sub_pixel_varianceNxM(16, 16);
-sub_pixel_varianceNxM(16, 32);
-sub_pixel_varianceNxM(32, 16);
-sub_pixel_varianceNxM(32, 32);
-sub_pixel_varianceNxM(32, 64);
-sub_pixel_varianceNxM(64, 32);
-sub_pixel_varianceNxM(64, 64);
-
-// 4xM filter writes an extra row to fdata because it processes two rows at a
-// time.
-#define sub_pixel_avg_varianceNxM(n, m)                                     \
-  uint32_t vpx_sub_pixel_avg_variance##n##x##m##_neon(                      \
-      const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,   \
-      const uint8_t *ref_ptr, int ref_stride, uint32_t *sse,                \
+#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                  \
+  unsigned int vpx_sub_pixel_variance##w##x##h##_neon(                       \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
+      const uint8_t *ref, int ref_stride, unsigned int *sse) {               \
+    if (xoffset == 0) {                                                      \
+      if (yoffset == 0) {                                                    \
+        return vpx_variance##w##x##h(src, src_stride, ref, ref_stride, sse); \
+      } else if (yoffset == 4) {                                             \
+        uint8_t tmp[w * h];                                                  \
+        var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h);      \
+        return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse);          \
+      } else {                                                               \
+        uint8_t tmp[w * h];                                                  \
+        var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h,     \
+                                    yoffset);                                \
+        return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse);          \
+      }                                                                      \
+    } else if (xoffset == 4) {                                               \
+      uint8_t tmp0[w * (h + padding)];                                       \
+      if (yoffset == 0) {                                                    \
+        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h);              \
+        return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse);         \
+      } else if (yoffset == 4) {                                             \
+        uint8_t tmp1[w * (h + padding)];                                     \
+        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding));  \
+        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                      \
+        return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
+      } else {                                                               \
+        uint8_t tmp1[w * (h + padding)];                                     \
+        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding));  \
+        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);           \
+        return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
+      }                                                                      \
+    } else {                                                                 \
+      uint8_t tmp0[w * (h + padding)];                                       \
+      if (yoffset == 0) {                                                    \
+        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset);   \
+        return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse);         \
+      } else if (yoffset == 4) {                                             \
+        uint8_t tmp1[w * h];                                                 \
+        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+                                    xoffset);                                \
+        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                      \
+        return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
+      } else {                                                               \
+        uint8_t tmp1[w * h];                                                 \
+        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+                                    xoffset);                                \
+        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);           \
+        return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
+      }                                                                      \
+    }                                                                        \
+  }
+
+// 4x<h> blocks are processed two rows at a time, so require an extra row of
+// padding.
+SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
+SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)
+
+SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
+SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
+SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)
+
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)
+
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)
+
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
+
+// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 4.
+static void avg_pred_var_filter_block2d_bil_w4(const uint8_t *src_ptr,
+                                               uint8_t *dst_ptr, int src_stride,
+                                               int pixel_step, int dst_height,
+                                               int filter_offset,
+                                               const uint8_t *second_pred) {
+  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+  int i = dst_height;
+  do {
+    uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
+    uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
+    uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1);
+    uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+
+    uint8x8_t p = vld1_u8(second_pred);
+    uint8x8_t avg = vrhadd_u8(blend_u8, p);
+
+    vst1_u8(dst_ptr, avg);
+
+    src_ptr += 2 * src_stride;
+    dst_ptr += 2 * 4;
+    second_pred += 2 * 4;
+    i -= 2;
+  } while (i != 0);
+}
+
+// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 8.
+static void avg_pred_var_filter_block2d_bil_w8(const uint8_t *src_ptr,
+                                               uint8_t *dst_ptr, int src_stride,
+                                               int pixel_step, int dst_height,
+                                               int filter_offset,
+                                               const uint8_t *second_pred) {
+  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+  int i = dst_height;
+  do {
+    uint8x8_t s0 = vld1_u8(src_ptr);
+    uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
+    uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1);
+    uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+
+    uint8x8_t p = vld1_u8(second_pred);
+    uint8x8_t avg = vrhadd_u8(blend_u8, p);
+
+    vst1_u8(dst_ptr, avg);
+
+    src_ptr += src_stride;
+    dst_ptr += 8;
+    second_pred += 8;
+  } while (--i > 0);
+}
+
+// Combine bilinear filter with vpx_comp_avg_pred for large blocks.
+static void avg_pred_var_filter_block2d_bil_large(
+    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_width, int dst_height, int filter_offset,
+    const uint8_t *second_pred) {
+  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+  int i = dst_height;
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s0 = vld1q_u8(src_ptr + j);
+      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+      uint16x8_t blend_l =
+          vmlal_u8(vmull_u8(vget_low_u8(s0), f0), vget_low_u8(s1), f1);
+      uint16x8_t blend_h =
+          vmlal_u8(vmull_u8(vget_high_u8(s0), f0), vget_high_u8(s1), f1);
+      uint8x16_t blend_u8 =
+          vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3));
+
+      uint8x16_t p = vld1q_u8(second_pred);
+      uint8x16_t avg = vrhaddq_u8(blend_u8, p);
+
+      vst1q_u8(dst_ptr + j, avg);
+
+      j += 16;
+      second_pred += 16;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
+// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 16.
+static void avg_pred_var_filter_block2d_bil_w16(
+    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint8_t *second_pred) {
+  avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+                                        pixel_step, 16, dst_height,
+                                        filter_offset, second_pred);
+}
+
+// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 32.
+static void avg_pred_var_filter_block2d_bil_w32(
+    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint8_t *second_pred) {
+  avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+                                        pixel_step, 32, dst_height,
+                                        filter_offset, second_pred);
+}
+
+// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 64.
+static void avg_pred_var_filter_block2d_bil_w64(
+    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint8_t *second_pred) {
+  avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+                                        pixel_step, 64, dst_height,
+                                        filter_offset, second_pred);
+}
+
+// Combine averaging subpel filter with vpx_comp_avg_pred.
+static void avg_pred_var_filter_block2d_avg(const uint8_t *src_ptr,
+                                            uint8_t *dst_ptr, int src_stride,
+                                            int pixel_step, int dst_width,
+                                            int dst_height,
+                                            const uint8_t *second_pred) {
+  int i = dst_height;
+
+  // We only specialize on the filter values for large block sizes (>= 16x16.)
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s0 = vld1q_u8(src_ptr + j);
+      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+      uint8x16_t avg = vrhaddq_u8(s0, s1);
+
+      uint8x16_t p = vld1q_u8(second_pred);
+      avg = vrhaddq_u8(avg, p);
+
+      vst1q_u8(dst_ptr + j, avg);
+
+      j += 16;
+      second_pred += 16;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
+// Implementation of vpx_comp_avg_pred for blocks having width >= 16.
+static void avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride,
+                     int dst_width, int dst_height,
+                     const uint8_t *second_pred) {
+  int i = dst_height;
+
+  // We only specialize on the filter values for large block sizes (>= 16x16.)
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s = vld1q_u8(src_ptr + j);
+      uint8x16_t p = vld1q_u8(second_pred);
+
+      uint8x16_t avg = vrhaddq_u8(s, p);
+
+      vst1q_u8(dst_ptr + j, avg);
+
+      j += 16;
+      second_pred += 16;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
+#define SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding)                         \
+  unsigned int vpx_sub_pixel_avg_variance##w##x##h##_neon(                  \
+      const uint8_t *src, int source_stride, int xoffset, int yoffset,      \
+      const uint8_t *ref, int ref_stride, uint32_t *sse,                    \
       const uint8_t *second_pred) {                                         \
-    uint8_t temp0[n * (m + (n == 4 ? 2 : 1))];                              \
-    uint8_t temp1[n * m];                                                   \
-                                                                            \
-    if (n == 4) {                                                           \
-      var_filter_block2d_bil_w4(src_ptr, temp0, src_stride, 1, (m + 2),     \
-                                bilinear_filters[x_offset]);                \
-      var_filter_block2d_bil_w4(temp0, temp1, n, n, m,                      \
-                                bilinear_filters[y_offset]);                \
-    } else if (n == 8) {                                                    \
-      var_filter_block2d_bil_w8(src_ptr, temp0, src_stride, 1, (m + 1),     \
-                                bilinear_filters[x_offset]);                \
-      var_filter_block2d_bil_w8(temp0, temp1, n, n, m,                      \
-                                bilinear_filters[y_offset]);                \
-    } else {                                                                \
-      var_filter_block2d_bil_w16(src_ptr, temp0, src_stride, 1, (m + 1), n, \
-                                 bilinear_filters[x_offset]);               \
-      var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n,                  \
-                                 bilinear_filters[y_offset]);               \
-    }                                                                       \
-                                                                            \
-    vpx_comp_avg_pred(temp0, second_pred, n, m, temp1, n);                  \
-                                                                            \
-    return vpx_variance##n##x##m(temp0, n, ref_ptr, ref_stride, sse);       \
+    uint8_t tmp0[w * (h + padding)];                                        \
+    uint8_t tmp1[w * h];                                                    \
+    var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, (h + padding), \
+                                xoffset);                                   \
+    avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset,      \
+                                         second_pred);                      \
+    return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);            \
+  }
+
+#define SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding)                \
+  unsigned int vpx_sub_pixel_avg_variance##w##x##h##_neon(                     \
+      const uint8_t *src, int source_stride, int xoffset, int yoffset,         \
+      const uint8_t *ref, int ref_stride, unsigned int *sse,                   \
+      const uint8_t *second_pred) {                                            \
+    if (xoffset == 0) {                                                        \
+      uint8_t tmp[w * h];                                                      \
+      if (yoffset == 0) {                                                      \
+        avg_pred(src, tmp, source_stride, w, h, second_pred);                  \
+        return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse);            \
+      } else if (yoffset == 4) {                                               \
+        avg_pred_var_filter_block2d_avg(src, tmp, source_stride,               \
+                                        source_stride, w, h, second_pred);     \
+        return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse);            \
+      } else {                                                                 \
+        avg_pred_var_filter_block2d_bil_w##w(                                  \
+            src, tmp, source_stride, source_stride, h, yoffset, second_pred);  \
+        return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse);            \
+      }                                                                        \
+    } else if (xoffset == 4) {                                                 \
+      uint8_t tmp0[w * (h + padding)];                                         \
+      if (yoffset == 0) {                                                      \
+        avg_pred_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h,     \
+                                        second_pred);                          \
+        return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse);           \
+      } else if (yoffset == 4) {                                               \
+        uint8_t tmp1[w * (h + padding)];                                       \
+        var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
+        avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred);  \
+        return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);           \
+      } else {                                                                 \
+        uint8_t tmp1[w * (h + padding)];                                       \
+        var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
+        avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset,     \
+                                             second_pred);                     \
+        return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);           \
+      }                                                                        \
+    } else {                                                                   \
+      uint8_t tmp0[w * (h + padding)];                                         \
+      if (yoffset == 0) {                                                      \
+        avg_pred_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h,   \
+                                             xoffset, second_pred);            \
+        return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse);           \
+      } else if (yoffset == 4) {                                               \
+        uint8_t tmp1[w * h];                                                   \
+        var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1,               \
+                                    (h + padding), xoffset);                   \
+        avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred);  \
+        return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);           \
+      } else {                                                                 \
+        uint8_t tmp1[w * h];                                                   \
+        var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1,               \
+                                    (h + padding), xoffset);                   \
+        avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset,     \
+                                             second_pred);                     \
+        return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);           \
+      }                                                                        \
+    }                                                                          \
   }
 
-sub_pixel_avg_varianceNxM(4, 4);
-sub_pixel_avg_varianceNxM(4, 8);
-sub_pixel_avg_varianceNxM(8, 4);
-sub_pixel_avg_varianceNxM(8, 8);
-sub_pixel_avg_varianceNxM(8, 16);
-sub_pixel_avg_varianceNxM(16, 8);
-sub_pixel_avg_varianceNxM(16, 16);
-sub_pixel_avg_varianceNxM(16, 32);
-sub_pixel_avg_varianceNxM(32, 16);
-sub_pixel_avg_varianceNxM(32, 32);
-sub_pixel_avg_varianceNxM(32, 64);
-sub_pixel_avg_varianceNxM(64, 32);
-sub_pixel_avg_varianceNxM(64, 64);
+// 4x<h> blocks are processed two rows at a time, so require an extra row of
+// padding.
+SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2)
+SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2)
+
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1)
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1)
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1)
+
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1)
+
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1)
+
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1)
diff --git a/vpx_dsp/arm/subtract_neon.c b/vpx_dsp/arm/subtract_neon.c
index 612897e247..2c008e48ab 100644
--- a/vpx_dsp/arm/subtract_neon.c
+++ b/vpx_dsp/arm/subtract_neon.c
@@ -79,3 +79,59 @@ void vpx_subtract_block_neon(int rows, int cols, int16_t *diff,
     } while (r);
   }
 }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_subtract_block_neon(int rows, int cols, int16_t *diff_ptr,
+                                    ptrdiff_t diff_stride,
+                                    const uint8_t *src8_ptr,
+                                    ptrdiff_t src_stride,
+                                    const uint8_t *pred8_ptr,
+                                    ptrdiff_t pred_stride, int bd) {
+  int r = rows, c;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr);
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8_ptr);
+  (void)bd;
+
+  if (cols >= 16) {
+    do {
+      for (c = 0; c < cols; c += 16) {
+        const uint16x8_t s0 = vld1q_u16(&src[c + 0]);
+        const uint16x8_t s1 = vld1q_u16(&src[c + 8]);
+        const uint16x8_t p0 = vld1q_u16(&pred[c + 0]);
+        const uint16x8_t p1 = vld1q_u16(&pred[c + 8]);
+        const uint16x8_t d0 = vsubq_u16(s0, p0);
+        const uint16x8_t d1 = vsubq_u16(s1, p1);
+        vst1q_s16(&diff_ptr[c + 0], vreinterpretq_s16_u16(d0));
+        vst1q_s16(&diff_ptr[c + 8], vreinterpretq_s16_u16(d1));
+      }
+      diff_ptr += diff_stride;
+      pred += pred_stride;
+      src += src_stride;
+    } while (--r);
+  } else if (cols >= 8) {
+    do {
+      for (c = 0; c < cols; c += 8) {
+        const uint16x8_t s = vld1q_u16(&src[c]);
+        const uint16x8_t p = vld1q_u16(&pred[c]);
+        const uint16x8_t d0 = vsubq_u16(s, p);
+        vst1q_s16(&diff_ptr[c], vreinterpretq_s16_u16(d0));
+      }
+      diff_ptr += diff_stride;
+      pred += pred_stride;
+      src += src_stride;
+    } while (--r);
+  } else if (cols >= 4) {
+    do {
+      for (c = 0; c < cols; c += 4) {
+        const uint16x4_t s = vld1_u16(&src[c]);
+        const uint16x4_t p = vld1_u16(&pred[c]);
+        const uint16x4_t v_diff = vsub_u16(s, p);
+        vst1_s16(&diff_ptr[c], vreinterpret_s16_u16(v_diff));
+      }
+      diff_ptr += diff_stride;
+      pred += pred_stride;
+      src += src_stride;
+    } while (--r);
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h
index 9e6833aad3..11821dc10e 100644
--- a/vpx_dsp/arm/sum_neon.h
+++ b/vpx_dsp/arm/sum_neon.h
@@ -16,23 +16,260 @@
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 
-static INLINE int32x2_t horizontal_add_int16x8(const int16x8_t a) {
+static INLINE uint16_t horizontal_add_uint8x4(const uint8x8_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddlv_u8(a);
+#else
+  const uint16x4_t b = vpaddl_u8(a);
+  const uint16x4_t c = vpadd_u16(b, b);
+  return vget_lane_u16(c, 0);
+#endif
+}
+
+static INLINE uint16_t horizontal_add_uint8x8(const uint8x8_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddlv_u8(a);
+#else
+  const uint16x4_t b = vpaddl_u8(a);
+  const uint16x4_t c = vpadd_u16(b, b);
+  const uint16x4_t d = vpadd_u16(c, c);
+  return vget_lane_u16(d, 0);
+#endif
+}
+
+static INLINE uint16_t horizontal_add_uint8x16(const uint8x16_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddlvq_u8(a);
+#else
+  const uint16x8_t b = vpaddlq_u8(a);
+  const uint16x4_t c = vadd_u16(vget_low_u16(b), vget_high_u16(b));
+  const uint16x4_t d = vpadd_u16(c, c);
+  const uint16x4_t e = vpadd_u16(d, d);
+  return vget_lane_u16(e, 0);
+#endif
+}
+
+static INLINE uint16_t horizontal_add_uint16x4(const uint16x4_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddv_u16(a);
+#else
+  const uint16x4_t b = vpadd_u16(a, a);
+  const uint16x4_t c = vpadd_u16(b, b);
+  return vget_lane_u16(c, 0);
+#endif
+}
+
+static INLINE int32_t horizontal_add_int16x8(const int16x8_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddlvq_s16(a);
+#else
   const int32x4_t b = vpaddlq_s16(a);
   const int64x2_t c = vpaddlq_s32(b);
-  return vadd_s32(vreinterpret_s32_s64(vget_low_s64(c)),
-                  vreinterpret_s32_s64(vget_high_s64(c)));
+  const int32x2_t d = vadd_s32(vreinterpret_s32_s64(vget_low_s64(c)),
+                               vreinterpret_s32_s64(vget_high_s64(c)));
+  return vget_lane_s32(d, 0);
+#endif
 }
 
-static INLINE uint32x2_t horizontal_add_uint16x8(const uint16x8_t a) {
+static INLINE uint32_t horizontal_add_uint16x8(const uint16x8_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddlvq_u16(a);
+#else
   const uint32x4_t b = vpaddlq_u16(a);
   const uint64x2_t c = vpaddlq_u32(b);
-  return vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
-                  vreinterpret_u32_u64(vget_high_u64(c)));
+  const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
+                                vreinterpret_u32_u64(vget_high_u64(c)));
+  return vget_lane_u32(d, 0);
+#endif
+}
+
+static INLINE uint32x4_t horizontal_add_4d_uint16x8(const uint16x8_t sum[4]) {
+#if VPX_ARCH_AARCH64
+  const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);
+  const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);
+  const uint16x8_t b0 = vpaddq_u16(a0, a1);
+  return vpaddlq_u16(b0);
+#else
+  const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));
+  const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
+  const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));
+  const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3]));
+  const uint16x4_t b0 = vpadd_u16(a0, a1);
+  const uint16x4_t b1 = vpadd_u16(a2, a3);
+  return vpaddlq_u16(vcombine_u16(b0, b1));
+#endif
+}
+
+static INLINE uint32_t horizontal_long_add_uint16x8(const uint16x8_t vec_lo,
+                                                    const uint16x8_t vec_hi) {
+#if VPX_ARCH_AARCH64
+  return vaddlvq_u16(vec_lo) + vaddlvq_u16(vec_hi);
+#else
+  const uint32x4_t vec_l_lo =
+      vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
+  const uint32x4_t vec_l_hi =
+      vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
+  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
+  const uint64x2_t b = vpaddlq_u32(a);
+  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+                                vreinterpret_u32_u64(vget_high_u64(b)));
+  return vget_lane_u32(c, 0);
+#endif
+}
+
+static INLINE uint32x4_t horizontal_long_add_4d_uint16x8(
+    const uint16x8_t sum_lo[4], const uint16x8_t sum_hi[4]) {
+  const uint32x4_t a0 = vpaddlq_u16(sum_lo[0]);
+  const uint32x4_t a1 = vpaddlq_u16(sum_lo[1]);
+  const uint32x4_t a2 = vpaddlq_u16(sum_lo[2]);
+  const uint32x4_t a3 = vpaddlq_u16(sum_lo[3]);
+  const uint32x4_t b0 = vpadalq_u16(a0, sum_hi[0]);
+  const uint32x4_t b1 = vpadalq_u16(a1, sum_hi[1]);
+  const uint32x4_t b2 = vpadalq_u16(a2, sum_hi[2]);
+  const uint32x4_t b3 = vpadalq_u16(a3, sum_hi[3]);
+#if VPX_ARCH_AARCH64
+  const uint32x4_t c0 = vpaddq_u32(b0, b1);
+  const uint32x4_t c1 = vpaddq_u32(b2, b3);
+  return vpaddq_u32(c0, c1);
+#else
+  const uint32x2_t c0 = vadd_u32(vget_low_u32(b0), vget_high_u32(b0));
+  const uint32x2_t c1 = vadd_u32(vget_low_u32(b1), vget_high_u32(b1));
+  const uint32x2_t c2 = vadd_u32(vget_low_u32(b2), vget_high_u32(b2));
+  const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3));
+  const uint32x2_t d0 = vpadd_u32(c0, c1);
+  const uint32x2_t d1 = vpadd_u32(c2, c3);
+  return vcombine_u32(d0, d1);
+#endif
+}
+
+static INLINE int32_t horizontal_add_int32x2(const int32x2_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddv_s32(a);
+#else
+  return vget_lane_s32(a, 0) + vget_lane_s32(a, 1);
+#endif
+}
+
+static INLINE uint32_t horizontal_add_uint32x2(const uint32x2_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddv_u32(a);
+#else
+  const uint64x1_t b = vpaddl_u32(a);
+  return vget_lane_u32(vreinterpret_u32_u64(b), 0);
+#endif
+}
+
+static INLINE int32_t horizontal_add_int32x4(const int32x4_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddvq_s32(a);
+#else
+  const int64x2_t b = vpaddlq_s32(a);
+  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+                               vreinterpret_s32_s64(vget_high_s64(b)));
+  return vget_lane_s32(c, 0);
+#endif
 }
 
-static INLINE uint32x2_t horizontal_add_uint32x4(const uint32x4_t a) {
+static INLINE uint32_t horizontal_add_uint32x4(const uint32x4_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddvq_u32(a);
+#else
   const uint64x2_t b = vpaddlq_u32(a);
-  return vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
-                  vreinterpret_u32_u64(vget_high_u64(b)));
+  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+                                vreinterpret_u32_u64(vget_high_u64(b)));
+  return vget_lane_u32(c, 0);
+#endif
 }
+
+static INLINE uint32x4_t horizontal_add_4d_uint32x4(const uint32x4_t sum[4]) {
+#if VPX_ARCH_AARCH64
+  uint32x4_t res01 = vpaddq_u32(sum[0], sum[1]);
+  uint32x4_t res23 = vpaddq_u32(sum[2], sum[3]);
+  return vpaddq_u32(res01, res23);
+#else
+  uint32x4_t res = vdupq_n_u32(0);
+  res = vsetq_lane_u32(horizontal_add_uint32x4(sum[0]), res, 0);
+  res = vsetq_lane_u32(horizontal_add_uint32x4(sum[1]), res, 1);
+  res = vsetq_lane_u32(horizontal_add_uint32x4(sum[2]), res, 2);
+  res = vsetq_lane_u32(horizontal_add_uint32x4(sum[3]), res, 3);
+  return res;
+#endif
+}
+
+static INLINE uint64_t horizontal_long_add_uint32x4(const uint32x4_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddlvq_u32(a);
+#else
+  const uint64x2_t b = vpaddlq_u32(a);
+  return vgetq_lane_u64(b, 0) + vgetq_lane_u64(b, 1);
+#endif
+}
+
+static INLINE int64_t horizontal_add_int64x2(const int64x2_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddvq_s64(a);
+#else
+  return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1);
+#endif
+}
+
+static INLINE uint64_t horizontal_add_uint64x2(const uint64x2_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddvq_u64(a);
+#else
+  return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1);
+#endif
+}
+
+static INLINE uint64_t horizontal_long_add_uint32x4_x2(const uint32x4_t a[2]) {
+  return horizontal_long_add_uint32x4(a[0]) +
+         horizontal_long_add_uint32x4(a[1]);
+}
+
+static INLINE uint64_t horizontal_long_add_uint32x4_x4(const uint32x4_t a[4]) {
+  uint64x2_t sum = vpaddlq_u32(a[0]);
+  sum = vpadalq_u32(sum, a[1]);
+  sum = vpadalq_u32(sum, a[2]);
+  sum = vpadalq_u32(sum, a[3]);
+
+  return horizontal_add_uint64x2(sum);
+}
+
+static INLINE uint64_t horizontal_long_add_uint32x4_x8(const uint32x4_t a[8]) {
+  uint64x2_t sum[2];
+  sum[0] = vpaddlq_u32(a[0]);
+  sum[1] = vpaddlq_u32(a[1]);
+  sum[0] = vpadalq_u32(sum[0], a[2]);
+  sum[1] = vpadalq_u32(sum[1], a[3]);
+  sum[0] = vpadalq_u32(sum[0], a[4]);
+  sum[1] = vpadalq_u32(sum[1], a[5]);
+  sum[0] = vpadalq_u32(sum[0], a[6]);
+  sum[1] = vpadalq_u32(sum[1], a[7]);
+
+  return horizontal_add_uint64x2(vaddq_u64(sum[0], sum[1]));
+}
+
+static INLINE uint64_t
+horizontal_long_add_uint32x4_x16(const uint32x4_t a[16]) {
+  uint64x2_t sum[2];
+  sum[0] = vpaddlq_u32(a[0]);
+  sum[1] = vpaddlq_u32(a[1]);
+  sum[0] = vpadalq_u32(sum[0], a[2]);
+  sum[1] = vpadalq_u32(sum[1], a[3]);
+  sum[0] = vpadalq_u32(sum[0], a[4]);
+  sum[1] = vpadalq_u32(sum[1], a[5]);
+  sum[0] = vpadalq_u32(sum[0], a[6]);
+  sum[1] = vpadalq_u32(sum[1], a[7]);
+  sum[0] = vpadalq_u32(sum[0], a[8]);
+  sum[1] = vpadalq_u32(sum[1], a[9]);
+  sum[0] = vpadalq_u32(sum[0], a[10]);
+  sum[1] = vpadalq_u32(sum[1], a[11]);
+  sum[0] = vpadalq_u32(sum[0], a[12]);
+  sum[1] = vpadalq_u32(sum[1], a[13]);
+  sum[0] = vpadalq_u32(sum[0], a[14]);
+  sum[1] = vpadalq_u32(sum[1], a[15]);
+
+  return horizontal_add_uint64x2(vaddq_u64(sum[0], sum[1]));
+}
+
 #endif  // VPX_VPX_DSP_ARM_SUM_NEON_H_
diff --git a/vpx_dsp/arm/sum_squares_neon.c b/vpx_dsp/arm/sum_squares_neon.c
index cfefad9938..074afe3258 100644
--- a/vpx_dsp/arm/sum_squares_neon.c
+++ b/vpx_dsp/arm/sum_squares_neon.c
@@ -9,77 +9,92 @@
  */
 
 #include <arm_neon.h>
-
 #include <assert.h>
+
 #include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/sum_neon.h"
 
 uint64_t vpx_sum_squares_2d_i16_neon(const int16_t *src, int stride, int size) {
-  uint64x1_t s2;
-
   if (size == 4) {
     int16x4_t s[4];
-    int32x4_t s0;
-    uint32x2_t s1;
+    int32x4_t sum_s32;
 
     s[0] = vld1_s16(src + 0 * stride);
     s[1] = vld1_s16(src + 1 * stride);
     s[2] = vld1_s16(src + 2 * stride);
     s[3] = vld1_s16(src + 3 * stride);
-    s0 = vmull_s16(s[0], s[0]);
-    s0 = vmlal_s16(s0, s[1], s[1]);
-    s0 = vmlal_s16(s0, s[2], s[2]);
-    s0 = vmlal_s16(s0, s[3], s[3]);
-    s1 = vpadd_u32(vget_low_u32(vreinterpretq_u32_s32(s0)),
-                   vget_high_u32(vreinterpretq_u32_s32(s0)));
-    s2 = vpaddl_u32(s1);
+
+    sum_s32 = vmull_s16(s[0], s[0]);
+    sum_s32 = vmlal_s16(sum_s32, s[1], s[1]);
+    sum_s32 = vmlal_s16(sum_s32, s[2], s[2]);
+    sum_s32 = vmlal_s16(sum_s32, s[3], s[3]);
+
+    return horizontal_long_add_uint32x4(vreinterpretq_u32_s32(sum_s32));
   } else {
-    int r = size;
-    uint64x2_t s1 = vdupq_n_u64(0);
+    uint64x2_t sum_u64 = vdupq_n_u64(0);
+    int rows = size;
 
     do {
-      int c = size;
-      int32x4_t s0 = vdupq_n_s32(0);
-      const int16_t *src_t = src;
+      const int16_t *src_ptr = src;
+      int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+      int cols = size;
 
       do {
         int16x8_t s[8];
 
-        s[0] = vld1q_s16(src_t + 0 * stride);
-        s[1] = vld1q_s16(src_t + 1 * stride);
-        s[2] = vld1q_s16(src_t + 2 * stride);
-        s[3] = vld1q_s16(src_t + 3 * stride);
-        s[4] = vld1q_s16(src_t + 4 * stride);
-        s[5] = vld1q_s16(src_t + 5 * stride);
-        s[6] = vld1q_s16(src_t + 6 * stride);
-        s[7] = vld1q_s16(src_t + 7 * stride);
-        s0 = vmlal_s16(s0, vget_low_s16(s[0]), vget_low_s16(s[0]));
-        s0 = vmlal_s16(s0, vget_low_s16(s[1]), vget_low_s16(s[1]));
-        s0 = vmlal_s16(s0, vget_low_s16(s[2]), vget_low_s16(s[2]));
-        s0 = vmlal_s16(s0, vget_low_s16(s[3]), vget_low_s16(s[3]));
-        s0 = vmlal_s16(s0, vget_low_s16(s[4]), vget_low_s16(s[4]));
-        s0 = vmlal_s16(s0, vget_low_s16(s[5]), vget_low_s16(s[5]));
-        s0 = vmlal_s16(s0, vget_low_s16(s[6]), vget_low_s16(s[6]));
-        s0 = vmlal_s16(s0, vget_low_s16(s[7]), vget_low_s16(s[7]));
-        s0 = vmlal_s16(s0, vget_high_s16(s[0]), vget_high_s16(s[0]));
-        s0 = vmlal_s16(s0, vget_high_s16(s[1]), vget_high_s16(s[1]));
-        s0 = vmlal_s16(s0, vget_high_s16(s[2]), vget_high_s16(s[2]));
-        s0 = vmlal_s16(s0, vget_high_s16(s[3]), vget_high_s16(s[3]));
-        s0 = vmlal_s16(s0, vget_high_s16(s[4]), vget_high_s16(s[4]));
-        s0 = vmlal_s16(s0, vget_high_s16(s[5]), vget_high_s16(s[5]));
-        s0 = vmlal_s16(s0, vget_high_s16(s[6]), vget_high_s16(s[6]));
-        s0 = vmlal_s16(s0, vget_high_s16(s[7]), vget_high_s16(s[7]));
-        src_t += 8;
-        c -= 8;
-      } while (c);
+        s[0] = vld1q_s16(src_ptr + 0 * stride);
+        s[1] = vld1q_s16(src_ptr + 1 * stride);
+        s[2] = vld1q_s16(src_ptr + 2 * stride);
+        s[3] = vld1q_s16(src_ptr + 3 * stride);
+        s[4] = vld1q_s16(src_ptr + 4 * stride);
+        s[5] = vld1q_s16(src_ptr + 5 * stride);
+        s[6] = vld1q_s16(src_ptr + 6 * stride);
+        s[7] = vld1q_s16(src_ptr + 7 * stride);
 
-      s1 = vaddw_u32(s1, vget_low_u32(vreinterpretq_u32_s32(s0)));
-      s1 = vaddw_u32(s1, vget_high_u32(vreinterpretq_u32_s32(s0)));
+        sum_s32[0] =
+            vmlal_s16(sum_s32[0], vget_low_s16(s[0]), vget_low_s16(s[0]));
+        sum_s32[0] =
+            vmlal_s16(sum_s32[0], vget_low_s16(s[1]), vget_low_s16(s[1]));
+        sum_s32[0] =
+            vmlal_s16(sum_s32[0], vget_low_s16(s[2]), vget_low_s16(s[2]));
+        sum_s32[0] =
+            vmlal_s16(sum_s32[0], vget_low_s16(s[3]), vget_low_s16(s[3]));
+        sum_s32[0] =
+            vmlal_s16(sum_s32[0], vget_low_s16(s[4]), vget_low_s16(s[4]));
+        sum_s32[0] =
+            vmlal_s16(sum_s32[0], vget_low_s16(s[5]), vget_low_s16(s[5]));
+        sum_s32[0] =
+            vmlal_s16(sum_s32[0], vget_low_s16(s[6]), vget_low_s16(s[6]));
+        sum_s32[0] =
+            vmlal_s16(sum_s32[0], vget_low_s16(s[7]), vget_low_s16(s[7]));
+
+        sum_s32[1] =
+            vmlal_s16(sum_s32[1], vget_high_s16(s[0]), vget_high_s16(s[0]));
+        sum_s32[1] =
+            vmlal_s16(sum_s32[1], vget_high_s16(s[1]), vget_high_s16(s[1]));
+        sum_s32[1] =
+            vmlal_s16(sum_s32[1], vget_high_s16(s[2]), vget_high_s16(s[2]));
+        sum_s32[1] =
+            vmlal_s16(sum_s32[1], vget_high_s16(s[3]), vget_high_s16(s[3]));
+        sum_s32[1] =
+            vmlal_s16(sum_s32[1], vget_high_s16(s[4]), vget_high_s16(s[4]));
+        sum_s32[1] =
+            vmlal_s16(sum_s32[1], vget_high_s16(s[5]), vget_high_s16(s[5]));
+        sum_s32[1] =
+            vmlal_s16(sum_s32[1], vget_high_s16(s[6]), vget_high_s16(s[6]));
+        sum_s32[1] =
+            vmlal_s16(sum_s32[1], vget_high_s16(s[7]), vget_high_s16(s[7]));
+
+        src_ptr += 8;
+        cols -= 8;
+      } while (cols);
+
+      sum_u64 = vpadalq_u32(sum_u64, vreinterpretq_u32_s32(sum_s32[0]));
+      sum_u64 = vpadalq_u32(sum_u64, vreinterpretq_u32_s32(sum_s32[1]));
       src += 8 * stride;
-      r -= 8;
-    } while (r);
+      rows -= 8;
+    } while (rows);
 
-    s2 = vadd_u64(vget_low_u64(s1), vget_high_u64(s1));
+    return horizontal_add_uint64x2(sum_u64);
   }
-
-  return vget_lane_u64(s2, 0);
 }
diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h
index 752308160d..74f85a6bb6 100644
--- a/vpx_dsp/arm/transpose_neon.h
+++ b/vpx_dsp/arm/transpose_neon.h
@@ -23,44 +23,77 @@
 // b0.val[1]: 04 05 06 07 20 21 22 23
 static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
   int16x8x2_t b0;
+#if VPX_ARCH_AARCH64
+  b0.val[0] = vreinterpretq_s16_s64(
+      vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
+  b0.val[1] = vreinterpretq_s16_s64(
+      vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
+#else
   b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
                            vreinterpret_s16_s32(vget_low_s32(a1)));
   b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
                            vreinterpret_s16_s32(vget_high_s32(a1)));
+#endif
   return b0;
 }
 
 static INLINE int32x4x2_t vpx_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
   int32x4x2_t b0;
+#if VPX_ARCH_AARCH64
+  b0.val[0] = vreinterpretq_s32_s64(
+      vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
+  b0.val[1] = vreinterpretq_s32_s64(
+      vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
+#else
   b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
   b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
+#endif
   return b0;
 }
 
 static INLINE int64x2x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) {
   int64x2x2_t b0;
+#if VPX_ARCH_AARCH64
+  b0.val[0] = vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1));
+  b0.val[1] = vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1));
+#else
   b0.val[0] = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(a0)),
                            vreinterpret_s64_s32(vget_low_s32(a1)));
   b0.val[1] = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(a0)),
                            vreinterpret_s64_s32(vget_high_s32(a1)));
+#endif
   return b0;
 }
 
 static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) {
   uint8x16x2_t b0;
+#if VPX_ARCH_AARCH64
+  b0.val[0] = vreinterpretq_u8_u64(
+      vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
+  b0.val[1] = vreinterpretq_u8_u64(
+      vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
+#else
   b0.val[0] = vcombine_u8(vreinterpret_u8_u32(vget_low_u32(a0)),
                           vreinterpret_u8_u32(vget_low_u32(a1)));
   b0.val[1] = vcombine_u8(vreinterpret_u8_u32(vget_high_u32(a0)),
                           vreinterpret_u8_u32(vget_high_u32(a1)));
+#endif
   return b0;
 }
 
 static INLINE uint16x8x2_t vpx_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) {
   uint16x8x2_t b0;
+#if VPX_ARCH_AARCH64
+  b0.val[0] = vreinterpretq_u16_u64(
+      vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
+  b0.val[1] = vreinterpretq_u16_u64(
+      vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
+#else
   b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
                            vreinterpret_u16_u32(vget_low_u32(a1)));
   b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
                            vreinterpret_u16_u32(vget_high_u32(a1)));
+#endif
   return b0;
 }
 
@@ -141,17 +174,13 @@ static INLINE void transpose_s16_4x4q(int16x8_t *a0, int16x8_t *a1) {
   // c0: 00 01 20 21  02 03 22 23
   // c1: 10 11 30 31  12 13 32 33
 
-  const int32x4_t c0 =
-      vcombine_s32(vget_low_s32(b0.val[0]), vget_low_s32(b0.val[1]));
-  const int32x4_t c1 =
-      vcombine_s32(vget_high_s32(b0.val[0]), vget_high_s32(b0.val[1]));
+  const int16x8x2_t c0 = vpx_vtrnq_s64_to_s16(b0.val[0], b0.val[1]);
 
   // Swap 16 bit elements resulting in:
   // d0.val[0]: 00 10 20 30  02 12 22 32
   // d0.val[1]: 01 11 21 31  03 13 23 33
 
-  const int16x8x2_t d0 =
-      vtrnq_s16(vreinterpretq_s16_s32(c0), vreinterpretq_s16_s32(c1));
+  const int16x8x2_t d0 = vtrnq_s16(c0.val[0], c0.val[1]);
 
   *a0 = d0.val[0];
   *a1 = d0.val[1];
@@ -172,17 +201,13 @@ static INLINE void transpose_u16_4x4q(uint16x8_t *a0, uint16x8_t *a1) {
   // c0: 00 01 20 21  02 03 22 23
   // c1: 10 11 30 31  12 13 32 33
 
-  const uint32x4_t c0 =
-      vcombine_u32(vget_low_u32(b0.val[0]), vget_low_u32(b0.val[1]));
-  const uint32x4_t c1 =
-      vcombine_u32(vget_high_u32(b0.val[0]), vget_high_u32(b0.val[1]));
+  const uint16x8x2_t c0 = vpx_vtrnq_u64_to_u16(b0.val[0], b0.val[1]);
 
   // Swap 16 bit elements resulting in:
   // d0.val[0]: 00 10 20 30  02 12 22 32
   // d0.val[1]: 01 11 21 31  03 13 23 33
 
-  const uint16x8x2_t d0 =
-      vtrnq_u16(vreinterpretq_u16_u32(c0), vreinterpretq_u16_u32(c1));
+  const uint16x8x2_t d0 = vtrnq_u16(c0.val[0], c0.val[1]);
 
   *a0 = d0.val[0];
   *a1 = d0.val[1];
@@ -281,7 +306,7 @@ static INLINE void transpose_s16_4x8(const int16x4_t a0, const int16x4_t a1,
                                      const int16x4_t a6, const int16x4_t a7,
                                      int16x8_t *const o0, int16x8_t *const o1,
                                      int16x8_t *const o2, int16x8_t *const o3) {
-  // Swap 16 bit elements. Goes from:
+  // Combine rows. Goes from:
   // a0: 00 01 02 03
   // a1: 10 11 12 13
   // a2: 20 21 22 23
@@ -291,53 +316,40 @@ static INLINE void transpose_s16_4x8(const int16x4_t a0, const int16x4_t a1,
   // a6: 60 61 62 63
   // a7: 70 71 72 73
   // to:
-  // b0.val[0]: 00 10 02 12
-  // b0.val[1]: 01 11 03 13
-  // b1.val[0]: 20 30 22 32
-  // b1.val[1]: 21 31 23 33
-  // b2.val[0]: 40 50 42 52
-  // b2.val[1]: 41 51 43 53
-  // b3.val[0]: 60 70 62 72
-  // b3.val[1]: 61 71 63 73
+  // b0: 00 01 02 03 40 41 42 43
+  // b1: 10 11 12 13 50 51 52 53
+  // b2: 20 21 22 23 60 61 62 63
+  // b3: 30 31 32 33 70 71 72 73
+
+  const int16x8_t b0 = vcombine_s16(a0, a4);
+  const int16x8_t b1 = vcombine_s16(a1, a5);
+  const int16x8_t b2 = vcombine_s16(a2, a6);
+  const int16x8_t b3 = vcombine_s16(a3, a7);
+
+  // Swap 16 bit elements resulting in:
+  // c0.val[0]: 00 10 02 12 40 50 42 52
+  // c0.val[1]: 01 11 03 13 41 51 43 53
+  // c1.val[0]: 20 30 22 32 60 70 62 72
+  // c1.val[1]: 21 31 23 33 61 71 63 73
 
-  const int16x4x2_t b0 = vtrn_s16(a0, a1);
-  const int16x4x2_t b1 = vtrn_s16(a2, a3);
-  const int16x4x2_t b2 = vtrn_s16(a4, a5);
-  const int16x4x2_t b3 = vtrn_s16(a6, a7);
+  const int16x8x2_t c0 = vtrnq_s16(b0, b1);
+  const int16x8x2_t c1 = vtrnq_s16(b2, b3);
 
   // Swap 32 bit elements resulting in:
-  // c0.val[0]: 00 10 20 30
-  // c0.val[1]: 02 12 22 32
-  // c1.val[0]: 01 11 21 31
-  // c1.val[1]: 03 13 23 33
-  // c2.val[0]: 40 50 60 70
-  // c2.val[1]: 42 52 62 72
-  // c3.val[0]: 41 51 61 71
-  // c3.val[1]: 43 53 63 73
+  // d0.val[0]: 00 10 20 30 40 50 60 70
+  // d0.val[1]: 02 12 22 32 42 52 62 72
+  // d1.val[0]: 01 11 21 31 41 51 61 71
+  // d1.val[1]: 03 13 23 33 43 53 63 73
 
-  const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
-                                  vreinterpret_s32_s16(b1.val[0]));
-  const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
-                                  vreinterpret_s32_s16(b1.val[1]));
-  const int32x2x2_t c2 = vtrn_s32(vreinterpret_s32_s16(b2.val[0]),
-                                  vreinterpret_s32_s16(b3.val[0]));
-  const int32x2x2_t c3 = vtrn_s32(vreinterpret_s32_s16(b2.val[1]),
-                                  vreinterpret_s32_s16(b3.val[1]));
+  const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
+                                   vreinterpretq_s32_s16(c1.val[0]));
+  const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
+                                   vreinterpretq_s32_s16(c1.val[1]));
 
-  // Swap 64 bit elements resulting in:
-  // o0: 00 10 20 30 40 50 60 70
-  // o1: 01 11 21 31 41 51 61 71
-  // o2: 02 12 22 32 42 52 62 72
-  // o3: 03 13 23 33 43 53 63 73
-
-  *o0 = vcombine_s16(vreinterpret_s16_s32(c0.val[0]),
-                     vreinterpret_s16_s32(c2.val[0]));
-  *o1 = vcombine_s16(vreinterpret_s16_s32(c1.val[0]),
-                     vreinterpret_s16_s32(c3.val[0]));
-  *o2 = vcombine_s16(vreinterpret_s16_s32(c0.val[1]),
-                     vreinterpret_s16_s32(c2.val[1]));
-  *o3 = vcombine_s16(vreinterpret_s16_s32(c1.val[1]),
-                     vreinterpret_s16_s32(c3.val[1]));
+  *o0 = vreinterpretq_s16_s32(d0.val[0]);
+  *o1 = vreinterpretq_s16_s32(d1.val[0]);
+  *o2 = vreinterpretq_s16_s32(d0.val[1]);
+  *o3 = vreinterpretq_s16_s32(d1.val[1]);
 }
 
 static INLINE void transpose_s32_4x8(int32x4_t *const a0, int32x4_t *const a1,
@@ -568,6 +580,76 @@ static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
   *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
 }
 
+// Transpose 8x8 to a new location.
+static INLINE void transpose_s16_8x8q(int16x8_t *a, int16x8_t *out) {
+  // Swap 16 bit elements. Goes from:
+  // a0: 00 01 02 03 04 05 06 07
+  // a1: 10 11 12 13 14 15 16 17
+  // a2: 20 21 22 23 24 25 26 27
+  // a3: 30 31 32 33 34 35 36 37
+  // a4: 40 41 42 43 44 45 46 47
+  // a5: 50 51 52 53 54 55 56 57
+  // a6: 60 61 62 63 64 65 66 67
+  // a7: 70 71 72 73 74 75 76 77
+  // to:
+  // b0.val[0]: 00 10 02 12 04 14 06 16
+  // b0.val[1]: 01 11 03 13 05 15 07 17
+  // b1.val[0]: 20 30 22 32 24 34 26 36
+  // b1.val[1]: 21 31 23 33 25 35 27 37
+  // b2.val[0]: 40 50 42 52 44 54 46 56
+  // b2.val[1]: 41 51 43 53 45 55 47 57
+  // b3.val[0]: 60 70 62 72 64 74 66 76
+  // b3.val[1]: 61 71 63 73 65 75 67 77
+
+  const int16x8x2_t b0 = vtrnq_s16(a[0], a[1]);
+  const int16x8x2_t b1 = vtrnq_s16(a[2], a[3]);
+  const int16x8x2_t b2 = vtrnq_s16(a[4], a[5]);
+  const int16x8x2_t b3 = vtrnq_s16(a[6], a[7]);
+
+  // Swap 32 bit elements resulting in:
+  // c0.val[0]: 00 10 20 30 04 14 24 34
+  // c0.val[1]: 02 12 22 32 06 16 26 36
+  // c1.val[0]: 01 11 21 31 05 15 25 35
+  // c1.val[1]: 03 13 23 33 07 17 27 37
+  // c2.val[0]: 40 50 60 70 44 54 64 74
+  // c2.val[1]: 42 52 62 72 46 56 66 76
+  // c3.val[0]: 41 51 61 71 45 55 65 75
+  // c3.val[1]: 43 53 63 73 47 57 67 77
+
+  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+                                   vreinterpretq_s32_s16(b1.val[0]));
+  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+                                   vreinterpretq_s32_s16(b1.val[1]));
+  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+                                   vreinterpretq_s32_s16(b3.val[0]));
+  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
+                                   vreinterpretq_s32_s16(b3.val[1]));
+
+  // Swap 64 bit elements resulting in:
+  // d0.val[0]: 00 10 20 30 40 50 60 70
+  // d0.val[1]: 04 14 24 34 44 54 64 74
+  // d1.val[0]: 01 11 21 31 41 51 61 71
+  // d1.val[1]: 05 15 25 35 45 55 65 75
+  // d2.val[0]: 02 12 22 32 42 52 62 72
+  // d2.val[1]: 06 16 26 36 46 56 66 76
+  // d3.val[0]: 03 13 23 33 43 53 63 73
+  // d3.val[1]: 07 17 27 37 47 57 67 77
+
+  const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
+  const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
+  const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
+  const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);
+
+  out[0] = d0.val[0];
+  out[1] = d1.val[0];
+  out[2] = d2.val[0];
+  out[3] = d3.val[0];
+  out[4] = d0.val[1];
+  out[5] = d1.val[1];
+  out[6] = d2.val[1];
+  out[7] = d3.val[1];
+}
+
 static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
                                      int16x8_t *a2, int16x8_t *a3,
                                      int16x8_t *a4, int16x8_t *a5,
@@ -624,6 +706,7 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
   // d2.val[1]: 06 16 26 36 46 56 66 76
   // d3.val[0]: 03 13 23 33 43 53 63 73
   // d3.val[1]: 07 17 27 37 47 57 67 77
+
   const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
   const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
   const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
@@ -695,6 +778,7 @@ static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1,
   // d2.val[1]: 06 16 26 36 46 56 66 76
   // d3.val[0]: 03 13 23 33 43 53 63 73
   // d3.val[1]: 07 17 27 37 47 57 67 77
+
   const uint16x8x2_t d0 = vpx_vtrnq_u64_to_u16(c0.val[0], c2.val[0]);
   const uint16x8x2_t d1 = vpx_vtrnq_u64_to_u16(c1.val[0], c3.val[0]);
   const uint16x8x2_t d2 = vpx_vtrnq_u64_to_u16(c0.val[1], c2.val[1]);
@@ -787,6 +871,113 @@ static INLINE void transpose_s32_8x8(int32x4x2_t *a0, int32x4x2_t *a1,
   a7->val[1] = c7.val[1];
 }
 
+// Helper transpose function for highbd FDCT variants
+static INLINE void transpose_s32_8x8_2(int32x4_t *left /*[8]*/,
+                                       int32x4_t *right /*[8]*/,
+                                       int32x4_t *out_left /*[8]*/,
+                                       int32x4_t *out_right /*[8]*/) {
+  int32x4x2_t out[8];
+
+  out[0].val[0] = left[0];
+  out[0].val[1] = right[0];
+  out[1].val[0] = left[1];
+  out[1].val[1] = right[1];
+  out[2].val[0] = left[2];
+  out[2].val[1] = right[2];
+  out[3].val[0] = left[3];
+  out[3].val[1] = right[3];
+  out[4].val[0] = left[4];
+  out[4].val[1] = right[4];
+  out[5].val[0] = left[5];
+  out[5].val[1] = right[5];
+  out[6].val[0] = left[6];
+  out[6].val[1] = right[6];
+  out[7].val[0] = left[7];
+  out[7].val[1] = right[7];
+
+  transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+                    &out[6], &out[7]);
+
+  out_left[0] = out[0].val[0];
+  out_left[1] = out[1].val[0];
+  out_left[2] = out[2].val[0];
+  out_left[3] = out[3].val[0];
+  out_left[4] = out[4].val[0];
+  out_left[5] = out[5].val[0];
+  out_left[6] = out[6].val[0];
+  out_left[7] = out[7].val[0];
+  out_right[0] = out[0].val[1];
+  out_right[1] = out[1].val[1];
+  out_right[2] = out[2].val[1];
+  out_right[3] = out[3].val[1];
+  out_right[4] = out[4].val[1];
+  out_right[5] = out[5].val[1];
+  out_right[6] = out[6].val[1];
+  out_right[7] = out[7].val[1];
+}
+
+static INLINE void transpose_s32_16x16(int32x4_t *left1, int32x4_t *right1,
+                                       int32x4_t *left2, int32x4_t *right2) {
+  int32x4_t tl[16], tr[16];
+
+  // transpose the 4 8x8 quadrants separately but first swap quadrants 2 and 3.
+  tl[0] = left1[8];
+  tl[1] = left1[9];
+  tl[2] = left1[10];
+  tl[3] = left1[11];
+  tl[4] = left1[12];
+  tl[5] = left1[13];
+  tl[6] = left1[14];
+  tl[7] = left1[15];
+  tr[0] = right1[8];
+  tr[1] = right1[9];
+  tr[2] = right1[10];
+  tr[3] = right1[11];
+  tr[4] = right1[12];
+  tr[5] = right1[13];
+  tr[6] = right1[14];
+  tr[7] = right1[15];
+
+  left1[8] = left2[0];
+  left1[9] = left2[1];
+  left1[10] = left2[2];
+  left1[11] = left2[3];
+  left1[12] = left2[4];
+  left1[13] = left2[5];
+  left1[14] = left2[6];
+  left1[15] = left2[7];
+  right1[8] = right2[0];
+  right1[9] = right2[1];
+  right1[10] = right2[2];
+  right1[11] = right2[3];
+  right1[12] = right2[4];
+  right1[13] = right2[5];
+  right1[14] = right2[6];
+  right1[15] = right2[7];
+
+  left2[0] = tl[0];
+  left2[1] = tl[1];
+  left2[2] = tl[2];
+  left2[3] = tl[3];
+  left2[4] = tl[4];
+  left2[5] = tl[5];
+  left2[6] = tl[6];
+  left2[7] = tl[7];
+  right2[0] = tr[0];
+  right2[1] = tr[1];
+  right2[2] = tr[2];
+  right2[3] = tr[3];
+  right2[4] = tr[4];
+  right2[5] = tr[5];
+  right2[6] = tr[6];
+  right2[7] = tr[7];
+
+  transpose_s32_8x8_2(left1, right1, left1, right1);
+  transpose_s32_8x8_2(left2, right2, left2, right2);
+  transpose_s32_8x8_2(left1 + 8, right1 + 8, left1 + 8, right1 + 8);
+  transpose_s32_8x8_2(left2 + 8, right2 + 8, left2 + 8, right2 + 8);
+}
+
 static INLINE void transpose_u8_16x8(
     const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2,
     const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5,
@@ -1184,6 +1375,45 @@ static INLINE void transpose_u8_16x16(
   *o15 = e7.val[1];
 }
 
+static INLINE void transpose_s16_16x16(int16x8_t *in0, int16x8_t *in1) {
+  int16x8_t t[8];
+
+  // transpose the 4 8x8 quadrants separately but first swap quadrants 2 and 3.
+  t[0] = in0[8];
+  t[1] = in0[9];
+  t[2] = in0[10];
+  t[3] = in0[11];
+  t[4] = in0[12];
+  t[5] = in0[13];
+  t[6] = in0[14];
+  t[7] = in0[15];
+  in0[8] = in1[0];
+  in0[9] = in1[1];
+  in0[10] = in1[2];
+  in0[11] = in1[3];
+  in0[12] = in1[4];
+  in0[13] = in1[5];
+  in0[14] = in1[6];
+  in0[15] = in1[7];
+  in1[0] = t[0];
+  in1[1] = t[1];
+  in1[2] = t[2];
+  in1[3] = t[3];
+  in1[4] = t[4];
+  in1[5] = t[5];
+  in1[6] = t[6];
+  in1[7] = t[7];
+
+  transpose_s16_8x8(&in0[0], &in0[1], &in0[2], &in0[3], &in0[4], &in0[5],
+                    &in0[6], &in0[7]);
+  transpose_s16_8x8(&in0[8], &in0[9], &in0[10], &in0[11], &in0[12], &in0[13],
+                    &in0[14], &in0[15]);
+  transpose_s16_8x8(&in1[0], &in1[1], &in1[2], &in1[3], &in1[4], &in1[5],
+                    &in1[6], &in1[7]);
+  transpose_s16_8x8(&in1[8], &in1[9], &in1[10], &in1[11], &in1[12], &in1[13],
+                    &in1[14], &in1[15]);
+}
+
 static INLINE void load_and_transpose_u8_4x8(const uint8_t *a,
                                              const int a_stride, uint8x8_t *a0,
                                              uint8x8_t *a1, uint8x8_t *a2,
diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c
index 77b1015b74..efb2c1d8da 100644
--- a/vpx_dsp/arm/variance_neon.c
+++ b/vpx_dsp/arm/variance_neon.c
@@ -19,362 +19,314 @@
 #include "vpx_dsp/arm/sum_neon.h"
 #include "vpx_ports/mem.h"
 
-// The variance helper functions use int16_t for sum. 8 values are accumulated
-// and then added (at which point they expand up to int32_t). To avoid overflow,
-// there can be no more than 32767 / 255 ~= 128 values accumulated in each
-// column. For a 32x32 buffer, this results in 32 / 8 = 4 values per row * 32
-// rows = 128. Asserts have been added to each function to warn against reaching
-// this limit.
-
-// Process a block of width 4 four rows at a time.
-static void variance_neon_w4x4(const uint8_t *src_ptr, int src_stride,
-                               const uint8_t *ref_ptr, int ref_stride, int h,
-                               uint32_t *sse, int *sum) {
-  int i;
+// Process a block of width 4 two rows at a time.
+static INLINE void variance_4xh_neon(const uint8_t *src_ptr, int src_stride,
+                                     const uint8_t *ref_ptr, int ref_stride,
+                                     int h, uint32_t *sse, int *sum) {
   int16x8_t sum_s16 = vdupq_n_s16(0);
-  int32x4_t sse_lo_s32 = vdupq_n_s32(0);
-  int32x4_t sse_hi_s32 = vdupq_n_s32(0);
+  int32x4_t sse_s32 = vdupq_n_s32(0);
+  int i = h;
 
-  // Since width is only 4, sum_s16 only loads a half row per loop.
+  // Number of rows we can process before 'sum_s16' overflows:
+  // 32767 / 255 ~= 128, but we use an 8-wide accumulator; so 256 4-wide rows.
   assert(h <= 256);
 
-  for (i = 0; i < h; i += 4) {
-    const uint8x16_t a_u8 = load_unaligned_u8q(src_ptr, src_stride);
-    const uint8x16_t b_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
-    const uint16x8_t diff_lo_u16 =
-        vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8));
-    const uint16x8_t diff_hi_u16 =
-        vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8));
-
-    const int16x8_t diff_lo_s16 = vreinterpretq_s16_u16(diff_lo_u16);
-    const int16x8_t diff_hi_s16 = vreinterpretq_s16_u16(diff_hi_u16);
-
-    sum_s16 = vaddq_s16(sum_s16, diff_lo_s16);
-    sum_s16 = vaddq_s16(sum_s16, diff_hi_s16);
+  do {
+    const uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
+    const uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
+    const int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r));
 
-    sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_lo_s16),
-                           vget_low_s16(diff_lo_s16));
-    sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_high_s16(diff_lo_s16),
-                           vget_high_s16(diff_lo_s16));
+    sum_s16 = vaddq_s16(sum_s16, diff);
 
-    sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_low_s16(diff_hi_s16),
-                           vget_low_s16(diff_hi_s16));
-    sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_hi_s16),
-                           vget_high_s16(diff_hi_s16));
+    sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff), vget_low_s16(diff));
+    sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff), vget_high_s16(diff));
 
-    src_ptr += 4 * src_stride;
-    ref_ptr += 4 * ref_stride;
-  }
+    src_ptr += 2 * src_stride;
+    ref_ptr += 2 * ref_stride;
+    i -= 2;
+  } while (i != 0);
 
-  *sum = vget_lane_s32(horizontal_add_int16x8(sum_s16), 0);
-  *sse = vget_lane_u32(horizontal_add_uint32x4(vreinterpretq_u32_s32(
-                           vaddq_s32(sse_lo_s32, sse_hi_s32))),
-                       0);
+  *sum = horizontal_add_int16x8(sum_s16);
+  *sse = (uint32_t)horizontal_add_int32x4(sse_s32);
 }
 
-// Process a block of any size where the width is divisible by 16.
-static void variance_neon_w16(const uint8_t *src_ptr, int src_stride,
-                              const uint8_t *ref_ptr, int ref_stride, int w,
-                              int h, uint32_t *sse, int *sum) {
-  int i, j;
+// Process a block of width 8 one row at a time.
+static INLINE void variance_8xh_neon(const uint8_t *src_ptr, int src_stride,
+                                     const uint8_t *ref_ptr, int ref_stride,
+                                     int h, uint32_t *sse, int *sum) {
   int16x8_t sum_s16 = vdupq_n_s16(0);
-  int32x4_t sse_lo_s32 = vdupq_n_s32(0);
-  int32x4_t sse_hi_s32 = vdupq_n_s32(0);
-
-  // The loop loads 16 values at a time but doubles them up when accumulating
-  // into sum_s16.
-  assert(w / 8 * h <= 128);
-
-  for (i = 0; i < h; ++i) {
-    for (j = 0; j < w; j += 16) {
-      const uint8x16_t a_u8 = vld1q_u8(src_ptr + j);
-      const uint8x16_t b_u8 = vld1q_u8(ref_ptr + j);
-
-      const uint16x8_t diff_lo_u16 =
-          vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8));
-      const uint16x8_t diff_hi_u16 =
-          vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8));
-
-      const int16x8_t diff_lo_s16 = vreinterpretq_s16_u16(diff_lo_u16);
-      const int16x8_t diff_hi_s16 = vreinterpretq_s16_u16(diff_hi_u16);
-
-      sum_s16 = vaddq_s16(sum_s16, diff_lo_s16);
-      sum_s16 = vaddq_s16(sum_s16, diff_hi_s16);
-
-      sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_lo_s16),
-                             vget_low_s16(diff_lo_s16));
-      sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_high_s16(diff_lo_s16),
-                             vget_high_s16(diff_lo_s16));
-
-      sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_low_s16(diff_hi_s16),
-                             vget_low_s16(diff_hi_s16));
-      sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_hi_s16),
-                             vget_high_s16(diff_hi_s16));
-    }
+  int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+  int i = h;
+
+  // Number of rows we can process before 'sum_s16' overflows:
+  // 32767 / 255 ~= 128
+  assert(h <= 128);
+
+  do {
+    const uint8x8_t s = vld1_u8(src_ptr);
+    const uint8x8_t r = vld1_u8(ref_ptr);
+    const int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r));
+
+    sum_s16 = vaddq_s16(sum_s16, diff);
+
+    sse_s32[0] = vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff));
+    sse_s32[1] =
+        vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff));
+
     src_ptr += src_stride;
     ref_ptr += ref_stride;
-  }
+  } while (--i != 0);
 
-  *sum = vget_lane_s32(horizontal_add_int16x8(sum_s16), 0);
-  *sse = vget_lane_u32(horizontal_add_uint32x4(vreinterpretq_u32_s32(
-                           vaddq_s32(sse_lo_s32, sse_hi_s32))),
-                       0);
+  *sum = horizontal_add_int16x8(sum_s16);
+  *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1]));
 }
 
-// Process a block of width 8 two rows at a time.
-static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride,
-                               const uint8_t *ref_ptr, int ref_stride, int h,
-                               uint32_t *sse, int *sum) {
-  int i = 0;
-  int16x8_t sum_s16 = vdupq_n_s16(0);
-  int32x4_t sse_lo_s32 = vdupq_n_s32(0);
-  int32x4_t sse_hi_s32 = vdupq_n_s32(0);
+// Process a block of width 16 one row at a time.
+static INLINE void variance_16xh_neon(const uint8_t *src_ptr, int src_stride,
+                                      const uint8_t *ref_ptr, int ref_stride,
+                                      int h, uint32_t *sse, int *sum) {
+  int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) };
+  int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+  int i = h;
 
-  // Each column has it's own accumulator entry in sum_s16.
+  // Number of rows we can process before 'sum_s16' accumulators overflow:
+  // 32767 / 255 ~= 128, so 128 16-wide rows.
   assert(h <= 128);
 
   do {
-    const uint8x8_t a_0_u8 = vld1_u8(src_ptr);
-    const uint8x8_t a_1_u8 = vld1_u8(src_ptr + src_stride);
-    const uint8x8_t b_0_u8 = vld1_u8(ref_ptr);
-    const uint8x8_t b_1_u8 = vld1_u8(ref_ptr + ref_stride);
-    const uint16x8_t diff_0_u16 = vsubl_u8(a_0_u8, b_0_u8);
-    const uint16x8_t diff_1_u16 = vsubl_u8(a_1_u8, b_1_u8);
-    const int16x8_t diff_0_s16 = vreinterpretq_s16_u16(diff_0_u16);
-    const int16x8_t diff_1_s16 = vreinterpretq_s16_u16(diff_1_u16);
-    sum_s16 = vaddq_s16(sum_s16, diff_0_s16);
-    sum_s16 = vaddq_s16(sum_s16, diff_1_s16);
-    sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_0_s16),
-                           vget_low_s16(diff_0_s16));
-    sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_1_s16),
-                           vget_low_s16(diff_1_s16));
-    sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_0_s16),
-                           vget_high_s16(diff_0_s16));
-    sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_1_s16),
-                           vget_high_s16(diff_1_s16));
-    src_ptr += src_stride + src_stride;
-    ref_ptr += ref_stride + ref_stride;
-    i += 2;
+    const uint8x16_t s = vld1q_u8(src_ptr);
+    const uint8x16_t r = vld1q_u8(ref_ptr);
+
+    const int16x8_t diff_l =
+        vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r)));
+    const int16x8_t diff_h =
+        vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r)));
+
+    sum_s16[0] = vaddq_s16(sum_s16[0], diff_l);
+    sum_s16[1] = vaddq_s16(sum_s16[1], diff_h);
+
+    sse_s32[0] =
+        vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l));
+    sse_s32[1] =
+        vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l));
+    sse_s32[0] =
+        vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h));
+    sse_s32[1] =
+        vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  *sum = horizontal_add_int16x8(vaddq_s16(sum_s16[0], sum_s16[1]));
+  *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1]));
+}
+
+// Process a block of any size where the width is divisible by 16.
+static INLINE void variance_large_neon(const uint8_t *src_ptr, int src_stride,
+                                       const uint8_t *ref_ptr, int ref_stride,
+                                       int w, int h, int h_limit,
+                                       unsigned int *sse, int *sum) {
+  int32x4_t sum_s32 = vdupq_n_s32(0);
+  int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+  // 'h_limit' is the number of 'w'-width rows we can process before our 16-bit
+  // accumulator overflows. After hitting this limit we accumulate into 32-bit
+  // elements.
+  int h_tmp = h > h_limit ? h_limit : h;
+
+  int i = 0;
+  do {
+    int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) };
+    do {
+      int j = 0;
+      do {
+        const uint8x16_t s = vld1q_u8(src_ptr + j);
+        const uint8x16_t r = vld1q_u8(ref_ptr + j);
+
+        const int16x8_t diff_l =
+            vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r)));
+        const int16x8_t diff_h =
+            vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r)));
+
+        sum_s16[0] = vaddq_s16(sum_s16[0], diff_l);
+        sum_s16[1] = vaddq_s16(sum_s16[1], diff_h);
+
+        sse_s32[0] =
+            vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l));
+        sse_s32[1] =
+            vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l));
+        sse_s32[0] =
+            vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h));
+        sse_s32[1] =
+            vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h));
+
+        j += 16;
+      } while (j < w);
+
+      src_ptr += src_stride;
+      ref_ptr += ref_stride;
+      i++;
+    } while (i < h_tmp);
+
+    sum_s32 = vpadalq_s16(sum_s32, sum_s16[0]);
+    sum_s32 = vpadalq_s16(sum_s32, sum_s16[1]);
+
+    h_tmp += h_limit;
   } while (i < h);
 
-  *sum = vget_lane_s32(horizontal_add_int16x8(sum_s16), 0);
-  *sse = vget_lane_u32(horizontal_add_uint32x4(vreinterpretq_u32_s32(
-                           vaddq_s32(sse_lo_s32, sse_hi_s32))),
-                       0);
+  *sum = horizontal_add_int32x4(sum_s32);
+  *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1]));
+}
+
+static INLINE void variance_32xh_neon(const uint8_t *src, int src_stride,
+                                      const uint8_t *ref, int ref_stride, int h,
+                                      uint32_t *sse, int *sum) {
+  variance_large_neon(src, src_stride, ref, ref_stride, 32, h, 64, sse, sum);
+}
+
+static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride,
+                                      const uint8_t *ref, int ref_stride, int h,
+                                      uint32_t *sse, int *sum) {
+  variance_large_neon(src, src_stride, ref, ref_stride, 64, h, 32, sse, sum);
 }
 
 void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride,
                         const uint8_t *ref_ptr, int ref_stride,
                         unsigned int *sse, int *sum) {
-  variance_neon_w8x2(src_ptr, src_stride, ref_ptr, ref_stride, 8, sse, sum);
+  variance_8xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 8, sse, sum);
 }
 
 void vpx_get16x16var_neon(const uint8_t *src_ptr, int src_stride,
                           const uint8_t *ref_ptr, int ref_stride,
                           unsigned int *sse, int *sum) {
-  variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16, sse, sum);
+  variance_16xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, sse, sum);
 }
 
-#define varianceNxM(n, m, shift)                                             \
-  unsigned int vpx_variance##n##x##m##_neon(                                 \
-      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,        \
-      int ref_stride, unsigned int *sse) {                                   \
-    int sum;                                                                 \
-    if (n == 4)                                                              \
-      variance_neon_w4x4(src_ptr, src_stride, ref_ptr, ref_stride, m, sse,   \
-                         &sum);                                              \
-    else if (n == 8)                                                         \
-      variance_neon_w8x2(src_ptr, src_stride, ref_ptr, ref_stride, m, sse,   \
-                         &sum);                                              \
-    else                                                                     \
-      variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, n, m, sse, \
-                        &sum);                                               \
-    if (n * m < 16 * 16)                                                     \
-      return *sse - ((sum * sum) >> shift);                                  \
-    else                                                                     \
-      return *sse - (uint32_t)(((int64_t)sum * sum) >> shift);               \
+#define VARIANCE_WXH_NEON(w, h, shift)                                        \
+  unsigned int vpx_variance##w##x##h##_neon(                                  \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      unsigned int *sse) {                                                    \
+    int sum;                                                                  \
+    variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, sse, &sum);    \
+    return *sse - (uint32_t)(((int64_t)sum * sum) >> shift);                  \
   }
 
-varianceNxM(4, 4, 4);
-varianceNxM(4, 8, 5);
-varianceNxM(8, 4, 5);
-varianceNxM(8, 8, 6);
-varianceNxM(8, 16, 7);
-varianceNxM(16, 8, 7);
-varianceNxM(16, 16, 8);
-varianceNxM(16, 32, 9);
-varianceNxM(32, 16, 9);
-varianceNxM(32, 32, 10);
-
-unsigned int vpx_variance32x64_neon(const uint8_t *src_ptr, int src_stride,
-                                    const uint8_t *ref_ptr, int ref_stride,
-                                    unsigned int *sse) {
-  int sum1, sum2;
-  uint32_t sse1, sse2;
-  variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32, &sse1,
-                    &sum1);
-  variance_neon_w16(src_ptr + (32 * src_stride), src_stride,
-                    ref_ptr + (32 * ref_stride), ref_stride, 32, 32, &sse2,
-                    &sum2);
-  *sse = sse1 + sse2;
-  sum1 += sum2;
-  return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11);
-}
+VARIANCE_WXH_NEON(4, 4, 4)
+VARIANCE_WXH_NEON(4, 8, 5)
 
-unsigned int vpx_variance64x32_neon(const uint8_t *src_ptr, int src_stride,
-                                    const uint8_t *ref_ptr, int ref_stride,
-                                    unsigned int *sse) {
-  int sum1, sum2;
-  uint32_t sse1, sse2;
-  variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 64, 16, &sse1,
-                    &sum1);
-  variance_neon_w16(src_ptr + (16 * src_stride), src_stride,
-                    ref_ptr + (16 * ref_stride), ref_stride, 64, 16, &sse2,
-                    &sum2);
-  *sse = sse1 + sse2;
-  sum1 += sum2;
-  return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11);
-}
+VARIANCE_WXH_NEON(8, 4, 5)
+VARIANCE_WXH_NEON(8, 8, 6)
+VARIANCE_WXH_NEON(8, 16, 7)
 
-unsigned int vpx_variance64x64_neon(const uint8_t *src_ptr, int src_stride,
-                                    const uint8_t *ref_ptr, int ref_stride,
-                                    unsigned int *sse) {
-  int sum1, sum2;
-  uint32_t sse1, sse2;
-
-  variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 64, 16, &sse1,
-                    &sum1);
-  variance_neon_w16(src_ptr + (16 * src_stride), src_stride,
-                    ref_ptr + (16 * ref_stride), ref_stride, 64, 16, &sse2,
-                    &sum2);
-  sse1 += sse2;
-  sum1 += sum2;
-
-  variance_neon_w16(src_ptr + (16 * 2 * src_stride), src_stride,
-                    ref_ptr + (16 * 2 * ref_stride), ref_stride, 64, 16, &sse2,
-                    &sum2);
-  sse1 += sse2;
-  sum1 += sum2;
-
-  variance_neon_w16(src_ptr + (16 * 3 * src_stride), src_stride,
-                    ref_ptr + (16 * 3 * ref_stride), ref_stride, 64, 16, &sse2,
-                    &sum2);
-  *sse = sse1 + sse2;
-  sum1 += sum2;
-  return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12);
-}
+VARIANCE_WXH_NEON(16, 8, 7)
+VARIANCE_WXH_NEON(16, 16, 8)
+VARIANCE_WXH_NEON(16, 32, 9)
+
+VARIANCE_WXH_NEON(32, 16, 9)
+VARIANCE_WXH_NEON(32, 32, 10)
+VARIANCE_WXH_NEON(32, 64, 11)
 
-unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride,
-                               const unsigned char *ref_ptr, int ref_stride,
-                               unsigned int *sse) {
-  int i;
-  int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
-  int64x1_t d0s64;
-  uint8x16_t q0u8, q1u8, q2u8, q3u8;
-  int32x4_t q7s32, q8s32, q9s32, q10s32;
-  uint16x8_t q11u16, q12u16, q13u16, q14u16;
-  int64x2_t q1s64;
-
-  q7s32 = vdupq_n_s32(0);
-  q8s32 = vdupq_n_s32(0);
-  q9s32 = vdupq_n_s32(0);
-  q10s32 = vdupq_n_s32(0);
-
-  for (i = 0; i < 8; i++) {  // mse16x16_neon_loop
-    q0u8 = vld1q_u8(src_ptr);
+VARIANCE_WXH_NEON(64, 32, 11)
+VARIANCE_WXH_NEON(64, 64, 12)
+
+#undef VARIANCE_WXH_NEON
+
+static INLINE unsigned int vpx_mse8xh_neon(const unsigned char *src_ptr,
+                                           int src_stride,
+                                           const unsigned char *ref_ptr,
+                                           int ref_stride, int h) {
+  uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h / 2;
+  do {
+    uint8x8_t s0, s1, r0, r1, diff0, diff1;
+    uint16x8_t sse0, sse1;
+
+    s0 = vld1_u8(src_ptr);
     src_ptr += src_stride;
-    q1u8 = vld1q_u8(src_ptr);
+    s1 = vld1_u8(src_ptr);
     src_ptr += src_stride;
-    q2u8 = vld1q_u8(ref_ptr);
+    r0 = vld1_u8(ref_ptr);
     ref_ptr += ref_stride;
-    q3u8 = vld1q_u8(ref_ptr);
+    r1 = vld1_u8(ref_ptr);
     ref_ptr += ref_stride;
 
-    q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
-    q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
-    q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
-    q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
-
-    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-    d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-    q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
-    q8s32 = vmlal_s16(q8s32, d23s16, d23s16);
-
-    d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-    d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-    q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
-    q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
-    d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
-    d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-    q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
-    q8s32 = vmlal_s16(q8s32, d27s16, d27s16);
-
-    d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
-    d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
-    q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
-    q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
-  }
+    diff0 = vabd_u8(s0, r0);
+    diff1 = vabd_u8(s1, r1);
 
-  q7s32 = vaddq_s32(q7s32, q8s32);
-  q9s32 = vaddq_s32(q9s32, q10s32);
-  q10s32 = vaddq_s32(q7s32, q9s32);
+    sse0 = vmull_u8(diff0, diff0);
+    sse_u32[0] = vpadalq_u16(sse_u32[0], sse0);
+    sse1 = vmull_u8(diff1, diff1);
+    sse_u32[1] = vpadalq_u16(sse_u32[1], sse1);
+  } while (--i != 0);
 
-  q1s64 = vpaddlq_s32(q10s32);
-  d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+  return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
+}
+
+static INLINE unsigned int vpx_mse16xh_neon(const unsigned char *src_ptr,
+                                            int src_stride,
+                                            const unsigned char *ref_ptr,
+                                            int ref_stride, int h) {
+  uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h;
+  do {
+    uint8x16_t s, r, diff;
+    uint16x8_t sse0, sse1;
+
+    s = vld1q_u8(src_ptr);
+    src_ptr += src_stride;
+    r = vld1q_u8(ref_ptr);
+    ref_ptr += ref_stride;
 
-  vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
-  return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
+    diff = vabdq_u8(s, r);
+
+    sse0 = vmull_u8(vget_low_u8(diff), vget_low_u8(diff));
+    sse_u32[0] = vpadalq_u16(sse_u32[0], sse0);
+    sse1 = vmull_u8(vget_high_u8(diff), vget_high_u8(diff));
+    sse_u32[1] = vpadalq_u16(sse_u32[1], sse1);
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
 }
 
 unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride,
                                    const unsigned char *ref_ptr,
                                    int ref_stride) {
-  int16x4_t d22s16, d24s16, d26s16, d28s16;
-  int64x1_t d0s64;
-  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
-  int32x4_t q7s32, q8s32, q9s32, q10s32;
-  uint16x8_t q11u16, q12u16, q13u16, q14u16;
-  int64x2_t q1s64;
-
-  d0u8 = vld1_u8(src_ptr);
-  src_ptr += src_stride;
-  d4u8 = vld1_u8(ref_ptr);
-  ref_ptr += ref_stride;
-  d1u8 = vld1_u8(src_ptr);
-  src_ptr += src_stride;
-  d5u8 = vld1_u8(ref_ptr);
-  ref_ptr += ref_stride;
-  d2u8 = vld1_u8(src_ptr);
-  src_ptr += src_stride;
-  d6u8 = vld1_u8(ref_ptr);
-  ref_ptr += ref_stride;
-  d3u8 = vld1_u8(src_ptr);
-  src_ptr += src_stride;
-  d7u8 = vld1_u8(ref_ptr);
-  ref_ptr += ref_stride;
-
-  q11u16 = vsubl_u8(d0u8, d4u8);
-  q12u16 = vsubl_u8(d1u8, d5u8);
-  q13u16 = vsubl_u8(d2u8, d6u8);
-  q14u16 = vsubl_u8(d3u8, d7u8);
-
-  d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
-  d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
-  d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
-  d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));
-
-  q7s32 = vmull_s16(d22s16, d22s16);
-  q8s32 = vmull_s16(d24s16, d24s16);
-  q9s32 = vmull_s16(d26s16, d26s16);
-  q10s32 = vmull_s16(d28s16, d28s16);
-
-  q7s32 = vaddq_s32(q7s32, q8s32);
-  q9s32 = vaddq_s32(q9s32, q10s32);
-  q9s32 = vaddq_s32(q7s32, q9s32);
-
-  q1s64 = vpaddlq_s32(q9s32);
-  d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
-  return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
+  uint8x8_t s[2], r[2];
+  uint16x8_t abs_diff[2];
+  uint32x4_t sse;
+
+  s[0] = load_u8(src_ptr, src_stride);
+  r[0] = load_u8(ref_ptr, ref_stride);
+  src_ptr += 2 * src_stride;
+  ref_ptr += 2 * ref_stride;
+  s[1] = load_u8(src_ptr, src_stride);
+  r[1] = load_u8(ref_ptr, ref_stride);
+
+  abs_diff[0] = vabdl_u8(s[0], r[0]);
+  abs_diff[1] = vabdl_u8(s[1], r[1]);
+
+  sse = vmull_u16(vget_low_u16(abs_diff[0]), vget_low_u16(abs_diff[0]));
+  sse = vmlal_u16(sse, vget_high_u16(abs_diff[0]), vget_high_u16(abs_diff[0]));
+  sse = vmlal_u16(sse, vget_low_u16(abs_diff[1]), vget_low_u16(abs_diff[1]));
+  sse = vmlal_u16(sse, vget_high_u16(abs_diff[1]), vget_high_u16(abs_diff[1]));
+
+  return horizontal_add_uint32x4(sse);
 }
+
+#define VPX_MSE_WXH_NEON(w, h)                                               \
+  unsigned int vpx_mse##w##x##h##_neon(                                      \
+      const unsigned char *src_ptr, int src_stride,                          \
+      const unsigned char *ref_ptr, int ref_stride, unsigned int *sse) {     \
+    *sse = vpx_mse##w##xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, h); \
+    return *sse;                                                             \
+  }
+
+VPX_MSE_WXH_NEON(8, 8)
+VPX_MSE_WXH_NEON(8, 16)
+VPX_MSE_WXH_NEON(16, 8)
+VPX_MSE_WXH_NEON(16, 16)
+
+#undef VPX_MSE_WXH_NEON
diff --git a/vpx_dsp/arm/variance_neon_dotprod.c b/vpx_dsp/arm/variance_neon_dotprod.c
new file mode 100644
index 0000000000..ab843e9fca
--- /dev/null
+++ b/vpx_dsp/arm/variance_neon_dotprod.c
@@ -0,0 +1,298 @@
+/*
+ *  Copyright (c) 2021 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+#include "vpx_ports/mem.h"
+
+// Process a block of width 4 four rows at a time.
+static INLINE void variance_4xh_neon_dotprod(const uint8_t *src_ptr,
+                                             int src_stride,
+                                             const uint8_t *ref_ptr,
+                                             int ref_stride, int h,
+                                             uint32_t *sse, int *sum) {
+  uint32x4_t src_sum = vdupq_n_u32(0);
+  uint32x4_t ref_sum = vdupq_n_u32(0);
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  int i = h;
+  do {
+    const uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride);
+    const uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride);
+
+    const uint8x16_t abs_diff = vabdq_u8(s, r);
+    sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+    src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+    ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+    src_ptr += 4 * src_stride;
+    ref_ptr += 4 * ref_stride;
+    i -= 4;
+  } while (i != 0);
+
+  *sum = horizontal_add_int32x4(
+      vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum)));
+  *sse = horizontal_add_uint32x4(sse_u32);
+}
+
+// Process a block of width 8 two rows at a time.
+static INLINE void variance_8xh_neon_dotprod(const uint8_t *src_ptr,
+                                             int src_stride,
+                                             const uint8_t *ref_ptr,
+                                             int ref_stride, int h,
+                                             uint32_t *sse, int *sum) {
+  uint32x4_t src_sum = vdupq_n_u32(0);
+  uint32x4_t ref_sum = vdupq_n_u32(0);
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  int i = h;
+  do {
+    const uint8x16_t s =
+        vcombine_u8(vld1_u8(src_ptr), vld1_u8(src_ptr + src_stride));
+    const uint8x16_t r =
+        vcombine_u8(vld1_u8(ref_ptr), vld1_u8(ref_ptr + ref_stride));
+
+    const uint8x16_t abs_diff = vabdq_u8(s, r);
+    sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+    src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+    ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+    src_ptr += 2 * src_stride;
+    ref_ptr += 2 * ref_stride;
+    i -= 2;
+  } while (i != 0);
+
+  *sum = horizontal_add_int32x4(
+      vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum)));
+  *sse = horizontal_add_uint32x4(sse_u32);
+}
+
+// Process a block of width 16 one row at a time.
+static INLINE void variance_16xh_neon_dotprod(const uint8_t *src_ptr,
+                                              int src_stride,
+                                              const uint8_t *ref_ptr,
+                                              int ref_stride, int h,
+                                              uint32_t *sse, int *sum) {
+  uint32x4_t src_sum = vdupq_n_u32(0);
+  uint32x4_t ref_sum = vdupq_n_u32(0);
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  int i = h;
+  do {
+    const uint8x16_t s = vld1q_u8(src_ptr);
+    const uint8x16_t r = vld1q_u8(ref_ptr);
+
+    const uint8x16_t abs_diff = vabdq_u8(s, r);
+    sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+    src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+    ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  *sum = horizontal_add_int32x4(
+      vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum)));
+  *sse = horizontal_add_uint32x4(sse_u32);
+}
+
+// Process a block of any size where the width is divisible by 16.
+static INLINE void variance_large_neon_dotprod(const uint8_t *src_ptr,
+                                               int src_stride,
+                                               const uint8_t *ref_ptr,
+                                               int ref_stride, int w, int h,
+                                               uint32_t *sse, int *sum) {
+  uint32x4_t src_sum = vdupq_n_u32(0);
+  uint32x4_t ref_sum = vdupq_n_u32(0);
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  int i = h;
+  do {
+    int j = 0;
+    do {
+      const uint8x16_t s = vld1q_u8(src_ptr + j);
+      const uint8x16_t r = vld1q_u8(ref_ptr + j);
+
+      const uint8x16_t abs_diff = vabdq_u8(s, r);
+      sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+      src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+      ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+      j += 16;
+    } while (j < w);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  *sum = horizontal_add_int32x4(
+      vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum)));
+  *sse = horizontal_add_uint32x4(sse_u32);
+}
+
+static INLINE void variance_32xh_neon_dotprod(const uint8_t *src,
+                                              int src_stride,
+                                              const uint8_t *ref,
+                                              int ref_stride, int h,
+                                              uint32_t *sse, int *sum) {
+  variance_large_neon_dotprod(src, src_stride, ref, ref_stride, 32, h, sse,
+                              sum);
+}
+
+static INLINE void variance_64xh_neon_dotprod(const uint8_t *src,
+                                              int src_stride,
+                                              const uint8_t *ref,
+                                              int ref_stride, int h,
+                                              uint32_t *sse, int *sum) {
+  variance_large_neon_dotprod(src, src_stride, ref, ref_stride, 64, h, sse,
+                              sum);
+}
+
+void vpx_get8x8var_neon_dotprod(const uint8_t *src_ptr, int src_stride,
+                                const uint8_t *ref_ptr, int ref_stride,
+                                unsigned int *sse, int *sum) {
+  variance_8xh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 8, sse,
+                            sum);
+}
+
+void vpx_get16x16var_neon_dotprod(const uint8_t *src_ptr, int src_stride,
+                                  const uint8_t *ref_ptr, int ref_stride,
+                                  unsigned int *sse, int *sum) {
+  variance_16xh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 16, sse,
+                             sum);
+}
+
+#define VARIANCE_WXH_NEON_DOTPROD(w, h, shift)                                \
+  unsigned int vpx_variance##w##x##h##_neon_dotprod(                          \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      unsigned int *sse) {                                                    \
+    int sum;                                                                  \
+    variance_##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, h, sse,   \
+                                  &sum);                                      \
+    return *sse - (uint32_t)(((int64_t)sum * sum) >> shift);                  \
+  }
+
+VARIANCE_WXH_NEON_DOTPROD(4, 4, 4)
+VARIANCE_WXH_NEON_DOTPROD(4, 8, 5)
+
+VARIANCE_WXH_NEON_DOTPROD(8, 4, 5)
+VARIANCE_WXH_NEON_DOTPROD(8, 8, 6)
+VARIANCE_WXH_NEON_DOTPROD(8, 16, 7)
+
+VARIANCE_WXH_NEON_DOTPROD(16, 8, 7)
+VARIANCE_WXH_NEON_DOTPROD(16, 16, 8)
+VARIANCE_WXH_NEON_DOTPROD(16, 32, 9)
+
+VARIANCE_WXH_NEON_DOTPROD(32, 16, 9)
+VARIANCE_WXH_NEON_DOTPROD(32, 32, 10)
+VARIANCE_WXH_NEON_DOTPROD(32, 64, 11)
+
+VARIANCE_WXH_NEON_DOTPROD(64, 32, 11)
+VARIANCE_WXH_NEON_DOTPROD(64, 64, 12)
+
+#undef VARIANCE_WXH_NEON_DOTPROD
+
+static INLINE unsigned int vpx_mse8xh_neon_dotprod(const unsigned char *src_ptr,
+                                                   int src_stride,
+                                                   const unsigned char *ref_ptr,
+                                                   int ref_stride, int h) {
+  uint32x2_t sse_u32[2] = { vdup_n_u32(0), vdup_n_u32(0) };
+
+  int i = h / 2;
+  do {
+    uint8x8_t s0, s1, r0, r1, diff0, diff1;
+
+    s0 = vld1_u8(src_ptr);
+    src_ptr += src_stride;
+    s1 = vld1_u8(src_ptr);
+    src_ptr += src_stride;
+    r0 = vld1_u8(ref_ptr);
+    ref_ptr += ref_stride;
+    r1 = vld1_u8(ref_ptr);
+    ref_ptr += ref_stride;
+
+    diff0 = vabd_u8(s0, r0);
+    diff1 = vabd_u8(s1, r1);
+
+    sse_u32[0] = vdot_u32(sse_u32[0], diff0, diff0);
+    sse_u32[1] = vdot_u32(sse_u32[1], diff1, diff1);
+  } while (--i != 0);
+
+  return horizontal_add_uint32x2(vadd_u32(sse_u32[0], sse_u32[1]));
+}
+
+static INLINE unsigned int vpx_mse16xh_neon_dotprod(
+    const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr,
+    int ref_stride, int h) {
+  uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h / 2;
+  do {
+    uint8x16_t s0, s1, r0, r1, diff0, diff1;
+
+    s0 = vld1q_u8(src_ptr);
+    src_ptr += src_stride;
+    s1 = vld1q_u8(src_ptr);
+    src_ptr += src_stride;
+    r0 = vld1q_u8(ref_ptr);
+    ref_ptr += ref_stride;
+    r1 = vld1q_u8(ref_ptr);
+    ref_ptr += ref_stride;
+
+    diff0 = vabdq_u8(s0, r0);
+    diff1 = vabdq_u8(s1, r1);
+
+    sse_u32[0] = vdotq_u32(sse_u32[0], diff0, diff0);
+    sse_u32[1] = vdotq_u32(sse_u32[1], diff1, diff1);
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
+}
+
+unsigned int vpx_get4x4sse_cs_neon_dotprod(const unsigned char *src_ptr,
+                                           int src_stride,
+                                           const unsigned char *ref_ptr,
+                                           int ref_stride) {
+  uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride);
+  uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride);
+
+  uint8x16_t abs_diff = vabdq_u8(s, r);
+
+  uint32x4_t sse = vdotq_u32(vdupq_n_u32(0), abs_diff, abs_diff);
+
+  return horizontal_add_uint32x4(sse);
+}
+
+#define VPX_MSE_WXH_NEON_DOTPROD(w, h)                                   \
+  unsigned int vpx_mse##w##x##h##_neon_dotprod(                          \
+      const unsigned char *src_ptr, int src_stride,                      \
+      const unsigned char *ref_ptr, int ref_stride, unsigned int *sse) { \
+    *sse = vpx_mse##w##xh_neon_dotprod(src_ptr, src_stride, ref_ptr,     \
+                                       ref_stride, h);                   \
+    return *sse;                                                         \
+  }
+
+VPX_MSE_WXH_NEON_DOTPROD(8, 8)
+VPX_MSE_WXH_NEON_DOTPROD(8, 16)
+VPX_MSE_WXH_NEON_DOTPROD(16, 8)
+VPX_MSE_WXH_NEON_DOTPROD(16, 16)
+
+#undef VPX_MSE_WXH_NEON_DOTPROD
diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c
index 08ae17dbab..65fb67c984 100644
--- a/vpx_dsp/arm/vpx_convolve8_neon.c
+++ b/vpx_dsp/arm/vpx_convolve8_neon.c
@@ -14,8 +14,10 @@
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/transpose_neon.h"
 #include "vpx_dsp/arm/vpx_convolve8_neon.h"
+#include "vpx_dsp/vpx_filter.h"
 #include "vpx_ports/mem.h"
 
 // Note:
@@ -30,71 +32,98 @@
 // instructions. This optimization is much faster in speed unit test, but slowed
 // down the whole decoder by 5%.
 
-static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p,
-                                const uint8x8_t s0, const uint8x8_t s1,
-                                const uint8x8_t s2, const uint8x8_t s3,
-                                const uint8x8_t s4, const uint8x8_t s5,
-                                const uint8x8_t s6, const uint8x8_t s7) {
-  vst1_u8(s, s0);
-  s += p;
-  vst1_u8(s, s1);
-  s += p;
-  vst1_u8(s, s2);
-  s += p;
-  vst1_u8(s, s3);
-  s += p;
-  vst1_u8(s, s4);
-  s += p;
-  vst1_u8(s, s5);
-  s += p;
-  vst1_u8(s, s6);
-  s += p;
-  vst1_u8(s, s7);
-}
+static INLINE void vpx_convolve_4tap_horiz_neon(const uint8_t *src,
+                                                ptrdiff_t src_stride,
+                                                uint8_t *dst,
+                                                ptrdiff_t dst_stride, int w,
+                                                int h, const int16x4_t filter) {
+  if (w == 4) {
+    do {
+      int16x4_t s0[4], s1[4];
 
-void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const InterpKernel *filter, int x0_q4,
-                              int x_step_q4, int y0_q4, int y_step_q4, int w,
-                              int h) {
-  const int16x8_t filters = vld1q_s16(filter[x0_q4]);
-  uint8x8_t t0, t1, t2, t3;
+      int16x8_t t0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src)));
+      s0[0] = vget_low_s16(vextq_s16(t0, t0, 0));
+      s0[1] = vget_low_s16(vextq_s16(t0, t0, 1));
+      s0[2] = vget_low_s16(vextq_s16(t0, t0, 2));
+      s0[3] = vget_low_s16(vextq_s16(t0, t0, 3));
 
-  assert(!((intptr_t)dst & 3));
-  assert(!(dst_stride & 3));
-  assert(x_step_q4 == 16);
+      int16x8_t t1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + src_stride)));
+      s1[0] = vget_low_s16(vextq_s16(t1, t1, 0));
+      s1[1] = vget_low_s16(vextq_s16(t1, t1, 1));
+      s1[2] = vget_low_s16(vextq_s16(t1, t1, 2));
+      s1[3] = vget_low_s16(vextq_s16(t1, t1, 3));
 
-  (void)x_step_q4;
-  (void)y0_q4;
-  (void)y_step_q4;
+      int16x4_t d0 = convolve4_4(s0[0], s0[1], s0[2], s0[3], filter);
+      int16x4_t d1 = convolve4_4(s1[0], s1[1], s1[2], s1[3], filter);
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
 
-  src -= 3;
+      store_u8(dst, dst_stride, d01);
+
+      src += 2 * src_stride;
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h > 0);
+  } else {
+    do {
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      int width = w;
+
+      do {
+        int16x8_t t0[2], t1[2];
+        int16x8_t s0[4], s1[4];
+
+        t0[0] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+        t0[1] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + 8)));
+        s0[0] = vextq_s16(t0[0], t0[1], 0);
+        s0[1] = vextq_s16(t0[0], t0[1], 1);
+        s0[2] = vextq_s16(t0[0], t0[1], 2);
+        s0[3] = vextq_s16(t0[0], t0[1], 3);
+
+        t1[0] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + src_stride)));
+        t1[1] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s + src_stride + 8)));
+        s1[0] = vextq_s16(t1[0], t1[1], 0);
+        s1[1] = vextq_s16(t1[0], t1[1], 1);
+        s1[2] = vextq_s16(t1[0], t1[1], 2);
+        s1[3] = vextq_s16(t1[0], t1[1], 3);
+
+        uint8x8_t d0 = convolve4_8(s0[0], s0[1], s0[2], s0[3], filter);
+        uint8x8_t d1 = convolve4_8(s1[0], s1[1], s1[2], s1[3], filter);
+
+        vst1_u8(d, d0);
+        vst1_u8(d + dst_stride, d1);
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src += 2 * src_stride;
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h > 0);
+  }
+}
+
+static INLINE void vpx_convolve_8tap_horiz_neon(const uint8_t *src,
+                                                ptrdiff_t src_stride,
+                                                uint8_t *dst,
+                                                ptrdiff_t dst_stride, int w,
+                                                int h, const int16x8_t filter) {
+  uint8x8_t t0, t1, t2, t3;
 
   if (h == 4) {
     uint8x8_t d01, d23;
-    int16x4_t filter3, filter4, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0,
-        d1, d2, d3;
-    int16x8_t tt0, tt1, tt2, tt3;
+    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
 
-    __builtin_prefetch(src + 0 * src_stride);
-    __builtin_prefetch(src + 1 * src_stride);
-    __builtin_prefetch(src + 2 * src_stride);
-    __builtin_prefetch(src + 3 * src_stride);
-    filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
-    filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
     load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
     transpose_u8_8x4(&t0, &t1, &t2, &t3);
-    tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-    tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-    tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-    tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-    s0 = vget_low_s16(tt0);
-    s1 = vget_low_s16(tt1);
-    s2 = vget_low_s16(tt2);
-    s3 = vget_low_s16(tt3);
-    s4 = vget_high_s16(tt0);
-    s5 = vget_high_s16(tt1);
-    s6 = vget_high_s16(tt2);
+    s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+    s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+    s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+    s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+    s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+    s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+    s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+
     __builtin_prefetch(dst + 0 * dst_stride);
     __builtin_prefetch(dst + 1 * dst_stride);
     __builtin_prefetch(dst + 2 * dst_stride);
@@ -104,36 +133,22 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
     do {
       load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
       transpose_u8_8x4(&t0, &t1, &t2, &t3);
-      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-      tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-      tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-      tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-      s7 = vget_low_s16(tt0);
-      s8 = vget_low_s16(tt1);
-      s9 = vget_low_s16(tt2);
-      s10 = vget_low_s16(tt3);
-
-      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
-                       filter4);
-      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
-                       filter4);
-      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
-                       filter4);
-      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
-                       filter4);
-
-      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
-      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
+      s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+      s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+      s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+      s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+
+      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
       transpose_u8_4x4(&d01, &d23);
 
-      vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride),
-                    vreinterpret_u32_u8(d01), 0);
-      vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride),
-                    vreinterpret_u32_u8(d23), 0);
-      vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride),
-                    vreinterpret_u32_u8(d01), 1);
-      vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride),
-                    vreinterpret_u32_u8(d23), 1);
+      store_u8(dst + 0 * dst_stride, 2 * dst_stride, d01);
+      store_u8(dst + 1 * dst_stride, 2 * dst_stride, d23);
 
       s0 = s4;
       s1 = s5;
@@ -145,13 +160,11 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
       src += 4;
       dst += 4;
       w -= 4;
-    } while (w > 0);
+    } while (w != 0);
   } else {
-    const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
-    const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
     int width;
     const uint8_t *s;
-    uint8x8_t t4, t5, t6, t7;
+    uint8x8_t t4, t5, t6, t7, d04, d15, d26, d37;
     int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
 
     if (w == 4) {
@@ -191,47 +204,27 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
         __builtin_prefetch(src + 5 * src_stride);
         __builtin_prefetch(src + 6 * src_stride);
         __builtin_prefetch(src + 7 * src_stride);
-        t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
-                         filter4);
-        t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
-                         filter4);
-        t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
-                         filter4);
-        t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
-                         filter4);
-
-        transpose_u8_8x4(&t0, &t1, &t2, &t3);
-        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 0);
-        dst += dst_stride;
-        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 0);
-        dst += dst_stride;
-        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 0);
-        dst += dst_stride;
-        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 0);
-        dst += dst_stride;
-        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 1);
-        dst += dst_stride;
-        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 1);
-        dst += dst_stride;
-        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 1);
-        dst += dst_stride;
-        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 1);
-        dst += dst_stride;
+        d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+        d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+        d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+        d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+
+        transpose_u8_8x4(&d04, &d15, &d26, &d37);
+
+        store_u8(dst + 0 * dst_stride, 4 * dst_stride, d04);
+        store_u8(dst + 1 * dst_stride, 4 * dst_stride, d15);
+        store_u8(dst + 2 * dst_stride, 4 * dst_stride, d26);
+        store_u8(dst + 3 * dst_stride, 4 * dst_stride, d37);
+
+        dst += 8 * dst_stride;
         h -= 8;
       } while (h > 0);
     } else {
       uint8_t *d;
+      uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7;
       int16x8_t s11, s12, s13, s14;
 
       do {
-        __builtin_prefetch(src + 0 * src_stride);
-        __builtin_prefetch(src + 1 * src_stride);
-        __builtin_prefetch(src + 2 * src_stride);
-        __builtin_prefetch(src + 3 * src_stride);
-        __builtin_prefetch(src + 4 * src_stride);
-        __builtin_prefetch(src + 5 * src_stride);
-        __builtin_prefetch(src + 6 * src_stride);
-        __builtin_prefetch(src + 7 * src_stride);
         load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
         transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
         s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
@@ -266,25 +259,18 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
           s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
           s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
 
-          t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
-                           filter4);
-          t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
-                           filter4);
-          t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
-                           filter4);
-          t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
-                           filter4);
-          t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3,
-                           filter4);
-          t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3,
-                           filter4);
-          t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3,
-                           filter4);
-          t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters,
-                           filter3, filter4);
+          d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+          d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+          d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+          d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+          d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filter);
+          d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filter);
+          d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter);
+          d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter);
 
-          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-          store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7);
+          transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+
+          store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
 
           s0 = s8;
           s1 = s9;
@@ -296,7 +282,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
           s += 8;
           d += 8;
           width -= 8;
-        } while (width > 0);
+        } while (width != 0);
         src += 8 * src_stride;
         dst += 8 * dst_stride;
         h -= 8;
@@ -305,6 +291,33 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
   }
 }
 
+void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const InterpKernel *filter, int x0_q4,
+                              int x_step_q4, int y0_q4, int y_step_q4, int w,
+                              int h) {
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+  assert(x_step_q4 == 16);
+
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
+    /* All 4-tap and bilinear filter values are even, so halve them to reduce
+     * intermediate precision requirements.
+     */
+    const int16x4_t x_filter_4tap = vshr_n_s16(vld1_s16(filter[x0_q4] + 2), 1);
+    vpx_convolve_4tap_horiz_neon(src - 1, src_stride, dst, dst_stride, w, h,
+                                 x_filter_4tap);
+  } else {
+    const int16x8_t x_filter_8tap = vld1q_s16(filter[x0_q4]);
+    vpx_convolve_8tap_horiz_neon(src - 3, src_stride, dst, dst_stride, w, h,
+                                 x_filter_8tap);
+  }
+}
+
 void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *filter, int x0_q4,
@@ -313,8 +326,8 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
   const int16x8_t filters = vld1q_s16(filter[x0_q4]);
   uint8x8_t t0, t1, t2, t3;
 
-  assert(!((intptr_t)dst & 3));
-  assert(!(dst_stride & 3));
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
   assert(x_step_q4 == 16);
 
   (void)x_step_q4;
@@ -324,31 +337,23 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
   src -= 3;
 
   if (h == 4) {
-    uint8x8_t d01, d23;
-    int16x4_t filter3, filter4, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0,
-        d1, d2, d3;
-    int16x8_t tt0, tt1, tt2, tt3;
-    uint32x4_t d0123 = vdupq_n_u32(0);
+    uint8x8_t d01, d23, dd01, dd23;
+    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
 
     __builtin_prefetch(src + 0 * src_stride);
     __builtin_prefetch(src + 1 * src_stride);
     __builtin_prefetch(src + 2 * src_stride);
     __builtin_prefetch(src + 3 * src_stride);
-    filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
-    filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
     load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
     transpose_u8_8x4(&t0, &t1, &t2, &t3);
-    tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-    tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-    tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-    tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-    s0 = vget_low_s16(tt0);
-    s1 = vget_low_s16(tt1);
-    s2 = vget_low_s16(tt2);
-    s3 = vget_low_s16(tt3);
-    s4 = vget_high_s16(tt0);
-    s5 = vget_high_s16(tt1);
-    s6 = vget_high_s16(tt2);
+    s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+    s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+    s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+    s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+    s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+    s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+    s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+
     __builtin_prefetch(dst + 0 * dst_stride);
     __builtin_prefetch(dst + 1 * dst_stride);
     __builtin_prefetch(dst + 2 * dst_stride);
@@ -358,39 +363,28 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
     do {
       load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
       transpose_u8_8x4(&t0, &t1, &t2, &t3);
-      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-      tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-      tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-      tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-      s7 = vget_low_s16(tt0);
-      s8 = vget_low_s16(tt1);
-      s9 = vget_low_s16(tt2);
-      s10 = vget_low_s16(tt3);
-
-      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
-                       filter4);
-      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
-                       filter4);
-      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
-                       filter4);
-      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
-                       filter4);
-
-      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
-      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
+      s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+      s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+      s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+      s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+
+      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
       transpose_u8_4x4(&d01, &d23);
 
-      d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0);
-      d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2);
-      d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1);
-      d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3);
-      d0123 = vreinterpretq_u32_u8(
-          vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23)));
+      dd01 = load_u8(dst + 0 * dst_stride, 2 * dst_stride);
+      dd23 = load_u8(dst + 1 * dst_stride, 2 * dst_stride);
+
+      d01 = vrhadd_u8(d01, dd01);
+      d23 = vrhadd_u8(d23, dd23);
 
-      vst1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0);
-      vst1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2);
-      vst1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1);
-      vst1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3);
+      store_u8(dst + 0 * dst_stride, 2 * dst_stride, d01);
+      store_u8(dst + 1 * dst_stride, 2 * dst_stride, d23);
 
       s0 = s4;
       s1 = s5;
@@ -402,18 +396,16 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
       src += 4;
       dst += 4;
       w -= 4;
-    } while (w > 0);
+    } while (w != 0);
   } else {
-    const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
-    const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
     int width;
     const uint8_t *s;
     uint8x8_t t4, t5, t6, t7;
     int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
 
     if (w == 4) {
-      uint32x4_t d0415 = vdupq_n_u32(0);
-      uint32x4_t d2637 = vdupq_n_u32(0);
+      uint8x8_t d04, d15, d26, d37, dd04, dd15, dd26, dd37;
+
       do {
         load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
         transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
@@ -450,52 +442,35 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
         __builtin_prefetch(src + 5 * src_stride);
         __builtin_prefetch(src + 6 * src_stride);
         __builtin_prefetch(src + 7 * src_stride);
-        t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
-                         filter4);
-        t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
-                         filter4);
-        t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
-                         filter4);
-        t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
-                         filter4);
-
-        transpose_u8_8x4(&t0, &t1, &t2, &t3);
-
-        d0415 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0415, 0);
-        d0415 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0415, 2);
-        d2637 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d2637, 0);
-        d2637 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d2637, 2);
-        d0415 = vld1q_lane_u32((uint32_t *)(dst + 4 * dst_stride), d0415, 1);
-        d0415 = vld1q_lane_u32((uint32_t *)(dst + 5 * dst_stride), d0415, 3);
-        d2637 = vld1q_lane_u32((uint32_t *)(dst + 6 * dst_stride), d2637, 1);
-        d2637 = vld1q_lane_u32((uint32_t *)(dst + 7 * dst_stride), d2637, 3);
-        d0415 = vreinterpretq_u32_u8(
-            vrhaddq_u8(vreinterpretq_u8_u32(d0415), vcombine_u8(t0, t1)));
-        d2637 = vreinterpretq_u32_u8(
-            vrhaddq_u8(vreinterpretq_u8_u32(d2637), vcombine_u8(t2, t3)));
-
-        vst1q_lane_u32((uint32_t *)dst, d0415, 0);
-        dst += dst_stride;
-        vst1q_lane_u32((uint32_t *)dst, d0415, 2);
-        dst += dst_stride;
-        vst1q_lane_u32((uint32_t *)dst, d2637, 0);
-        dst += dst_stride;
-        vst1q_lane_u32((uint32_t *)dst, d2637, 2);
-        dst += dst_stride;
-        vst1q_lane_u32((uint32_t *)dst, d0415, 1);
-        dst += dst_stride;
-        vst1q_lane_u32((uint32_t *)dst, d0415, 3);
-        dst += dst_stride;
-        vst1q_lane_u32((uint32_t *)dst, d2637, 1);
-        dst += dst_stride;
-        vst1q_lane_u32((uint32_t *)dst, d2637, 3);
-        dst += dst_stride;
+        d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+        d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+        d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+        d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+
+        transpose_u8_8x4(&d04, &d15, &d26, &d37);
+
+        dd04 = load_u8(dst + 0 * dst_stride, 4 * dst_stride);
+        dd15 = load_u8(dst + 1 * dst_stride, 4 * dst_stride);
+        dd26 = load_u8(dst + 2 * dst_stride, 4 * dst_stride);
+        dd37 = load_u8(dst + 3 * dst_stride, 4 * dst_stride);
+
+        d04 = vrhadd_u8(d04, dd04);
+        d15 = vrhadd_u8(d15, dd15);
+        d26 = vrhadd_u8(d26, dd26);
+        d37 = vrhadd_u8(d37, dd37);
+
+        store_u8(dst + 0 * dst_stride, 4 * dst_stride, d04);
+        store_u8(dst + 1 * dst_stride, 4 * dst_stride, d15);
+        store_u8(dst + 2 * dst_stride, 4 * dst_stride, d26);
+        store_u8(dst + 3 * dst_stride, 4 * dst_stride, d37);
+
+        dst += 8 * dst_stride;
         h -= 8;
-      } while (h > 0);
+      } while (h != 0);
     } else {
       uint8_t *d;
+      uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7;
       int16x8_t s11, s12, s13, s14;
-      uint8x16_t d01, d23, d45, d67;
 
       do {
         __builtin_prefetch(src + 0 * src_stride);
@@ -540,41 +515,27 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
           s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
           s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
 
-          t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
-                           filter4);
-          t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
-                           filter4);
-          t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
-                           filter4);
-          t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
-                           filter4);
-          t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3,
-                           filter4);
-          t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3,
-                           filter4);
-          t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3,
-                           filter4);
-          t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters,
-                           filter3, filter4);
+          d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+          d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+          d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+          d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+          d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters);
+          d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters);
+          d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters);
+          d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters);
 
-          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+          transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
 
-          d01 = vcombine_u8(vld1_u8(d + 0 * dst_stride),
-                            vld1_u8(d + 1 * dst_stride));
-          d23 = vcombine_u8(vld1_u8(d + 2 * dst_stride),
-                            vld1_u8(d + 3 * dst_stride));
-          d45 = vcombine_u8(vld1_u8(d + 4 * dst_stride),
-                            vld1_u8(d + 5 * dst_stride));
-          d67 = vcombine_u8(vld1_u8(d + 6 * dst_stride),
-                            vld1_u8(d + 7 * dst_stride));
-          d01 = vrhaddq_u8(d01, vcombine_u8(t0, t1));
-          d23 = vrhaddq_u8(d23, vcombine_u8(t2, t3));
-          d45 = vrhaddq_u8(d45, vcombine_u8(t4, t5));
-          d67 = vrhaddq_u8(d67, vcombine_u8(t6, t7));
-
-          store_u8_8x8(d, dst_stride, vget_low_u8(d01), vget_high_u8(d01),
-                       vget_low_u8(d23), vget_high_u8(d23), vget_low_u8(d45),
-                       vget_high_u8(d45), vget_low_u8(d67), vget_high_u8(d67));
+          d0 = vrhadd_u8(d0, vld1_u8(d + 0 * dst_stride));
+          d1 = vrhadd_u8(d1, vld1_u8(d + 1 * dst_stride));
+          d2 = vrhadd_u8(d2, vld1_u8(d + 2 * dst_stride));
+          d3 = vrhadd_u8(d3, vld1_u8(d + 3 * dst_stride));
+          d4 = vrhadd_u8(d4, vld1_u8(d + 4 * dst_stride));
+          d5 = vrhadd_u8(d5, vld1_u8(d + 5 * dst_stride));
+          d6 = vrhadd_u8(d6, vld1_u8(d + 6 * dst_stride));
+          d7 = vrhadd_u8(d7, vld1_u8(d + 7 * dst_stride));
+
+          store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
 
           s0 = s8;
           s1 = s9;
@@ -586,62 +547,145 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
           s += 8;
           d += 8;
           width -= 8;
-        } while (width > 0);
+        } while (width != 0);
         src += 8 * src_stride;
         dst += 8 * dst_stride;
         h -= 8;
-      } while (h > 0);
+      } while (h != 0);
     }
   }
 }
 
-void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const InterpKernel *filter, int x0_q4,
-                             int x_step_q4, int y0_q4, int y_step_q4, int w,
-                             int h) {
-  const int16x8_t filters = vld1q_s16(filter[y0_q4]);
+static INLINE void vpx_convolve_4tap_vert_neon(const uint8_t *src,
+                                               ptrdiff_t src_stride,
+                                               uint8_t *dst,
+                                               ptrdiff_t dst_stride, int w,
+                                               int h, const int16x4_t filter) {
+  if (w == 4) {
+    uint8x8_t t0, t1, t2, t3, d01, d23;
+    int16x4_t s0, s1, s2, s3, s4, s5, s6, d0, d1, d2, d3;
 
-  assert(!((intptr_t)dst & 3));
-  assert(!(dst_stride & 3));
-  assert(y_step_q4 == 16);
+    load_u8_8x3(src, src_stride, &t0, &t1, &t2);
+    s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+    s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+    s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
 
-  (void)x0_q4;
-  (void)x_step_q4;
-  (void)y_step_q4;
+    src += 3 * src_stride;
 
-  src -= 3 * src_stride;
+    do {
+      load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+      s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+      s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+      s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+      s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+
+      __builtin_prefetch(dst + 0 * dst_stride);
+      __builtin_prefetch(dst + 1 * dst_stride);
+      __builtin_prefetch(dst + 2 * dst_stride);
+      __builtin_prefetch(dst + 3 * dst_stride);
+      __builtin_prefetch(src + 0 * src_stride);
+      __builtin_prefetch(src + 1 * src_stride);
+      __builtin_prefetch(src + 2 * src_stride);
+      __builtin_prefetch(src + 3 * src_stride);
+
+      d0 = convolve4_4(s0, s1, s2, s3, filter);
+      d1 = convolve4_4(s1, s2, s3, s4, filter);
+      d2 = convolve4_4(s2, s3, s4, s5, filter);
+      d3 = convolve4_4(s3, s4, s5, s6, filter);
+      /* We halved the filter values so -1 from right shift. */
+      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      s0 = s4;
+      s1 = s5;
+      s2 = s6;
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    int height;
+    const uint8_t *s;
+    uint8_t *d;
+    uint8x8_t t0, t1, t2, t3, d0, d1, d2, d3;
+    int16x8_t s0, s1, s2, s3, s4, s5, s6;
+
+    do {
+      load_u8_8x3(src, src_stride, &t0, &t1, &t2);
+      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+
+      s = src + 3 * src_stride;
+      d = dst;
+      height = h;
+
+      do {
+        load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+        s3 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        s4 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        s5 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        s6 = vreinterpretq_s16_u16(vmovl_u8(t3));
+
+        __builtin_prefetch(d + 0 * dst_stride);
+        __builtin_prefetch(d + 1 * dst_stride);
+        __builtin_prefetch(d + 2 * dst_stride);
+        __builtin_prefetch(d + 3 * dst_stride);
+        __builtin_prefetch(s + 0 * src_stride);
+        __builtin_prefetch(s + 1 * src_stride);
+        __builtin_prefetch(s + 2 * src_stride);
+        __builtin_prefetch(s + 3 * src_stride);
+
+        d0 = convolve4_8(s0, s1, s2, s3, filter);
+        d1 = convolve4_8(s1, s2, s3, s4, filter);
+        d2 = convolve4_8(s2, s3, s4, s5, filter);
+        d3 = convolve4_8(s3, s4, s5, s6, filter);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
 
+static INLINE void vpx_convolve_8tap_vert_neon(const uint8_t *src,
+                                               ptrdiff_t src_stride,
+                                               uint8_t *dst,
+                                               ptrdiff_t dst_stride, int w,
+                                               int h, const int16x8_t filter) {
   if (w == 4) {
-    const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
-    const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
-    uint8x8_t d01, d23;
+    uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23;
     int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
 
-    s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
+    load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+    s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+    s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+    s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+    s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+    s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
+    s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
+    s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
+
+    src += 7 * src_stride;
 
     do {
-      s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-      src += src_stride;
-      s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-      src += src_stride;
-      s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-      src += src_stride;
-      s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-      src += src_stride;
+      load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+      s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+      s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+      s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+      s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
 
       __builtin_prefetch(dst + 0 * dst_stride);
       __builtin_prefetch(dst + 1 * dst_stride);
@@ -651,25 +695,16 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
       __builtin_prefetch(src + 1 * src_stride);
       __builtin_prefetch(src + 2 * src_stride);
       __builtin_prefetch(src + 3 * src_stride);
-      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
-                       filter4);
-      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
-                       filter4);
-      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
-                       filter4);
-      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
-                       filter4);
-
-      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
-      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
-      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
-      dst += dst_stride;
-      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1);
-      dst += dst_stride;
-      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);
-      dst += dst_stride;
-      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1);
-      dst += dst_stride;
+
+      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
 
       s0 = s4;
       s1 = s5;
@@ -678,15 +713,15 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
       s4 = s8;
       s5 = s9;
       s6 = s10;
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
       h -= 4;
-    } while (h > 0);
+    } while (h != 0);
   } else {
-    const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
-    const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
     int height;
     const uint8_t *s;
     uint8_t *d;
-    uint8x8_t t0, t1, t2, t3;
+    uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3;
     int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
 
     do {
@@ -697,33 +732,26 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
       __builtin_prefetch(src + 4 * src_stride);
       __builtin_prefetch(src + 5 * src_stride);
       __builtin_prefetch(src + 6 * src_stride);
-      s = src;
-      s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
+
+      load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+      s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+      s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+      s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+      s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+      s = src + 7 * src_stride;
       d = dst;
       height = h;
 
       do {
-        s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-        s += src_stride;
-        s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-        s += src_stride;
-        s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-        s += src_stride;
-        s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-        s += src_stride;
+        load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
 
         __builtin_prefetch(d + 0 * dst_stride);
         __builtin_prefetch(d + 1 * dst_stride);
@@ -733,23 +761,13 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
         __builtin_prefetch(s + 1 * src_stride);
         __builtin_prefetch(s + 2 * src_stride);
         __builtin_prefetch(s + 3 * src_stride);
-        t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
-                         filter4);
-        t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
-                         filter4);
-        t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
-                         filter4);
-        t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
-                         filter4);
-
-        vst1_u8(d, t0);
-        d += dst_stride;
-        vst1_u8(d, t1);
-        d += dst_stride;
-        vst1_u8(d, t2);
-        d += dst_stride;
-        vst1_u8(d, t3);
-        d += dst_stride;
+
+        d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+        d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+        d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+        d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
 
         s0 = s4;
         s1 = s5;
@@ -758,12 +776,41 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
         s4 = s8;
         s5 = s9;
         s6 = s10;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
         height -= 4;
-      } while (height > 0);
+      } while (height != 0);
       src += 8;
       dst += 8;
       w -= 8;
-    } while (w > 0);
+    } while (w != 0);
+  }
+}
+
+void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const InterpKernel *filter, int x0_q4,
+                             int x_step_q4, int y0_q4, int y_step_q4, int w,
+                             int h) {
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+  assert(y_step_q4 == 16);
+
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  if (vpx_get_filter_taps(filter[y0_q4]) <= 4) {
+    /* All 4-tap and bilinear filter values are even, so halve them to reduce
+     * intermediate precision requirements.
+     */
+    const int16x4_t y_filter_4tap = vshr_n_s16(vld1_s16(filter[y0_q4] + 2), 1);
+    vpx_convolve_4tap_vert_neon(src - src_stride, src_stride, dst, dst_stride,
+                                w, h, y_filter_4tap);
+  } else {
+    const int16x8_t y_filter_8tap = vld1q_s16(filter[y0_q4]);
+    vpx_convolve_8tap_vert_neon(src - 3 * src_stride, src_stride, dst,
+                                dst_stride, w, h, y_filter_8tap);
   }
 }
 
@@ -774,8 +821,8 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
                                  int h) {
   const int16x8_t filters = vld1q_s16(filter[y0_q4]);
 
-  assert(!((intptr_t)dst & 3));
-  assert(!(dst_stride & 3));
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
   assert(y_step_q4 == 16);
 
   (void)x0_q4;
@@ -785,36 +832,26 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
   src -= 3 * src_stride;
 
   if (w == 4) {
-    const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
-    const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
-    uint8x8_t d01, d23;
+    uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23, dd01, dd23;
     int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-    uint32x4_t d0123 = vdupq_n_u32(0);
-
-    s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
+
+    load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+    s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+    s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+    s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+    s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+    s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
+    s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
+    s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
+
+    src += 7 * src_stride;
 
     do {
-      s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-      src += src_stride;
-      s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-      src += src_stride;
-      s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-      src += src_stride;
-      s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-      src += src_stride;
+      load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+      s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+      s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+      s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+      s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
 
       __builtin_prefetch(dst + 0 * dst_stride);
       __builtin_prefetch(dst + 1 * dst_stride);
@@ -824,33 +861,22 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
       __builtin_prefetch(src + 1 * src_stride);
       __builtin_prefetch(src + 2 * src_stride);
       __builtin_prefetch(src + 3 * src_stride);
-      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
-                       filter4);
-      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
-                       filter4);
-      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
-                       filter4);
-      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
-                       filter4);
-
-      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
-      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
-
-      d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0);
-      d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 1);
-      d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 2);
-      d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3);
-      d0123 = vreinterpretq_u32_u8(
-          vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23)));
-
-      vst1q_lane_u32((uint32_t *)dst, d0123, 0);
-      dst += dst_stride;
-      vst1q_lane_u32((uint32_t *)dst, d0123, 1);
-      dst += dst_stride;
-      vst1q_lane_u32((uint32_t *)dst, d0123, 2);
-      dst += dst_stride;
-      vst1q_lane_u32((uint32_t *)dst, d0123, 3);
-      dst += dst_stride;
+
+      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+      dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+      dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+
+      d01 = vrhadd_u8(d01, dd01);
+      d23 = vrhadd_u8(d23, dd23);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
 
       s0 = s4;
       s1 = s5;
@@ -859,16 +885,15 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
       s4 = s8;
       s5 = s9;
       s6 = s10;
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
       h -= 4;
-    } while (h > 0);
+    } while (h != 0);
   } else {
-    const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
-    const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
     int height;
     const uint8_t *s;
     uint8_t *d;
-    uint8x8_t t0, t1, t2, t3;
-    uint8x16_t d01, d23, dd01, dd23;
+    uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3;
     int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
 
     do {
@@ -879,33 +904,26 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
       __builtin_prefetch(src + 4 * src_stride);
       __builtin_prefetch(src + 5 * src_stride);
       __builtin_prefetch(src + 6 * src_stride);
-      s = src;
-      s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
+
+      load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+      s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+      s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+      s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+      s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+      s = src + 7 * src_stride;
       d = dst;
       height = h;
 
       do {
-        s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-        s += src_stride;
-        s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-        s += src_stride;
-        s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-        s += src_stride;
-        s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-        s += src_stride;
+        load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
 
         __builtin_prefetch(d + 0 * dst_stride);
         __builtin_prefetch(d + 1 * dst_stride);
@@ -915,32 +933,18 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
         __builtin_prefetch(s + 1 * src_stride);
         __builtin_prefetch(s + 2 * src_stride);
         __builtin_prefetch(s + 3 * src_stride);
-        t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
-                         filter4);
-        t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
-                         filter4);
-        t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
-                         filter4);
-        t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
-                         filter4);
-
-        d01 = vcombine_u8(t0, t1);
-        d23 = vcombine_u8(t2, t3);
-        dd01 = vcombine_u8(vld1_u8(d + 0 * dst_stride),
-                           vld1_u8(d + 1 * dst_stride));
-        dd23 = vcombine_u8(vld1_u8(d + 2 * dst_stride),
-                           vld1_u8(d + 3 * dst_stride));
-        dd01 = vrhaddq_u8(dd01, d01);
-        dd23 = vrhaddq_u8(dd23, d23);
-
-        vst1_u8(d, vget_low_u8(dd01));
-        d += dst_stride;
-        vst1_u8(d, vget_high_u8(dd01));
-        d += dst_stride;
-        vst1_u8(d, vget_low_u8(dd23));
-        d += dst_stride;
-        vst1_u8(d, vget_high_u8(dd23));
-        d += dst_stride;
+
+        d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+        d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+        d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+        d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+
+        d0 = vrhadd_u8(d0, vld1_u8(d + 0 * dst_stride));
+        d1 = vrhadd_u8(d1, vld1_u8(d + 1 * dst_stride));
+        d2 = vrhadd_u8(d2, vld1_u8(d + 2 * dst_stride));
+        d3 = vrhadd_u8(d3, vld1_u8(d + 3 * dst_stride));
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
 
         s0 = s4;
         s1 = s5;
@@ -950,10 +954,12 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
         s5 = s9;
         s6 = s10;
         height -= 4;
-      } while (height > 0);
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+      } while (height != 0);
       src += 8;
       dst += 8;
       w -= 8;
-    } while (w > 0);
+    } while (w != 0);
   }
 }
diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h
index 4f27da9d2f..4ecaee0f99 100644
--- a/vpx_dsp/arm/vpx_convolve8_neon.h
+++ b/vpx_dsp/arm/vpx_convolve8_neon.h
@@ -15,70 +15,367 @@
 
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/vpx_filter.h"
 
-static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
-                               uint8x8_t *const s0, uint8x8_t *const s1,
-                               uint8x8_t *const s2, uint8x8_t *const s3) {
-  *s0 = vld1_u8(s);
-  s += p;
-  *s1 = vld1_u8(s);
-  s += p;
-  *s2 = vld1_u8(s);
-  s += p;
-  *s3 = vld1_u8(s);
-}
-
-static INLINE void load_u8_8x8(const uint8_t *s, const ptrdiff_t p,
-                               uint8x8_t *const s0, uint8x8_t *const s1,
-                               uint8x8_t *const s2, uint8x8_t *const s3,
-                               uint8x8_t *const s4, uint8x8_t *const s5,
-                               uint8x8_t *const s6, uint8x8_t *const s7) {
-  *s0 = vld1_u8(s);
-  s += p;
-  *s1 = vld1_u8(s);
-  s += p;
-  *s2 = vld1_u8(s);
-  s += p;
-  *s3 = vld1_u8(s);
-  s += p;
-  *s4 = vld1_u8(s);
-  s += p;
-  *s5 = vld1_u8(s);
-  s += p;
-  *s6 = vld1_u8(s);
-  s += p;
-  *s7 = vld1_u8(s);
-}
-
-static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p,
-                                uint8x16_t *const s0, uint8x16_t *const s1,
-                                uint8x16_t *const s2, uint8x16_t *const s3,
-                                uint8x16_t *const s4, uint8x16_t *const s5,
-                                uint8x16_t *const s6, uint8x16_t *const s7) {
-  *s0 = vld1q_u8(s);
-  s += p;
-  *s1 = vld1q_u8(s);
-  s += p;
-  *s2 = vld1q_u8(s);
-  s += p;
-  *s3 = vld1q_u8(s);
-  s += p;
-  *s4 = vld1q_u8(s);
-  s += p;
-  *s5 = vld1q_u8(s);
-  s += p;
-  *s6 = vld1q_u8(s);
-  s += p;
-  *s7 = vld1q_u8(s);
+#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
+
+void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src,
+                                         ptrdiff_t src_stride, uint8_t *dst,
+                                         ptrdiff_t dst_stride,
+                                         const InterpKernel *filter, int x0_q4,
+                                         int x_step_q4, int y0_q4,
+                                         int y_step_q4, int w, int h);
+
+static INLINE int16x4_t convolve4_4_sdot_partial(const int8x16_t samples,
+                                                 const int32x4_t correction,
+                                                 const int8x8_t filters) {
+  /* Accumulate dot product into 'correction' to account for range clamp. */
+  int32x4_t sum = vdotq_lane_s32(correction, samples, filters, 0);
+
+  /* Further narrowing and packing is performed by the caller. */
+  return vmovn_s32(sum);
+}
+
+static INLINE int16x4_t convolve4_4_sdot(const uint8x16_t samples,
+                                         const int8x8_t filters,
+                                         const int32x4_t correction,
+                                         const uint8x16_t range_limit,
+                                         const uint8x16_t permute_tbl) {
+  /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+  int8x16_t clamped_samples =
+      vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+  /* Permute samples ready for dot product. */
+  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
+  int8x16_t permuted_samples = vqtbl1q_s8(clamped_samples, permute_tbl);
+
+  /* Accumulate dot product into 'correction' to account for range clamp. */
+  int32x4_t sum = vdotq_lane_s32(correction, permuted_samples, filters, 0);
+
+  /* Further narrowing and packing is performed by the caller. */
+  return vmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve4_8_sdot_partial(const int8x16_t samples_lo,
+                                                 const int8x16_t samples_hi,
+                                                 const int32x4_t correction,
+                                                 const int8x8_t filters) {
+  /* Sample range-clamping and permutation are performed by the caller. */
+  /* Accumulate dot product into 'correction' to account for range clamp. */
+  /* First 4 output values. */
+  int32x4_t sum0 = vdotq_lane_s32(correction, samples_lo, filters, 0);
+  /* Second 4 output values. */
+  int32x4_t sum1 = vdotq_lane_s32(correction, samples_hi, filters, 0);
+
+  /* Narrow and re-pack. */
+  int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
+  /* We halved the filter values so -1 from right shift. */
+  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
+}
+
+static INLINE uint8x8_t convolve4_8_sdot(const uint8x16_t samples,
+                                         const int8x8_t filters,
+                                         const int32x4_t correction,
+                                         const uint8x16_t range_limit,
+                                         const uint8x16x2_t permute_tbl) {
+  int8x16_t clamped_samples, permuted_samples[2];
+
+  /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+  /* Permute samples ready for dot product. */
+  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
+  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
+  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+
+  /* Accumulate dot product into 'correction' to account for range clamp. */
+  /* First 4 output values. */
+  int32x4_t sum0 = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
+  /* Second 4 output values. */
+  int32x4_t sum1 = vdotq_lane_s32(correction, permuted_samples[1], filters, 0);
+
+  /* Narrow and re-pack. */
+  int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
+  /* We halved the filter values so -1 from right shift. */
+  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
+}
+
+static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo,
+                                                 const int8x16_t samples_hi,
+                                                 const int32x4_t correction,
+                                                 const int8x8_t filters) {
+  /* Sample range-clamping and permutation are performed by the caller. */
+  int32x4_t sum;
+
+  /* Accumulate dot product into 'correction' to account for range clamp. */
+  sum = vdotq_lane_s32(correction, samples_lo, filters, 0);
+  sum = vdotq_lane_s32(sum, samples_hi, filters, 1);
+
+  /* Further narrowing and packing is performed by the caller. */
+  return vqmovn_s32(sum);
+}
+
+static INLINE int16x4_t convolve8_4_sdot(const uint8x16_t samples,
+                                         const int8x8_t filters,
+                                         const int32x4_t correction,
+                                         const uint8x16_t range_limit,
+                                         const uint8x16x2_t permute_tbl) {
+  int8x16_t clamped_samples, permuted_samples[2];
+  int32x4_t sum;
+
+  /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+  /* Permute samples ready for dot product. */
+  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
+  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
+  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+
+  /* Accumulate dot product into 'correction' to account for range clamp. */
+  sum = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
+  sum = vdotq_lane_s32(sum, permuted_samples[1], filters, 1);
+
+  /* Further narrowing and packing is performed by the caller. */
+  return vqmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo,
+                                                 const int8x16_t samples0_hi,
+                                                 const int8x16_t samples1_lo,
+                                                 const int8x16_t samples1_hi,
+                                                 const int32x4_t correction,
+                                                 const int8x8_t filters) {
+  /* Sample range-clamping and permutation are performed by the caller. */
+  int32x4_t sum0, sum1;
+  int16x8_t sum;
+
+  /* Accumulate dot product into 'correction' to account for range clamp. */
+  /* First 4 output values. */
+  sum0 = vdotq_lane_s32(correction, samples0_lo, filters, 0);
+  sum0 = vdotq_lane_s32(sum0, samples0_hi, filters, 1);
+  /* Second 4 output values. */
+  sum1 = vdotq_lane_s32(correction, samples1_lo, filters, 0);
+  sum1 = vdotq_lane_s32(sum1, samples1_hi, filters, 1);
+
+  /* Narrow and re-pack. */
+  sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+  return vqrshrun_n_s16(sum, FILTER_BITS);
+}
+
+static INLINE uint8x8_t convolve8_8_sdot(const uint8x16_t samples,
+                                         const int8x8_t filters,
+                                         const int32x4_t correction,
+                                         const uint8x16_t range_limit,
+                                         const uint8x16x3_t permute_tbl) {
+  int8x16_t clamped_samples, permuted_samples[3];
+  int32x4_t sum0, sum1;
+  int16x8_t sum;
+
+  /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+  clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+  /* Permute samples ready for dot product. */
+  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
+  permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
+  permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+  /* { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
+  permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
+
+  /* Accumulate dot product into 'correction' to account for range clamp. */
+  /* First 4 output values. */
+  sum0 = vdotq_lane_s32(correction, permuted_samples[0], filters, 0);
+  sum0 = vdotq_lane_s32(sum0, permuted_samples[1], filters, 1);
+  /* Second 4 output values. */
+  sum1 = vdotq_lane_s32(correction, permuted_samples[1], filters, 0);
+  sum1 = vdotq_lane_s32(sum1, permuted_samples[2], filters, 1);
+
+  /* Narrow and re-pack. */
+  sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+  return vqrshrun_n_s16(sum, FILTER_BITS);
+}
+
+#endif  // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
+
+#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
+
+void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+                                      uint8_t *dst, ptrdiff_t dst_stride,
+                                      const InterpKernel *filter, int x0_q4,
+                                      int x_step_q4, int y0_q4, int y_step_q4,
+                                      int w, int h);
+
+static INLINE int16x4_t convolve4_4_usdot_partial(const uint8x16_t samples,
+                                                  const int8x8_t filters) {
+  int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), samples, filters, 0);
+
+  /* Further narrowing and packing is performed by the caller. */
+  return vmovn_s32(sum);
+}
+
+static INLINE int16x4_t convolve4_4_usdot(const uint8x16_t samples,
+                                          const int8x8_t filters,
+                                          const uint8x16_t permute_tbl) {
+  /* Permute samples ready for dot product. */
+  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
+  uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl);
+
+  int32x4_t sum =
+      vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples, filters, 0);
+
+  /* Further narrowing and packing is performed by the caller. */
+  return vmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve4_8_usdot_partial(const uint8x16_t samples_lo,
+                                                  const uint8x16_t samples_hi,
+                                                  const int8x8_t filters) {
+  /* Sample permutation is performed by the caller. */
+  /* First 4 output values. */
+  int32x4_t sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0);
+  /* Second 4 output values. */
+  int32x4_t sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples_hi, filters, 0);
+
+  /* Narrow and re-pack. */
+  int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
+  /* We halved the filter values so -1 from right shift. */
+  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
+}
+
+static INLINE uint8x8_t convolve4_8_usdot(const uint8x16_t samples,
+                                          const int8x8_t filters,
+                                          const uint8x16x2_t permute_tbl) {
+  uint8x16_t permuted_samples[2];
+
+  /* Permute samples ready for dot product. */
+  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
+  permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
+  permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+
+  /* First 4 output values. */
+  int32x4_t sum0 =
+      vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
+  /* Second 4 output values. */
+  int32x4_t sum1 =
+      vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0);
+
+  /* Narrow and re-pack. */
+  int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
+  /* We halved the filter values so -1 from right shift. */
+  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
+}
+
+static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo,
+                                                  const uint8x16_t samples_hi,
+                                                  const int8x8_t filters) {
+  /* Sample permutation is performed by the caller. */
+  int32x4_t sum;
+
+  sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0);
+  sum = vusdotq_lane_s32(sum, samples_hi, filters, 1);
+
+  /* Further narrowing and packing is performed by the caller. */
+  return vqmovn_s32(sum);
+}
+
+static INLINE int16x4_t convolve8_4_usdot(const uint8x16_t samples,
+                                          const int8x8_t filters,
+                                          const uint8x16x2_t permute_tbl) {
+  uint8x16_t permuted_samples[2];
+  int32x4_t sum;
+
+  /* Permute samples ready for dot product. */
+  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
+  permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
+  permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+
+  sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
+  sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1);
+
+  /* Further narrowing and packing is performed by the caller. */
+  return vqmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo,
+                                                  const uint8x16_t samples0_hi,
+                                                  const uint8x16_t samples1_lo,
+                                                  const uint8x16_t samples1_hi,
+                                                  const int8x8_t filters) {
+  /* Sample permutation is performed by the caller. */
+  int32x4_t sum0, sum1;
+  int16x8_t sum;
+
+  /* First 4 output values. */
+  sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filters, 0);
+  sum0 = vusdotq_lane_s32(sum0, samples0_hi, filters, 1);
+  /* Second 4 output values. */
+  sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filters, 0);
+  sum1 = vusdotq_lane_s32(sum1, samples1_hi, filters, 1);
+
+  /* Narrow and re-pack. */
+  sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+  return vqrshrun_n_s16(sum, FILTER_BITS);
+}
+
+static INLINE uint8x8_t convolve8_8_usdot(const uint8x16_t samples,
+                                          const int8x8_t filters,
+                                          const uint8x16x3_t permute_tbl) {
+  uint8x16_t permuted_samples[3];
+  int32x4_t sum0, sum1;
+  int16x8_t sum;
+
+  /* Permute samples ready for dot product. */
+  /* { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 } */
+  permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+  /* { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 } */
+  permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+  /* { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
+  permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
+
+  /* First 4 output values. */
+  sum0 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
+  sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filters, 1);
+  /* Second 4 output values. */
+  sum1 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0);
+  sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filters, 1);
+
+  /* Narrow and re-pack. */
+  sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+  return vqrshrun_n_s16(sum, FILTER_BITS);
+}
+
+#endif  // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
+
+static INLINE int16x4_t convolve4_4(const int16x4_t s0, const int16x4_t s1,
+                                    const int16x4_t s2, const int16x4_t s3,
+                                    const int16x4_t filters) {
+  int16x4_t sum = vmul_lane_s16(s0, filters, 0);
+  sum = vmla_lane_s16(sum, s1, filters, 1);
+  sum = vmla_lane_s16(sum, s2, filters, 2);
+  sum = vmla_lane_s16(sum, s3, filters, 3);
+  return sum;
+}
+
+static INLINE uint8x8_t convolve4_8(const int16x8_t s0, const int16x8_t s1,
+                                    const int16x8_t s2, const int16x8_t s3,
+                                    const int16x4_t filters) {
+  int16x8_t sum = vmulq_lane_s16(s0, filters, 0);
+  sum = vmlaq_lane_s16(sum, s1, filters, 1);
+  sum = vmlaq_lane_s16(sum, s2, filters, 2);
+  sum = vmlaq_lane_s16(sum, s3, filters, 3);
+  /* We halved the filter values so -1 from right shift. */
+  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
 }
 
 static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
                                     const int16x4_t s2, const int16x4_t s3,
                                     const int16x4_t s4, const int16x4_t s5,
                                     const int16x4_t s6, const int16x4_t s7,
-                                    const int16x8_t filters,
-                                    const int16x4_t filter3,
-                                    const int16x4_t filter4) {
+                                    const int16x8_t filters) {
   const int16x4_t filters_lo = vget_low_s16(filters);
   const int16x4_t filters_hi = vget_high_s16(filters);
   int16x4_t sum;
@@ -89,8 +386,8 @@ static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
   sum = vmla_lane_s16(sum, s5, filters_hi, 1);
   sum = vmla_lane_s16(sum, s6, filters_hi, 2);
   sum = vmla_lane_s16(sum, s7, filters_hi, 3);
-  sum = vqadd_s16(sum, vmul_s16(s3, filter3));
-  sum = vqadd_s16(sum, vmul_s16(s4, filter4));
+  sum = vqadd_s16(sum, vmul_lane_s16(s3, filters_lo, 3));
+  sum = vqadd_s16(sum, vmul_lane_s16(s4, filters_hi, 0));
   return sum;
 }
 
@@ -98,9 +395,7 @@ static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
                                     const int16x8_t s2, const int16x8_t s3,
                                     const int16x8_t s4, const int16x8_t s5,
                                     const int16x8_t s6, const int16x8_t s7,
-                                    const int16x8_t filters,
-                                    const int16x8_t filter3,
-                                    const int16x8_t filter4) {
+                                    const int16x8_t filters) {
   const int16x4_t filters_lo = vget_low_s16(filters);
   const int16x4_t filters_hi = vget_high_s16(filters);
   int16x8_t sum;
@@ -111,15 +406,13 @@ static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
   sum = vmlaq_lane_s16(sum, s5, filters_hi, 1);
   sum = vmlaq_lane_s16(sum, s6, filters_hi, 2);
   sum = vmlaq_lane_s16(sum, s7, filters_hi, 3);
-  sum = vqaddq_s16(sum, vmulq_s16(s3, filter3));
-  sum = vqaddq_s16(sum, vmulq_s16(s4, filter4));
-  return vqrshrun_n_s16(sum, 7);
+  sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filters_lo, 3));
+  sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filters_hi, 0));
+  return vqrshrun_n_s16(sum, FILTER_BITS);
 }
 
 static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s,
                                        const int16x8_t filters) {
-  const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
-  const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
   int16x8_t ss[8];
 
   ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
@@ -132,7 +425,7 @@ static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s,
   ss[7] = vreinterpretq_s16_u16(vmovl_u8(s[7]));
 
   return convolve8_8(ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7],
-                     filters, filter3, filter4);
+                     filters);
 }
 
 #endif  // VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_
diff --git a/vpx_dsp/arm/vpx_convolve8_neon_asm.c b/vpx_dsp/arm/vpx_convolve8_neon_asm.c
index 4470b28b88..c4177c5385 100644
--- a/vpx_dsp/arm/vpx_convolve8_neon_asm.c
+++ b/vpx_dsp/arm/vpx_convolve8_neon_asm.c
@@ -35,7 +35,7 @@
     }                                                                        \
   }
 
-DEFINE_FILTER(horiz);
-DEFINE_FILTER(avg_horiz);
-DEFINE_FILTER(vert);
-DEFINE_FILTER(avg_vert);
+DEFINE_FILTER(horiz)
+DEFINE_FILTER(avg_horiz)
+DEFINE_FILTER(vert)
+DEFINE_FILTER(avg_vert)
diff --git a/vpx_dsp/arm/vpx_convolve8_neon_asm.h b/vpx_dsp/arm/vpx_convolve8_neon_asm.h
index b123d1cb08..f1c7d62ed0 100644
--- a/vpx_dsp/arm/vpx_convolve8_neon_asm.h
+++ b/vpx_dsp/arm/vpx_convolve8_neon_asm.h
@@ -17,13 +17,13 @@
       ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \
       int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-DECLARE_FILTER(horiz, type1);
-DECLARE_FILTER(avg_horiz, type1);
-DECLARE_FILTER(horiz, type2);
-DECLARE_FILTER(avg_horiz, type2);
-DECLARE_FILTER(vert, type1);
-DECLARE_FILTER(avg_vert, type1);
-DECLARE_FILTER(vert, type2);
-DECLARE_FILTER(avg_vert, type2);
+DECLARE_FILTER(horiz, type1)
+DECLARE_FILTER(avg_horiz, type1)
+DECLARE_FILTER(horiz, type2)
+DECLARE_FILTER(avg_horiz, type2)
+DECLARE_FILTER(vert, type1)
+DECLARE_FILTER(avg_vert, type1)
+DECLARE_FILTER(vert, type2)
+DECLARE_FILTER(avg_vert, type2)
 
 #endif  // VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_ASM_H_
diff --git a/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c b/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c
new file mode 100644
index 0000000000..00bac3b9cf
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c
@@ -0,0 +1,1117 @@
+/*
+ *  Copyright (c) 2021 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/vpx_convolve8_neon.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_ports/mem.h"
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
+  0, 1, 2,  3,  1, 2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,
+  4, 5, 6,  7,  5, 6,  7,  8,  6,  7,  8,  9,  7,  8,  9,  10,
+  8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = {
+  0, 8,  16, 24, 1, 9,  17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+  4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
+  /* Shift left and insert new last column in transposed 4x4 block. */
+  1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
+  /* Shift left and insert two new columns in transposed 4x4 block. */
+  2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
+  /* Shift left and insert three new columns in transposed 4x4 block. */
+  3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
+};
+
+static INLINE void vpx_convolve_4tap_2d_horiz_neon_dotprod(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter,
+    const int32x4_t correction, const uint8x16_t range_limit) {
+  uint8x16_t s0, s1, s2, s3;
+
+  if (w == 4) {
+    const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl);
+    int16x4_t d0, d1, d2, d3;
+    uint8x8_t d01, d23;
+
+    do {
+      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      d0 = convolve4_4_sdot(s0, filter, correction, range_limit, perm_tbl);
+      d1 = convolve4_4_sdot(s1, filter, correction, range_limit, perm_tbl);
+      d2 = convolve4_4_sdot(s2, filter, correction, range_limit, perm_tbl);
+      d3 = convolve4_4_sdot(s3, filter, correction, range_limit, perm_tbl);
+      /* We halved the filter values so -1 from right shift. */
+      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 3);
+
+    /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
+     * further details on possible values of block height. */
+    load_u8_16x3(src, src_stride, &s0, &s1, &s2);
+
+    d0 = convolve4_4_sdot(s0, filter, correction, range_limit, perm_tbl);
+    d1 = convolve4_4_sdot(s1, filter, correction, range_limit, perm_tbl);
+    d2 = convolve4_4_sdot(s2, filter, correction, range_limit, perm_tbl);
+    d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+    d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1);
+
+    store_u8(dst + 0 * dst_stride, dst_stride, d01);
+    store_u8_4x1(dst + 2 * dst_stride, d23);
+  } else {
+    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+    const uint8_t *s;
+    uint8_t *d;
+    int width;
+    uint8x8_t d0, d1, d2, d3;
+
+    do {
+      width = w;
+      s = src;
+      d = dst;
+      do {
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        d0 = convolve4_8_sdot(s0, filter, correction, range_limit, perm_tbl);
+        d1 = convolve4_8_sdot(s1, filter, correction, range_limit, perm_tbl);
+        d2 = convolve4_8_sdot(s2, filter, correction, range_limit, perm_tbl);
+        d3 = convolve4_8_sdot(s3, filter, correction, range_limit, perm_tbl);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 3);
+
+    /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
+     * further details on possible values of block height. */
+    width = w;
+    s = src;
+    d = dst;
+    do {
+      load_u8_16x3(s, src_stride, &s0, &s1, &s2);
+
+      d0 = convolve4_8_sdot(s0, filter, correction, range_limit, perm_tbl);
+      d1 = convolve4_8_sdot(s1, filter, correction, range_limit, perm_tbl);
+      d2 = convolve4_8_sdot(s2, filter, correction, range_limit, perm_tbl);
+
+      store_u8_8x3(d, dst_stride, d0, d1, d2);
+
+      s += 8;
+      d += 8;
+      width -= 8;
+    } while (width != 0);
+  }
+}
+
+static INLINE void vpx_convolve_8tap_2d_horiz_neon_dotprod(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter,
+    const int32x4_t correction, const uint8x16_t range_limit) {
+  uint8x16_t s0, s1, s2, s3;
+
+  if (w == 4) {
+    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+    int16x4_t d0, d1, d2, d3;
+    uint8x8_t d01, d23;
+
+    do {
+      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      d0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl);
+      d1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl);
+      d2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl);
+      d3 = convolve8_4_sdot(s3, filter, correction, range_limit, perm_tbl);
+      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 3);
+
+    /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
+     * further details on possible values of block height. */
+    load_u8_16x3(src, src_stride, &s0, &s1, &s2);
+
+    d0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl);
+    d1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl);
+    d2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl);
+    d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+    d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS);
+
+    store_u8(dst + 0 * dst_stride, dst_stride, d01);
+    store_u8_4x1(dst + 2 * dst_stride, d23);
+  } else {
+    const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    const uint8_t *s;
+    uint8_t *d;
+    int width;
+    uint8x8_t d0, d1, d2, d3;
+
+    do {
+      width = w;
+      s = src;
+      d = dst;
+      do {
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl);
+        d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl);
+        d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl);
+        d3 = convolve8_8_sdot(s3, filter, correction, range_limit, perm_tbl);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 3);
+
+    /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
+     * further details on possible values of block height. */
+    width = w;
+    s = src;
+    d = dst;
+    do {
+      load_u8_16x3(s, src_stride, &s0, &s1, &s2);
+
+      d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl);
+      d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl);
+      d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl);
+
+      store_u8_8x3(d, dst_stride, d0, d1, d2);
+
+      s += 8;
+      d += 8;
+      width -= 8;
+    } while (width != 0);
+  }
+}
+
+void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src,
+                                         ptrdiff_t src_stride, uint8_t *dst,
+                                         ptrdiff_t dst_stride,
+                                         const InterpKernel *filter, int x0_q4,
+                                         int x_step_q4, int y0_q4,
+                                         int y_step_q4, int w, int h) {
+  const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
+  const int32x4_t correction_8tap =
+      vdupq_n_s32(vaddlvq_s16(vshll_n_s8(x_filter_8tap, FILTER_BITS)));
+  const uint8x16_t range_limit = vdupq_n_u8(128);
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+  assert(x_step_q4 == 16);
+
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
+    /* All 4-tap and bilinear filter values are even, so halve them to reduce
+     * intermediate precision requirements. Also slide the filter values so the
+     * the 4 taps exist in the first 4 elements of the vector.
+     */
+    const int8x8_t x_filter_4tap =
+        vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2);
+    const int32x4_t correction_4tap = vshrq_n_s32(correction_8tap, 1);
+    vpx_convolve_4tap_2d_horiz_neon_dotprod(src - 1, src_stride, dst,
+                                            dst_stride, w, h, x_filter_4tap,
+                                            correction_4tap, range_limit);
+
+  } else {
+    vpx_convolve_8tap_2d_horiz_neon_dotprod(src - 3, src_stride, dst,
+                                            dst_stride, w, h, x_filter_8tap,
+                                            correction_8tap, range_limit);
+  }
+}
+
+static INLINE void vpx_convolve_4tap_horiz_neon_dotprod(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter,
+    const int32x4_t correction, const uint8x16_t range_limit) {
+  uint8x16_t s0, s1, s2, s3;
+
+  if (w == 4) {
+    const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl);
+    do {
+      int16x4_t t0, t1, t2, t3;
+      uint8x8_t d01, d23;
+
+      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      t0 = convolve4_4_sdot(s0, filter, correction, range_limit, perm_tbl);
+      t1 = convolve4_4_sdot(s1, filter, correction, range_limit, perm_tbl);
+      t2 = convolve4_4_sdot(s2, filter, correction, range_limit, perm_tbl);
+      t3 = convolve4_4_sdot(s3, filter, correction, range_limit, perm_tbl);
+      /* We halved the filter values so -1 from right shift. */
+      d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+      d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+    const uint8_t *s;
+    uint8_t *d;
+    int width;
+    uint8x8_t d0, d1, d2, d3;
+
+    do {
+      width = w;
+      s = src;
+      d = dst;
+      do {
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        d0 = convolve4_8_sdot(s0, filter, correction, range_limit, perm_tbl);
+        d1 = convolve4_8_sdot(s1, filter, correction, range_limit, perm_tbl);
+        d2 = convolve4_8_sdot(s2, filter, correction, range_limit, perm_tbl);
+        d3 = convolve4_8_sdot(s3, filter, correction, range_limit, perm_tbl);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  }
+}
+
+static INLINE void vpx_convolve_8tap_horiz_neon_dotprod(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter,
+    const int32x4_t correction, const uint8x16_t range_limit) {
+  uint8x16_t s0, s1, s2, s3;
+
+  if (w == 4) {
+    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+    do {
+      int16x4_t t0, t1, t2, t3;
+      uint8x8_t d01, d23;
+
+      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      t0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl);
+      t1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl);
+      t2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl);
+      t3 = convolve8_4_sdot(s3, filter, correction, range_limit, perm_tbl);
+      d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    const uint8_t *s;
+    uint8_t *d;
+    int width;
+    uint8x8_t d0, d1, d2, d3;
+
+    do {
+      width = w;
+      s = src;
+      d = dst;
+      do {
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl);
+        d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl);
+        d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl);
+        d3 = convolve8_8_sdot(s3, filter, correction, range_limit, perm_tbl);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  }
+}
+
+void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
+                                      uint8_t *dst, ptrdiff_t dst_stride,
+                                      const InterpKernel *filter, int x0_q4,
+                                      int x_step_q4, int y0_q4, int y_step_q4,
+                                      int w, int h) {
+  const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
+  const int32x4_t correction_8tap =
+      vdupq_n_s32(vaddlvq_s16(vshll_n_s8(x_filter_8tap, FILTER_BITS)));
+  const uint8x16_t range_limit = vdupq_n_u8(128);
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+  assert(x_step_q4 == 16);
+
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
+    /* All 4-tap and bilinear filter values are even, so halve them to reduce
+     * intermediate precision requirements. Also slide the filter values so the
+     * the 4 taps exist in the first 4 elements of the vector.
+     */
+    const int8x8_t x_filter_4tap =
+        vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2);
+    const int32x4_t correction_4tap = vshrq_n_s32(correction_8tap, 1);
+    vpx_convolve_4tap_horiz_neon_dotprod(src - 1, src_stride, dst, dst_stride,
+                                         w, h, x_filter_4tap, correction_4tap,
+                                         range_limit);
+
+  } else {
+    vpx_convolve_8tap_horiz_neon_dotprod(src - 3, src_stride, dst, dst_stride,
+                                         w, h, x_filter_8tap, correction_8tap,
+                                         range_limit);
+  }
+}
+
+void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src,
+                                          ptrdiff_t src_stride, uint8_t *dst,
+                                          ptrdiff_t dst_stride,
+                                          const InterpKernel *filter, int x0_q4,
+                                          int x_step_q4, int y0_q4,
+                                          int y_step_q4, int w, int h) {
+  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
+  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128);
+  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
+  const uint8x16_t range_limit = vdupq_n_u8(128);
+  uint8x16_t s0, s1, s2, s3;
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+  assert(x_step_q4 == 16);
+
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  src -= 3;
+
+  if (w == 4) {
+    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+    do {
+      int16x4_t t0, t1, t2, t3;
+      uint8x8_t d01, d23, dd01, dd23;
+
+      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      t0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl);
+      t1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl);
+      t2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl);
+      t3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl);
+      d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
+
+      dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+      dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+
+      d01 = vrhadd_u8(d01, dd01);
+      d23 = vrhadd_u8(d23, dd23);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    const uint8_t *s;
+    uint8_t *d;
+    int width;
+    uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
+
+    do {
+      width = w;
+      s = src;
+      d = dst;
+      do {
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl);
+        d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl);
+        d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl);
+        d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl);
+
+        load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+        d0 = vrhadd_u8(d0, dd0);
+        d1 = vrhadd_u8(d1, dd1);
+        d2 = vrhadd_u8(d2, dd2);
+        d3 = vrhadd_u8(d3, dd3);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  }
+}
+
+static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
+                                        int8x8_t a3, int8x16_t *b,
+                                        const uint8x16_t permute_tbl) {
+  /* Transpose 8-bit elements and concatenate result rows as follows:
+   * a0: 00, 01, 02, 03, XX, XX, XX, XX
+   * a1: 10, 11, 12, 13, XX, XX, XX, XX
+   * a2: 20, 21, 22, 23, XX, XX, XX, XX
+   * a3: 30, 31, 32, 33, XX, XX, XX, XX
+   *
+   * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+   *
+   * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+   * as an argument is preferable to loading it directly from memory as this
+   * inline helper is called many times from the same parent function.
+   */
+
+  int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
+  *b = vqtbl2q_s8(samples, permute_tbl);
+}
+
+static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
+                                        int8x8_t a3, int8x16_t *b0,
+                                        int8x16_t *b1,
+                                        const uint8x16x2_t permute_tbl) {
+  /* Transpose 8-bit elements and concatenate result rows as follows:
+   * a0: 00, 01, 02, 03, 04, 05, 06, 07
+   * a1: 10, 11, 12, 13, 14, 15, 16, 17
+   * a2: 20, 21, 22, 23, 24, 25, 26, 27
+   * a3: 30, 31, 32, 33, 34, 35, 36, 37
+   *
+   * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+   * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
+   *
+   * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+   * as an argument is preferable to loading it directly from memory as this
+   * inline helper is called many times from the same parent function.
+   */
+
+  int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
+  *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]);
+  *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]);
+}
+
+static INLINE void vpx_convolve_4tap_vert_neon_dotprod(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter,
+    const int32x4_t correction, const uint8x8_t range_limit) {
+  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+  uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+  int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+  int8x16x2_t samples_LUT;
+
+  if (w == 4) {
+    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+    int8x16_t s0123, s1234, s2345, s3456, s78910;
+    int16x4_t d0, d1, d2, d3;
+    uint8x8_t d01, d23;
+
+    load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+    src += 7 * src_stride;
+
+    /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+    s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+    s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+    s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+    s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+    s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+    s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+    s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
+
+    /* This operation combines a conventional transpose and the sample permute
+     * (see horizontal case) required before computing the dot product.
+     */
+    transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+    transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+    transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+    transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+
+    do {
+      uint8x8_t t7, t8, t9, t10;
+      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+
+      s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+      s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+      s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+      s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+
+      transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+
+      d0 = convolve4_4_sdot_partial(s0123, correction, filter);
+      d1 = convolve4_4_sdot_partial(s1234, correction, filter);
+      d2 = convolve4_4_sdot_partial(s2345, correction, filter);
+      d3 = convolve4_4_sdot_partial(s3456, correction, filter);
+      /* We halved the filter values so -1 from right shift. */
+      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      /* Merge new data into block from previous iteration. */
+      samples_LUT.val[0] = s3456;
+      samples_LUT.val[1] = s78910;
+      s0123 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+      s1234 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+      s2345 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+      s3456 = s78910;
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+    int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+        s3456_lo, s3456_hi, s78910_lo, s78910_hi;
+    uint8x8_t d0, d1, d2, d3;
+    const uint8_t *s;
+    uint8_t *d;
+    int height;
+
+    do {
+      height = h;
+      s = src;
+      d = dst;
+
+      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+      s += 7 * src_stride;
+
+      /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+      s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+      s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+      s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+      s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+      s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+      s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+      s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
+
+      /* This operation combines a conventional transpose and the sample permute
+       * (see horizontal case) required before computing the dot product.
+       */
+      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
+                           tran_concat_tbl);
+
+      do {
+        uint8x8_t t7, t8, t9, t10;
+        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
+
+        s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+        s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+        s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+        s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+
+        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
+                             tran_concat_tbl);
+
+        d0 = convolve4_8_sdot_partial(s0123_lo, s0123_hi, correction, filter);
+        d1 = convolve4_8_sdot_partial(s1234_lo, s1234_hi, correction, filter);
+        d2 = convolve4_8_sdot_partial(s2345_lo, s2345_hi, correction, filter);
+        d3 = convolve4_8_sdot_partial(s3456_lo, s3456_hi, correction, filter);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        /* Merge new data into block from previous iteration. */
+        samples_LUT.val[0] = s3456_lo;
+        samples_LUT.val[1] = s78910_lo;
+        s0123_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+        s1234_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+        s2345_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+        s3456_lo = s78910_lo;
+
+        samples_LUT.val[0] = s3456_hi;
+        samples_LUT.val[1] = s78910_hi;
+        s0123_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+        s1234_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+        s2345_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+        s3456_hi = s78910_hi;
+
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
+static INLINE void vpx_convolve_8tap_vert_neon_dotprod(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter,
+    const int32x4_t correction, const uint8x8_t range_limit) {
+  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+  uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+  int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+  int8x16x2_t samples_LUT;
+
+  if (w == 4) {
+    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+    int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
+    int16x4_t d0, d1, d2, d3;
+    uint8x8_t d01, d23;
+
+    load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+    src += 7 * src_stride;
+
+    /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+    s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+    s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+    s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+    s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+    s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+    s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+    s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
+
+    /* This operation combines a conventional transpose and the sample permute
+     * (see horizontal case) required before computing the dot product.
+     */
+    transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+    transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+    transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+    transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+
+    do {
+      uint8x8_t t7, t8, t9, t10;
+
+      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+
+      s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+      s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+      s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+      s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+
+      transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+
+      /* Merge new data into block from previous iteration. */
+      samples_LUT.val[0] = s3456;
+      samples_LUT.val[1] = s78910;
+      s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+      s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+      s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+      d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filter);
+      d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filter);
+      d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filter);
+      d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filter);
+      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      /* Prepare block for next iteration - re-using as much as possible. */
+      /* Shuffle everything up four rows. */
+      s0123 = s4567;
+      s1234 = s5678;
+      s2345 = s6789;
+      s3456 = s78910;
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+    int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+        s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+        s6789_hi, s78910_lo, s78910_hi;
+    uint8x8_t d0, d1, d2, d3;
+    const uint8_t *s;
+    uint8_t *d;
+    int height;
+
+    do {
+      height = h;
+      s = src;
+      d = dst;
+
+      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+      s += 7 * src_stride;
+
+      /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+      s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+      s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+      s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+      s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+      s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+      s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+      s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
+
+      /* This operation combines a conventional transpose and the sample permute
+       * (see horizontal case) required before computing the dot product.
+       */
+      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
+                           tran_concat_tbl);
+
+      do {
+        uint8x8_t t7, t8, t9, t10;
+
+        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
+
+        s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+        s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+        s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+        s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+
+        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
+                             tran_concat_tbl);
+
+        /* Merge new data into block from previous iteration. */
+        samples_LUT.val[0] = s3456_lo;
+        samples_LUT.val[1] = s78910_lo;
+        s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+        samples_LUT.val[0] = s3456_hi;
+        samples_LUT.val[1] = s78910_hi;
+        s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+        d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+                                      correction, filter);
+        d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+                                      correction, filter);
+        d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+                                      correction, filter);
+        d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+                                      correction, filter);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        /* Prepare block for next iteration - re-using as much as possible. */
+        /* Shuffle everything up four rows. */
+        s0123_lo = s4567_lo;
+        s0123_hi = s4567_hi;
+        s1234_lo = s5678_lo;
+        s1234_hi = s5678_hi;
+        s2345_lo = s6789_lo;
+        s2345_hi = s6789_hi;
+        s3456_lo = s78910_lo;
+        s3456_hi = s78910_hi;
+
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
+void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
+                                     uint8_t *dst, ptrdiff_t dst_stride,
+                                     const InterpKernel *filter, int x0_q4,
+                                     int x_step_q4, int y0_q4, int y_step_q4,
+                                     int w, int h) {
+  const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4]));
+  const int32x4_t correction_8tap =
+      vdupq_n_s32(vaddlvq_s16(vshll_n_s8(y_filter_8tap, FILTER_BITS)));
+  const uint8x8_t range_limit = vdup_n_u8(128);
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+  assert(y_step_q4 == 16);
+
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  if (vpx_get_filter_taps(filter[y0_q4]) <= 4) {
+    /* All 4-tap and bilinear filter values are even, so halve them to reduce
+     * intermediate precision requirements. Also slide the filter values so the
+     * the 4 taps exist in the first 4 elements of the vector.
+     */
+    const int8x8_t y_filter_4tap =
+        vext_s8(vshr_n_s8(y_filter_8tap, 1), vdup_n_s8(0), 2);
+    const int32x4_t correction_4tap = vshrq_n_s32(correction_8tap, 1);
+    vpx_convolve_4tap_vert_neon_dotprod(src - src_stride, src_stride, dst,
+                                        dst_stride, w, h, y_filter_4tap,
+                                        correction_4tap, range_limit);
+  } else {
+    vpx_convolve_8tap_vert_neon_dotprod(src - 3 * src_stride, src_stride, dst,
+                                        dst_stride, w, h, y_filter_8tap,
+                                        correction_8tap, range_limit);
+  }
+}
+
+void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src,
+                                         ptrdiff_t src_stride, uint8_t *dst,
+                                         ptrdiff_t dst_stride,
+                                         const InterpKernel *filter, int x0_q4,
+                                         int x_step_q4, int y0_q4,
+                                         int y_step_q4, int w, int h) {
+  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
+  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128);
+  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
+  const uint8x8_t range_limit = vdup_n_u8(128);
+  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+  uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+  int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+  int8x16x2_t samples_LUT;
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+  assert(y_step_q4 == 16);
+
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  src -= 3 * src_stride;
+
+  if (w == 4) {
+    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+    int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
+    int16x4_t d0, d1, d2, d3;
+    uint8x8_t d01, d23, dd01, dd23;
+
+    load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+    src += 7 * src_stride;
+
+    /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+    s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+    s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+    s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+    s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+    s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+    s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+    s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
+
+    /* This operation combines a conventional transpose and the sample permute
+     * (see horizontal case) required before computing the dot product.
+     */
+    transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+    transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+    transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+    transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+
+    do {
+      uint8x8_t t7, t8, t9, t10;
+
+      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+
+      s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+      s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+      s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+      s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+
+      transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+
+      /* Merge new data into block from previous iteration. */
+      samples_LUT.val[0] = s3456;
+      samples_LUT.val[1] = s78910;
+      s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+      s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+      s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+      d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters);
+      d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters);
+      d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters);
+      d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters);
+      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+      dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+      dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+
+      d01 = vrhadd_u8(d01, dd01);
+      d23 = vrhadd_u8(d23, dd23);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      /* Prepare block for next iteration - re-using as much as possible. */
+      /* Shuffle everything up four rows. */
+      s0123 = s4567;
+      s1234 = s5678;
+      s2345 = s6789;
+      s3456 = s78910;
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+    int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+        s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+        s6789_hi, s78910_lo, s78910_hi;
+    uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
+    const uint8_t *s;
+    uint8_t *d;
+    int height;
+
+    do {
+      height = h;
+      s = src;
+      d = dst;
+
+      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+      s += 7 * src_stride;
+
+      /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+      s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+      s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+      s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+      s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+      s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+      s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+      s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
+
+      /* This operation combines a conventional transpose and the sample permute
+       * (see horizontal case) required before computing the dot product.
+       */
+      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
+                           tran_concat_tbl);
+
+      do {
+        uint8x8_t t7, t8, t9, t10;
+
+        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
+
+        s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+        s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+        s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+        s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+
+        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
+                             tran_concat_tbl);
+
+        /* Merge new data into block from previous iteration. */
+        samples_LUT.val[0] = s3456_lo;
+        samples_LUT.val[1] = s78910_lo;
+        s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+        samples_LUT.val[0] = s3456_hi;
+        samples_LUT.val[1] = s78910_hi;
+        s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+        d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+                                      correction, filters);
+        d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+                                      correction, filters);
+        d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+                                      correction, filters);
+        d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+                                      correction, filters);
+
+        load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+        d0 = vrhadd_u8(d0, dd0);
+        d1 = vrhadd_u8(d1, dd1);
+        d2 = vrhadd_u8(d2, dd2);
+        d3 = vrhadd_u8(d3, dd3);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        /* Prepare block for next iteration - re-using as much as possible. */
+        /* Shuffle everything up four rows. */
+        s0123_lo = s4567_lo;
+        s0123_hi = s4567_hi;
+        s1234_lo = s5678_lo;
+        s1234_hi = s5678_hi;
+        s2345_lo = s6789_lo;
+        s2345_hi = s6789_hi;
+        s3456_lo = s78910_lo;
+        s3456_hi = s78910_hi;
+
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
diff --git a/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c b/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c
new file mode 100644
index 0000000000..bcad1dd121
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c
@@ -0,0 +1,989 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/vpx_convolve8_neon.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_ports/mem.h"
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
+  0, 1, 2,  3,  1, 2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,
+  4, 5, 6,  7,  5, 6,  7,  8,  6,  7,  8,  9,  7,  8,  9,  10,
+  8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = {
+  0, 8,  16, 24, 1, 9,  17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+  4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
+  /* Shift left and insert new last column in transposed 4x4 block. */
+  1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
+  /* Shift left and insert two new columns in transposed 4x4 block. */
+  2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
+  /* Shift left and insert three new columns in transposed 4x4 block. */
+  3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
+};
+
+static INLINE void vpx_convolve_4tap_2d_horiz_neon_i8mm(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
+  uint8x16_t s0, s1, s2, s3;
+
+  if (w == 4) {
+    const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl);
+    int16x4_t d0, d1, d2, d3;
+    uint8x8_t d01, d23;
+
+    do {
+      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      d0 = convolve4_4_usdot(s0, filter, perm_tbl);
+      d1 = convolve4_4_usdot(s1, filter, perm_tbl);
+      d2 = convolve4_4_usdot(s2, filter, perm_tbl);
+      d3 = convolve4_4_usdot(s3, filter, perm_tbl);
+      /* We halved the filter values so -1 from right shift. */
+      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 3);
+
+    /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
+     * further details on possible values of block height. */
+    load_u8_16x3(src, src_stride, &s0, &s1, &s2);
+
+    d0 = convolve4_4_usdot(s0, filter, perm_tbl);
+    d1 = convolve4_4_usdot(s1, filter, perm_tbl);
+    d2 = convolve4_4_usdot(s2, filter, perm_tbl);
+    /* We halved the filter values so -1 from right shift. */
+    d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+    d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1);
+
+    store_u8(dst + 0 * dst_stride, dst_stride, d01);
+    store_u8_4x1(dst + 2 * dst_stride, d23);
+  } else {
+    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+    const uint8_t *s;
+    uint8_t *d;
+    int width;
+    uint8x8_t d0, d1, d2, d3;
+
+    do {
+      width = w;
+      s = src;
+      d = dst;
+      do {
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        d0 = convolve4_8_usdot(s0, filter, perm_tbl);
+        d1 = convolve4_8_usdot(s1, filter, perm_tbl);
+        d2 = convolve4_8_usdot(s2, filter, perm_tbl);
+        d3 = convolve4_8_usdot(s3, filter, perm_tbl);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width > 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 3);
+
+    /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
+     * further details on possible values of block height. */
+    width = w;
+    s = src;
+    d = dst;
+    do {
+      load_u8_16x3(s, src_stride, &s0, &s1, &s2);
+
+      d0 = convolve4_8_usdot(s0, filter, perm_tbl);
+      d1 = convolve4_8_usdot(s1, filter, perm_tbl);
+      d2 = convolve4_8_usdot(s2, filter, perm_tbl);
+
+      store_u8_8x3(d, dst_stride, d0, d1, d2);
+
+      s += 8;
+      d += 8;
+      width -= 8;
+    } while (width > 0);
+  }
+}
+
+static INLINE void vpx_convolve_8tap_2d_horiz_neon_i8mm(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
+  uint8x16_t s0, s1, s2, s3;
+
+  if (w == 4) {
+    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+    int16x4_t d0, d1, d2, d3;
+    uint8x8_t d01, d23;
+
+    do {
+      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      d0 = convolve8_4_usdot(s0, filter, perm_tbl);
+      d1 = convolve8_4_usdot(s1, filter, perm_tbl);
+      d2 = convolve8_4_usdot(s2, filter, perm_tbl);
+      d3 = convolve8_4_usdot(s3, filter, perm_tbl);
+      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 3);
+
+    /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
+     * further details on possible values of block height. */
+    load_u8_16x3(src, src_stride, &s0, &s1, &s2);
+
+    d0 = convolve8_4_usdot(s0, filter, perm_tbl);
+    d1 = convolve8_4_usdot(s1, filter, perm_tbl);
+    d2 = convolve8_4_usdot(s2, filter, perm_tbl);
+    d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+    d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS);
+
+    store_u8(dst + 0 * dst_stride, dst_stride, d01);
+    store_u8_4x1(dst + 2 * dst_stride, d23);
+  } else {
+    const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    const uint8_t *s;
+    uint8_t *d;
+    int width;
+    uint8x8_t d0, d1, d2, d3;
+
+    do {
+      width = w;
+      s = src;
+      d = dst;
+      do {
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        d0 = convolve8_8_usdot(s0, filter, perm_tbl);
+        d1 = convolve8_8_usdot(s1, filter, perm_tbl);
+        d2 = convolve8_8_usdot(s2, filter, perm_tbl);
+        d3 = convolve8_8_usdot(s3, filter, perm_tbl);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width > 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 3);
+
+    /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for
+     * further details on possible values of block height. */
+    width = w;
+    s = src;
+    d = dst;
+    do {
+      load_u8_16x3(s, src_stride, &s0, &s1, &s2);
+
+      d0 = convolve8_8_usdot(s0, filter, perm_tbl);
+      d1 = convolve8_8_usdot(s1, filter, perm_tbl);
+      d2 = convolve8_8_usdot(s2, filter, perm_tbl);
+
+      store_u8_8x3(d, dst_stride, d0, d1, d2);
+
+      s += 8;
+      d += 8;
+      width -= 8;
+    } while (width > 0);
+  }
+}
+
+void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+                                      uint8_t *dst, ptrdiff_t dst_stride,
+                                      const InterpKernel *filter, int x0_q4,
+                                      int x_step_q4, int y0_q4, int y_step_q4,
+                                      int w, int h) {
+  const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+  assert(x_step_q4 == 16);
+
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
+    /* All 4-tap and bilinear filter values are even, so halve them to reduce
+     * intermediate precision requirements. Also slide the filter values so the
+     * the 4 taps exist in the first 4 elements of the vector.
+     */
+    const int8x8_t x_filter_4tap =
+        vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2);
+    vpx_convolve_4tap_2d_horiz_neon_i8mm(src - 1, src_stride, dst, dst_stride,
+                                         w, h, x_filter_4tap);
+
+  } else {
+    vpx_convolve_8tap_2d_horiz_neon_i8mm(src - 3, src_stride, dst, dst_stride,
+                                         w, h, x_filter_8tap);
+  }
+}
+
+static INLINE void vpx_convolve_4tap_horiz_neon_i8mm(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
+  uint8x16_t s0, s1, s2, s3;
+
+  if (w == 4) {
+    const uint8x16_t perm_tbl = vld1q_u8(dot_prod_permute_tbl);
+    do {
+      int16x4_t t0, t1, t2, t3;
+      uint8x8_t d01, d23;
+
+      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      t0 = convolve4_4_usdot(s0, filter, perm_tbl);
+      t1 = convolve4_4_usdot(s1, filter, perm_tbl);
+      t2 = convolve4_4_usdot(s2, filter, perm_tbl);
+      t3 = convolve4_4_usdot(s3, filter, perm_tbl);
+      /* We halved the filter values so -1 from right shift. */
+      d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+      d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+    const uint8_t *s;
+    uint8_t *d;
+    int width;
+    uint8x8_t d0, d1, d2, d3;
+
+    do {
+      width = w;
+      s = src;
+      d = dst;
+      do {
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        d0 = convolve4_8_usdot(s0, filter, perm_tbl);
+        d1 = convolve4_8_usdot(s1, filter, perm_tbl);
+        d2 = convolve4_8_usdot(s2, filter, perm_tbl);
+        d3 = convolve4_8_usdot(s3, filter, perm_tbl);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  }
+}
+
+static INLINE void vpx_convolve_8tap_horiz_neon_i8mm(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
+  uint8x16_t s0, s1, s2, s3;
+
+  if (w == 4) {
+    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+    do {
+      int16x4_t t0, t1, t2, t3;
+      uint8x8_t d01, d23;
+
+      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      t0 = convolve8_4_usdot(s0, filter, perm_tbl);
+      t1 = convolve8_4_usdot(s1, filter, perm_tbl);
+      t2 = convolve8_4_usdot(s2, filter, perm_tbl);
+      t3 = convolve8_4_usdot(s3, filter, perm_tbl);
+      d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    const uint8_t *s;
+    uint8_t *d;
+    int width;
+    uint8x8_t d0, d1, d2, d3;
+
+    do {
+      width = w;
+      s = src;
+      d = dst;
+      do {
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        d0 = convolve8_8_usdot(s0, filter, perm_tbl);
+        d1 = convolve8_8_usdot(s1, filter, perm_tbl);
+        d2 = convolve8_8_usdot(s2, filter, perm_tbl);
+        d3 = convolve8_8_usdot(s3, filter, perm_tbl);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  }
+}
+
+void vpx_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+                                   uint8_t *dst, ptrdiff_t dst_stride,
+                                   const InterpKernel *filter, int x0_q4,
+                                   int x_step_q4, int y0_q4, int y_step_q4,
+                                   int w, int h) {
+  const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+  assert(x_step_q4 == 16);
+
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
+    /* All 4-tap and bilinear filter values are even, so halve them to reduce
+     * intermediate precision requirements. Also slide the filter values so the
+     * the 4 taps exist in the first 4 elements of the vector.
+     */
+    const int8x8_t x_filter_4tap =
+        vext_s8(vshr_n_s8(x_filter_8tap, 1), vdup_n_s8(0), 2);
+    vpx_convolve_4tap_horiz_neon_i8mm(src - 1, src_stride, dst, dst_stride, w,
+                                      h, x_filter_4tap);
+
+  } else {
+    vpx_convolve_8tap_horiz_neon_i8mm(src - 3, src_stride, dst, dst_stride, w,
+                                      h, x_filter_8tap);
+  }
+}
+
+void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+                                       uint8_t *dst, ptrdiff_t dst_stride,
+                                       const InterpKernel *filter, int x0_q4,
+                                       int x_step_q4, int y0_q4, int y_step_q4,
+                                       int w, int h) {
+  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
+  uint8x16_t s0, s1, s2, s3;
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+  assert(x_step_q4 == 16);
+
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  src -= 3;
+
+  if (w == 4) {
+    const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+    do {
+      int16x4_t t0, t1, t2, t3;
+      uint8x8_t d01, d23, dd01, dd23;
+
+      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      t0 = convolve8_4_usdot(s0, filters, perm_tbl);
+      t1 = convolve8_4_usdot(s1, filters, perm_tbl);
+      t2 = convolve8_4_usdot(s2, filters, perm_tbl);
+      t3 = convolve8_4_usdot(s3, filters, perm_tbl);
+      d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
+
+      dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+      dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+
+      d01 = vrhadd_u8(d01, dd01);
+      d23 = vrhadd_u8(d23, dd23);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+    const uint8_t *s;
+    uint8_t *d;
+    int width;
+    uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
+
+    do {
+      width = w;
+      s = src;
+      d = dst;
+      do {
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        d0 = convolve8_8_usdot(s0, filters, perm_tbl);
+        d1 = convolve8_8_usdot(s1, filters, perm_tbl);
+        d2 = convolve8_8_usdot(s2, filters, perm_tbl);
+        d3 = convolve8_8_usdot(s3, filters, perm_tbl);
+
+        load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+        d0 = vrhadd_u8(d0, dd0);
+        d1 = vrhadd_u8(d1, dd1);
+        d2 = vrhadd_u8(d2, dd2);
+        d3 = vrhadd_u8(d3, dd3);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  }
+}
+
+static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1,
+                                        uint8x8_t a2, uint8x8_t a3,
+                                        uint8x16_t *b,
+                                        const uint8x16_t permute_tbl) {
+  /* Transpose 8-bit elements and concatenate result rows as follows:
+   * a0: 00, 01, 02, 03, XX, XX, XX, XX
+   * a1: 10, 11, 12, 13, XX, XX, XX, XX
+   * a2: 20, 21, 22, 23, XX, XX, XX, XX
+   * a3: 30, 31, 32, 33, XX, XX, XX, XX
+   *
+   * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+   *
+   * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+   * as an argument is preferable to loading it directly from memory as this
+   * inline helper is called many times from the same parent function.
+   */
+
+  uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
+  *b = vqtbl2q_u8(samples, permute_tbl);
+}
+
+static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1,
+                                        uint8x8_t a2, uint8x8_t a3,
+                                        uint8x16_t *b0, uint8x16_t *b1,
+                                        const uint8x16x2_t permute_tbl) {
+  /* Transpose 8-bit elements and concatenate result rows as follows:
+   * a0: 00, 01, 02, 03, 04, 05, 06, 07
+   * a1: 10, 11, 12, 13, 14, 15, 16, 17
+   * a2: 20, 21, 22, 23, 24, 25, 26, 27
+   * a3: 30, 31, 32, 33, 34, 35, 36, 37
+   *
+   * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+   * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
+   *
+   * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+   * as an argument is preferable to loading it directly from memory as this
+   * inline helper is called many times from the same parent function.
+   */
+
+  uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
+  *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]);
+  *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]);
+}
+
+static INLINE void vpx_convolve_4tap_vert_neon_i8mm(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
+  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+  uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+  uint8x16x2_t samples_LUT;
+
+  if (w == 4) {
+    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+    uint8x16_t s0123, s1234, s2345, s3456, s78910;
+    int16x4_t d0, d1, d2, d3;
+    uint8x8_t d01, d23;
+
+    load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+    src += 7 * src_stride;
+
+    /* This operation combines a conventional transpose and the sample permute
+     * (see horizontal case) required before computing the dot product.
+     */
+    transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+    transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+    transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+    transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+
+    do {
+      load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
+
+      transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+
+      d0 = convolve4_4_usdot_partial(s0123, filter);
+      d1 = convolve4_4_usdot_partial(s1234, filter);
+      d2 = convolve4_4_usdot_partial(s2345, filter);
+      d3 = convolve4_4_usdot_partial(s3456, filter);
+      /* We halved the filter values so -1 from right shift. */
+      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      /* Merge new data into block from previous iteration. */
+      samples_LUT.val[0] = s3456;
+      samples_LUT.val[1] = s78910;
+      s0123 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+      s1234 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+      s2345 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+      s3456 = s78910;
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+    uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+        s3456_lo, s3456_hi, s78910_lo, s78910_hi;
+    uint8x8_t d0, d1, d2, d3;
+    const uint8_t *s;
+    uint8_t *d;
+    int height;
+
+    do {
+      height = h;
+      s = src;
+      d = dst;
+
+      load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+      s += 7 * src_stride;
+
+      /* This operation combines a conventional transpose and the sample permute
+       * (see horizontal case) required before computing the dot product.
+       */
+      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
+                           tran_concat_tbl);
+
+      do {
+        load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
+                             tran_concat_tbl);
+
+        d0 = convolve4_8_usdot_partial(s0123_lo, s0123_hi, filter);
+        d1 = convolve4_8_usdot_partial(s1234_lo, s1234_hi, filter);
+        d2 = convolve4_8_usdot_partial(s2345_lo, s2345_hi, filter);
+        d3 = convolve4_8_usdot_partial(s3456_lo, s3456_hi, filter);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        /* Merge new data into block from previous iteration. */
+        samples_LUT.val[0] = s3456_lo;
+        samples_LUT.val[1] = s78910_lo;
+        s0123_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+        s1234_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+        s2345_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+        s3456_lo = s78910_lo;
+
+        samples_LUT.val[0] = s3456_hi;
+        samples_LUT.val[1] = s78910_hi;
+        s0123_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+        s1234_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+        s2345_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+        s3456_hi = s78910_hi;
+
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
+static INLINE void vpx_convolve_8tap_vert_neon_i8mm(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
+  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+  uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+  uint8x16x2_t samples_LUT;
+
+  if (w == 4) {
+    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+    uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
+    int16x4_t d0, d1, d2, d3;
+    uint8x8_t d01, d23;
+
+    load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+    src += 7 * src_stride;
+
+    /* This operation combines a conventional transpose and the sample permute
+     * (see horizontal case) required before computing the dot product.
+     */
+    transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+    transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+    transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+    transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+
+    do {
+      load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
+
+      transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+
+      /* Merge new data into block from previous iteration. */
+      samples_LUT.val[0] = s3456;
+      samples_LUT.val[1] = s78910;
+      s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+      s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+      s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+      d0 = convolve8_4_usdot_partial(s0123, s4567, filter);
+      d1 = convolve8_4_usdot_partial(s1234, s5678, filter);
+      d2 = convolve8_4_usdot_partial(s2345, s6789, filter);
+      d3 = convolve8_4_usdot_partial(s3456, s78910, filter);
+      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      /* Prepare block for next iteration - re-using as much as possible. */
+      /* Shuffle everything up four rows. */
+      s0123 = s4567;
+      s1234 = s5678;
+      s2345 = s6789;
+      s3456 = s78910;
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+    uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+        s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+        s6789_hi, s78910_lo, s78910_hi;
+    uint8x8_t d0, d1, d2, d3;
+    const uint8_t *s;
+    uint8_t *d;
+    int height;
+
+    do {
+      height = h;
+      s = src;
+      d = dst;
+
+      load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+      s += 7 * src_stride;
+
+      /* This operation combines a conventional transpose and the sample permute
+       * (see horizontal case) required before computing the dot product.
+       */
+      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
+                           tran_concat_tbl);
+
+      do {
+        load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
+                             tran_concat_tbl);
+
+        /* Merge new data into block from previous iteration. */
+        samples_LUT.val[0] = s3456_lo;
+        samples_LUT.val[1] = s78910_lo;
+        s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+        samples_LUT.val[0] = s3456_hi;
+        samples_LUT.val[1] = s78910_hi;
+        s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+        d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+                                       filter);
+        d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+                                       filter);
+        d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+                                       filter);
+        d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+                                       filter);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        /* Prepare block for next iteration - re-using as much as possible. */
+        /* Shuffle everything up four rows. */
+        s0123_lo = s4567_lo;
+        s0123_hi = s4567_hi;
+        s1234_lo = s5678_lo;
+        s1234_hi = s5678_hi;
+        s2345_lo = s6789_lo;
+        s2345_hi = s6789_hi;
+        s3456_lo = s78910_lo;
+        s3456_hi = s78910_hi;
+
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
+void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const InterpKernel *filter, int x0_q4,
+                                  int x_step_q4, int y0_q4, int y_step_q4,
+                                  int w, int h) {
+  const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4]));
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+  assert(y_step_q4 == 16);
+
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  if (vpx_get_filter_taps(filter[y0_q4]) <= 4) {
+    /* All 4-tap and bilinear filter values are even, so halve them to reduce
+     * intermediate precision requirements. Also slide the filter values so the
+     * the 4 taps exist in the first 4 elements of the vector.
+     */
+    const int8x8_t y_filter_4tap =
+        vext_s8(vshr_n_s8(y_filter_8tap, 1), vdup_n_s8(0), 2);
+    vpx_convolve_4tap_vert_neon_i8mm(src - src_stride, src_stride, dst,
+                                     dst_stride, w, h, y_filter_4tap);
+  } else {
+    vpx_convolve_8tap_vert_neon_i8mm(src - 3 * src_stride, src_stride, dst,
+                                     dst_stride, w, h, y_filter_8tap);
+  }
+}
+
+void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+                                      uint8_t *dst, ptrdiff_t dst_stride,
+                                      const InterpKernel *filter, int x0_q4,
+                                      int x_step_q4, int y0_q4, int y_step_q4,
+                                      int w, int h) {
+  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
+  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+  uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+  uint8x16x2_t samples_LUT;
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+  assert(y_step_q4 == 16);
+
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  src -= 3 * src_stride;
+
+  if (w == 4) {
+    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+    uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
+    int16x4_t d0, d1, d2, d3;
+    uint8x8_t d01, d23, dd01, dd23;
+
+    load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+    src += 7 * src_stride;
+
+    /* This operation combines a conventional transpose and the sample permute
+     * (see horizontal case) required before computing the dot product.
+     */
+    transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+    transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+    transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+    transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+
+    do {
+      load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
+
+      transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+
+      /* Merge new data into block from previous iteration. */
+      samples_LUT.val[0] = s3456;
+      samples_LUT.val[1] = s78910;
+      s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+      s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+      s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+      d0 = convolve8_4_usdot_partial(s0123, s4567, filters);
+      d1 = convolve8_4_usdot_partial(s1234, s5678, filters);
+      d2 = convolve8_4_usdot_partial(s2345, s6789, filters);
+      d3 = convolve8_4_usdot_partial(s3456, s78910, filters);
+      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+      dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+      dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+
+      d01 = vrhadd_u8(d01, dd01);
+      d23 = vrhadd_u8(d23, dd23);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      /* Prepare block for next iteration - re-using as much as possible. */
+      /* Shuffle everything up four rows. */
+      s0123 = s4567;
+      s1234 = s5678;
+      s2345 = s6789;
+      s3456 = s78910;
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+    uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+        s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+        s6789_hi, s78910_lo, s78910_hi;
+    uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
+    const uint8_t *s;
+    uint8_t *d;
+    int height;
+
+    do {
+      height = h;
+      s = src;
+      d = dst;
+
+      load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+      s += 7 * src_stride;
+
+      /* This operation combines a conventional transpose and the sample permute
+       * (see horizontal case) required before computing the dot product.
+       */
+      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
+                           tran_concat_tbl);
+
+      do {
+        load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
+                             tran_concat_tbl);
+
+        /* Merge new data into block from previous iteration. */
+        samples_LUT.val[0] = s3456_lo;
+        samples_LUT.val[1] = s78910_lo;
+        s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+        samples_LUT.val[0] = s3456_hi;
+        samples_LUT.val[1] = s78910_hi;
+        s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+        d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+                                       filters);
+        d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+                                       filters);
+        d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+                                       filters);
+        d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+                                       filters);
+
+        load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+        d0 = vrhadd_u8(d0, dd0);
+        d1 = vrhadd_u8(d1, dd1);
+        d2 = vrhadd_u8(d2, dd2);
+        d3 = vrhadd_u8(d3, dd3);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        /* Prepare block for next iteration - re-using as much as possible. */
+        /* Shuffle everything up four rows. */
+        s0123_lo = s4567_lo;
+        s0123_hi = s4567_hi;
+        s1234_lo = s5678_lo;
+        s1234_hi = s5678_hi;
+        s2345_lo = s6789_lo;
+        s2345_hi = s6789_hi;
+        s3456_lo = s78910_lo;
+        s3456_hi = s78910_hi;
+
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
diff --git a/vpx_dsp/arm/vpx_convolve_avg_neon.c b/vpx_dsp/arm/vpx_convolve_avg_neon.c
index 07349d03ae..8e3ee599f4 100644
--- a/vpx_dsp/arm/vpx_convolve_avg_neon.c
+++ b/vpx_dsp/arm/vpx_convolve_avg_neon.c
@@ -43,7 +43,7 @@ void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
       vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dd0), 1);
       dst += dst_stride;
       h -= 2;
-    } while (h > 0);
+    } while (h != 0);
   } else if (w == 8) {  // avg8
     uint8x8_t s0, s1, d0, d1;
     uint8x16_t s01, d01;
@@ -64,7 +64,7 @@ void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
       vst1_u8(dst, vget_high_u8(d01));
       dst += dst_stride;
       h -= 2;
-    } while (h > 0);
+    } while (h != 0);
   } else if (w < 32) {  // avg16
     uint8x16_t s0, s1, d0, d1;
     do {
@@ -83,7 +83,7 @@ void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
       vst1q_u8(dst, d1);
       dst += dst_stride;
       h -= 2;
-    } while (h > 0);
+    } while (h != 0);
   } else if (w == 32) {  // avg32
     uint8x16_t s0, s1, s2, s3, d0, d1, d2, d3;
     do {
@@ -110,7 +110,7 @@ void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
       vst1q_u8(dst + 16, d3);
       dst += dst_stride;
       h -= 2;
-    } while (h > 0);
+    } while (h != 0);
   } else {  // avg64
     uint8x16_t s0, s1, s2, s3, d0, d1, d2, d3;
     do {
diff --git a/vpx_dsp/arm/vpx_convolve_copy_neon.c b/vpx_dsp/arm/vpx_convolve_copy_neon.c
index 7abed67a40..bea7c98437 100644
--- a/vpx_dsp/arm/vpx_convolve_copy_neon.c
+++ b/vpx_dsp/arm/vpx_convolve_copy_neon.c
@@ -9,6 +9,7 @@
  */
 
 #include <arm_neon.h>
+#include <string.h>
 
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
@@ -26,14 +27,14 @@ void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride,
 
   if (w < 8) {  // copy4
     do {
-      *(uint32_t *)dst = *(const uint32_t *)src;
+      memcpy(dst, src, 4);
       src += src_stride;
       dst += dst_stride;
-      *(uint32_t *)dst = *(const uint32_t *)src;
+      memcpy(dst, src, 4);
       src += src_stride;
       dst += dst_stride;
       h -= 2;
-    } while (h > 0);
+    } while (h != 0);
   } else if (w == 8) {  // copy8
     uint8x8_t s0, s1;
     do {
@@ -47,7 +48,7 @@ void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride,
       vst1_u8(dst, s1);
       dst += dst_stride;
       h -= 2;
-    } while (h > 0);
+    } while (h != 0);
   } else if (w < 32) {  // copy16
     uint8x16_t s0, s1;
     do {
@@ -61,7 +62,7 @@ void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride,
       vst1q_u8(dst, s1);
       dst += dst_stride;
       h -= 2;
-    } while (h > 0);
+    } while (h != 0);
   } else if (w == 32) {  // copy32
     uint8x16_t s0, s1, s2, s3;
     do {
@@ -79,7 +80,7 @@ void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride,
       vst1q_u8(dst + 16, s3);
       dst += dst_stride;
       h -= 2;
-    } while (h > 0);
+    } while (h != 0);
   } else {  // copy64
     uint8x16_t s0, s1, s2, s3;
     do {
diff --git a/vpx_dsp/arm/vpx_convolve_neon.c b/vpx_dsp/arm/vpx_convolve_neon.c
index 830f3176d7..57772ea668 100644
--- a/vpx_dsp/arm/vpx_convolve_neon.c
+++ b/vpx_dsp/arm/vpx_convolve_neon.c
@@ -12,35 +12,38 @@
 
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
 #include "vpx_ports/mem.h"
 
 void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const InterpKernel *filter,
                         int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                         int w, int h) {
-  /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
+  /* Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the
    * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4).
    */
   uint8_t temp[64 * 72];
 
-  // Account for the vertical phase needing 3 lines prior and 4 lines post
-  // (+ 1 to make it divisible by 4).
-  const int intermediate_height = h + 8;
+  const int vert_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
+  /* Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior
+   * and vert_filter_taps / 2 lines post. (+1 to make total divisible by 4.) */
+  const int intermediate_height = h + vert_filter_taps;
+  const ptrdiff_t border_offset = vert_filter_taps / 2 - 1;
 
   assert(y_step_q4 == 16);
   assert(x_step_q4 == 16);
 
-  /* Filter starting 3 lines back. The neon implementation will ignore the given
-   * height and filter a multiple of 4 lines. Since this goes in to the temp
-   * buffer which has lots of extra room and is subsequently discarded this is
-   * safe if somewhat less than ideal.   */
-  vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter,
-                           x0_q4, x_step_q4, y0_q4, y_step_q4, w,
+  /* Filter starting border_offset lines back. The Neon implementation will
+   * ignore the given height and filter a multiple of 4 lines. Since this goes
+   * in to the temp buffer which has lots of extra room and is subsequently
+   * discarded this is safe if somewhat less than ideal.   */
+  vpx_convolve8_horiz_neon(src - src_stride * border_offset, src_stride, temp,
+                           w, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,
                            intermediate_height);
 
-  /* Step into the temp buffer 3 lines to get the actual frame data */
-  vpx_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4,
-                          x_step_q4, y0_q4, y_step_q4, w, h);
+  /* Step into the temp buffer border_offset lines to get actual frame data. */
+  vpx_convolve8_vert_neon(temp + w * border_offset, w, dst, dst_stride, filter,
+                          x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
 }
 
 void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
diff --git a/vpx_dsp/arm/vpx_convolve_neon_dotprod.c b/vpx_dsp/arm/vpx_convolve_neon_dotprod.c
new file mode 100644
index 0000000000..9d754fde17
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve_neon_dotprod.c
@@ -0,0 +1,66 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/vpx_convolve8_neon.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_ports/mem.h"
+
+void vpx_convolve8_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
+                                uint8_t *dst, ptrdiff_t dst_stride,
+                                const InterpKernel *filter, int x0_q4,
+                                int x_step_q4, int y0_q4, int y_step_q4, int w,
+                                int h) {
+  /* Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the
+   * maximum buffer size to 64 * (64 + 7). */
+  uint8_t temp[64 * 71];
+
+  const int vert_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
+  /* Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior
+   * and vert_filter_taps / 2 lines post. */
+  const int intermediate_height = h + vert_filter_taps - 1;
+  const ptrdiff_t border_offset = vert_filter_taps / 2 - 1;
+
+  assert(y_step_q4 == 16);
+  assert(x_step_q4 == 16);
+
+  vpx_convolve8_2d_horiz_neon_dotprod(
+      src - src_stride * border_offset, src_stride, temp, w, filter, x0_q4,
+      x_step_q4, y0_q4, y_step_q4, w, intermediate_height);
+
+  vpx_convolve8_vert_neon_dotprod(temp + w * border_offset, w, dst, dst_stride,
+                                  filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,
+                                  h);
+}
+
+void vpx_convolve8_avg_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
+                                    uint8_t *dst, ptrdiff_t dst_stride,
+                                    const InterpKernel *filter, int x0_q4,
+                                    int x_step_q4, int y0_q4, int y_step_q4,
+                                    int w, int h) {
+  uint8_t temp[64 * 71];
+
+  /* Averaging convolution always uses an 8-tap filter. */
+  /* Account for the vertical phase needing 3 lines prior and 4 lines post. */
+  const int intermediate_height = h + 7;
+
+  assert(y_step_q4 == 16);
+  assert(x_step_q4 == 16);
+
+  vpx_convolve8_2d_horiz_neon_dotprod(src - src_stride * 3, src_stride, temp, w,
+                                      filter, x0_q4, x_step_q4, y0_q4,
+                                      y_step_q4, w, intermediate_height);
+
+  vpx_convolve8_avg_vert_neon_dotprod(temp + w * 3, w, dst, dst_stride, filter,
+                                      x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
+}
diff --git a/vpx_dsp/arm/vpx_convolve_neon_i8mm.c b/vpx_dsp/arm/vpx_convolve_neon_i8mm.c
new file mode 100644
index 0000000000..d7cbb09ea6
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve_neon_i8mm.c
@@ -0,0 +1,66 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/vpx_convolve8_neon.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_ports/mem.h"
+
+void vpx_convolve8_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const InterpKernel *filter, int x0_q4,
+                             int x_step_q4, int y0_q4, int y_step_q4, int w,
+                             int h) {
+  /* Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the
+   * maximum buffer size to 64 * (64 + 7). */
+  uint8_t temp[64 * 71];
+
+  const int vert_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
+  /* Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior
+   * and vert_filter_taps / 2 lines post. */
+  const int intermediate_height = h + vert_filter_taps - 1;
+  const ptrdiff_t border_offset = vert_filter_taps / 2 - 1;
+
+  assert(y_step_q4 == 16);
+  assert(x_step_q4 == 16);
+
+  vpx_convolve8_2d_horiz_neon_i8mm(src - src_stride * border_offset, src_stride,
+                                   temp, w, filter, x0_q4, x_step_q4, y0_q4,
+                                   y_step_q4, w, intermediate_height);
+
+  vpx_convolve8_vert_neon_i8mm(temp + w * border_offset, w, dst, dst_stride,
+                               filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,
+                               h);
+}
+
+void vpx_convolve8_avg_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+                                 uint8_t *dst, ptrdiff_t dst_stride,
+                                 const InterpKernel *filter, int x0_q4,
+                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
+                                 int h) {
+  uint8_t temp[64 * 71];
+
+  /* Averaging convolution always uses an 8-tap filter. */
+  /* Account for the vertical phase needing 3 lines prior and 4 lines post. */
+  const int intermediate_height = h + 7;
+
+  assert(y_step_q4 == 16);
+  assert(x_step_q4 == 16);
+
+  vpx_convolve8_2d_horiz_neon_i8mm(src - src_stride * 3, src_stride, temp, w,
+                                   filter, x0_q4, x_step_q4, y0_q4, y_step_q4,
+                                   w, intermediate_height);
+
+  vpx_convolve8_avg_vert_neon_i8mm(temp + w * 3, w, dst, dst_stride, filter,
+                                   x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
+}
diff --git a/vpx_dsp/arm/vpx_scaled_convolve8_neon.c b/vpx_dsp/arm/vpx_scaled_convolve8_neon.c
index 8edf8a66e6..b8e3c5e540 100644
--- a/vpx_dsp/arm/vpx_scaled_convolve8_neon.c
+++ b/vpx_dsp/arm/vpx_scaled_convolve8_neon.c
@@ -15,6 +15,7 @@
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/transpose_neon.h"
 #include "vpx_dsp/arm/vpx_convolve8_neon.h"
 #include "vpx_ports/mem.h"
@@ -38,8 +39,6 @@ static INLINE void scaledconvolve_horiz_w4(
         const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
         if (x_q4 & SUBPEL_MASK) {
           const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
-          const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
-          const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
           uint8x8_t s[8], d;
           int16x8_t ss[4];
           int16x4_t t[8], tt;
@@ -61,7 +60,7 @@ static INLINE void scaledconvolve_horiz_w4(
           t[7] = vget_high_s16(ss[3]);
 
           tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7],
-                           filters, filter3, filter4);
+                           filters);
           d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
           vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0);
         } else {
@@ -167,8 +166,6 @@ static INLINE void scaledconvolve_vert_w4(
 
     if (y_q4 & SUBPEL_MASK) {
       const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
-      const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
-      const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
       uint8x8_t s[8], d;
       int16x4_t t[8], tt;
 
@@ -183,8 +180,7 @@ static INLINE void scaledconvolve_vert_w4(
       t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6])));
       t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7])));
 
-      tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters,
-                       filter3, filter4);
+      tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters);
       d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
       vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
     } else {
diff --git a/vpx_dsp/avg.c b/vpx_dsp/avg.c
index 1c45e8a73d..a8dcab7dae 100644
--- a/vpx_dsp/avg.c
+++ b/vpx_dsp/avg.c
@@ -7,6 +7,8 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+
+#include <assert.h>
 #include <stdlib.h>
 
 #include "./vpx_dsp_rtcd.h"
@@ -293,19 +295,19 @@ void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride,
     vpx_hadamard_16x16_c(src_ptr, src_stride, coeff + idx * 256);
   }
 
-  // coeff: 15 bit, dynamic range [-16320, 16320]
+  // coeff: 16 bit, dynamic range [-32768, 32767]
   for (idx = 0; idx < 256; ++idx) {
     tran_low_t a0 = coeff[0];
     tran_low_t a1 = coeff[256];
     tran_low_t a2 = coeff[512];
     tran_low_t a3 = coeff[768];
 
-    tran_low_t b0 = (a0 + a1) >> 2;  // (a0 + a1): 16 bit, [-32640, 32640]
+    tran_low_t b0 = (a0 + a1) >> 2;  // (a0 + a1): 17 bit, [-65536, 65535]
     tran_low_t b1 = (a0 - a1) >> 2;  // b0-b3: 15 bit, dynamic range
-    tran_low_t b2 = (a2 + a3) >> 2;  // [-16320, 16320]
+    tran_low_t b2 = (a2 + a3) >> 2;  // [-16384, 16383]
     tran_low_t b3 = (a2 - a3) >> 2;
 
-    coeff[0] = b0 + b2;  // 16 bit, [-32640, 32640]
+    coeff[0] = b0 + b2;  // 16 bit, [-32768, 32767]
     coeff[256] = b1 + b3;
     coeff[512] = b0 - b2;
     coeff[768] = b1 - b3;
@@ -344,6 +346,7 @@ void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref,
                        const int ref_stride, const int height) {
   int idx;
   const int norm_factor = height >> 1;
+  assert(height >= 2);
   for (idx = 0; idx < 16; ++idx) {
     int i;
     hbuf[idx] = 0;
@@ -425,7 +428,7 @@ void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
   int i, j;
   const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
   const uint16_t *d = CONVERT_TO_SHORTPTR(d8);
-  *min = 255;
+  *min = 65535;
   *max = 0;
   for (i = 0; i < 8; ++i, s += p, d += dp) {
     for (j = 0; j < 8; ++j) {
diff --git a/vpx_dsp/bitwriter.h b/vpx_dsp/bitwriter.h
index 04084af8f2..5f1ee69ec2 100644
--- a/vpx_dsp/bitwriter.h
+++ b/vpx_dsp/bitwriter.h
@@ -13,6 +13,7 @@
 
 #include <stdio.h>
 
+#include "vpx_ports/compiler_attributes.h"
 #include "vpx_ports/mem.h"
 
 #include "vpx_dsp/prob.h"
@@ -35,7 +36,9 @@ typedef struct vpx_writer {
 void vpx_start_encode(vpx_writer *br, uint8_t *source);
 void vpx_stop_encode(vpx_writer *br);
 
-static INLINE void vpx_write(vpx_writer *br, int bit, int probability) {
+static INLINE VPX_NO_UNSIGNED_SHIFT_CHECK void vpx_write(vpx_writer *br,
+                                                         int bit,
+                                                         int probability) {
   unsigned int split;
   int count = br->count;
   unsigned int range = br->range;
diff --git a/vpx_dsp/fastssim.c b/vpx_dsp/fastssim.c
index 6ab6f557e2..4d32a02a55 100644
--- a/vpx_dsp/fastssim.c
+++ b/vpx_dsp/fastssim.c
@@ -47,7 +47,7 @@ struct fs_ctx {
   unsigned *col_buf;
 };
 
-static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) {
+static int fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) {
   unsigned char *data;
   size_t data_size;
   int lw;
@@ -71,6 +71,7 @@ static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) {
     lh = (lh + 1) >> 1;
   }
   data = (unsigned char *)malloc(data_size);
+  if (!data) return -1;
   _ctx->level = (fs_level *)data;
   _ctx->nlevels = _nlevels;
   data += _nlevels * sizeof(*_ctx->level);
@@ -95,6 +96,7 @@ static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) {
     lh = (lh + 1) >> 1;
   }
   _ctx->col_buf = (unsigned *)data;
+  return 0;
 }
 
 static void fs_ctx_clear(fs_ctx *_ctx) { free(_ctx->level); }
@@ -456,7 +458,7 @@ static double calc_ssim(const uint8_t *_src, int _systride, const uint8_t *_dst,
   double ret;
   int l;
   ret = 1;
-  fs_ctx_init(&ctx, _w, _h, FS_NLEVELS);
+  if (fs_ctx_init(&ctx, _w, _h, FS_NLEVELS)) return 99.0;
   fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h, _bd,
                        _shift);
   for (l = 0; l < FS_NLEVELS - 1; l++) {
diff --git a/vpx_dsp/loongarch/avg_lsx.c b/vpx_dsp/loongarch/avg_lsx.c
new file mode 100644
index 0000000000..750c9de29f
--- /dev/null
+++ b/vpx_dsp/loongarch/avg_lsx.c
@@ -0,0 +1,90 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/bitdepth_conversion_lsx.h"
+
+void vpx_hadamard_8x8_lsx(const int16_t *src, ptrdiff_t src_stride,
+                          tran_low_t *dst) {
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  ptrdiff_t src_stride2 = src_stride << 1;
+  ptrdiff_t src_stride3 = src_stride2 + src_stride;
+  ptrdiff_t src_stride4 = src_stride2 << 1;
+  ptrdiff_t src_stride6 = src_stride3 << 1;
+
+  int16_t *src_tmp = (int16_t *)src;
+  src0 = __lsx_vld(src_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp, src_stride2, src_tmp, src_stride4, src1, src2);
+  src3 = __lsx_vldx(src_tmp, src_stride6);
+  src_tmp += src_stride4;
+  src4 = __lsx_vld(src_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp, src_stride2, src_tmp, src_stride4, src5, src6);
+  src7 = __lsx_vldx(src_tmp, src_stride6);
+
+  LSX_BUTTERFLY_8_H(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2,
+                    tmp4, tmp6, tmp7, tmp5, tmp3, tmp1);
+  LSX_BUTTERFLY_8_H(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1,
+                    src4, src5, src7, src6, src3, src2);
+  LSX_BUTTERFLY_8_H(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7,
+                    tmp3, tmp4, tmp5, tmp1, tmp6, tmp2);
+  LSX_TRANSPOSE8x8_H(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
+                     src2, src3, src4, src5, src6, src7);
+  LSX_BUTTERFLY_8_H(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2,
+                    tmp4, tmp6, tmp7, tmp5, tmp3, tmp1);
+  LSX_BUTTERFLY_8_H(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1,
+                    src4, src5, src7, src6, src3, src2);
+  LSX_BUTTERFLY_8_H(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7,
+                    tmp3, tmp4, tmp5, tmp1, tmp6, tmp2);
+  store_tran_low(tmp0, dst, 0);
+  store_tran_low(tmp1, dst, 8);
+  store_tran_low(tmp2, dst, 16);
+  store_tran_low(tmp3, dst, 24);
+  store_tran_low(tmp4, dst, 32);
+  store_tran_low(tmp5, dst, 40);
+  store_tran_low(tmp6, dst, 48);
+  store_tran_low(tmp7, dst, 56);
+}
+
+void vpx_hadamard_16x16_lsx(const int16_t *src, ptrdiff_t src_stride,
+                            tran_low_t *dst) {
+  int i;
+  __m128i a0, a1, a2, a3, b0, b1, b2, b3;
+
+  /* Rearrange 16x16 to 8x32 and remove stride.
+   * Top left first. */
+  vpx_hadamard_8x8_lsx(src + 0 + 0 * src_stride, src_stride, dst + 0);
+  /* Top right. */
+  vpx_hadamard_8x8_lsx(src + 8 + 0 * src_stride, src_stride, dst + 64);
+  /* Bottom left. */
+  vpx_hadamard_8x8_lsx(src + 0 + 8 * src_stride, src_stride, dst + 128);
+  /* Bottom right. */
+  vpx_hadamard_8x8_lsx(src + 8 + 8 * src_stride, src_stride, dst + 192);
+
+  for (i = 0; i < 64; i += 8) {
+    a0 = load_tran_low(dst);
+    a1 = load_tran_low(dst + 64);
+    a2 = load_tran_low(dst + 128);
+    a3 = load_tran_low(dst + 192);
+
+    LSX_BUTTERFLY_4_H(a0, a2, a3, a1, b0, b2, b3, b1);
+    DUP4_ARG2(__lsx_vsrai_h, b0, 1, b1, 1, b2, 1, b3, 1, b0, b1, b2, b3);
+    LSX_BUTTERFLY_4_H(b0, b1, b3, b2, a0, a1, a3, a2);
+
+    store_tran_low(a0, dst, 0);
+    store_tran_low(a1, dst, 64);
+    store_tran_low(a2, dst, 128);
+    store_tran_low(a3, dst, 192);
+
+    dst += 8;
+  }
+}
diff --git a/vpx_dsp/loongarch/avg_pred_lsx.c b/vpx_dsp/loongarch/avg_pred_lsx.c
new file mode 100644
index 0000000000..482626080a
--- /dev/null
+++ b/vpx_dsp/loongarch/avg_pred_lsx.c
@@ -0,0 +1,83 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+void vpx_comp_avg_pred_lsx(uint8_t *comp_pred, const uint8_t *pred, int width,
+                           int height, const uint8_t *ref, int ref_stride) {
+  // width > 8 || width == 8 || width == 4
+  if (width > 8) {
+    int i, j;
+    for (i = 0; i < height; ++i) {
+      for (j = 0; j < width; j += 16) {
+        __m128i p, r, avg;
+
+        p = __lsx_vld(pred + j, 0);
+        r = __lsx_vld(ref + j, 0);
+        avg = __lsx_vavgr_bu(p, r);
+        __lsx_vst(avg, comp_pred + j, 0);
+      }
+      comp_pred += width;
+      pred += width;
+      ref += ref_stride;
+    }
+  } else if (width == 8) {
+    int i = height * width;
+    do {
+      __m128i p, r, r_0, r_1;
+
+      p = __lsx_vld(pred, 0);
+      r_0 = __lsx_vld(ref, 0);
+      ref += ref_stride;
+      r_1 = __lsx_vld(ref, 0);
+      ref += ref_stride;
+      r = __lsx_vilvl_d(r_1, r_0);
+      r = __lsx_vavgr_bu(p, r);
+
+      __lsx_vst(r, comp_pred, 0);
+
+      pred += 16;
+      comp_pred += 16;
+      i -= 16;
+    } while (i);
+  } else {  // width = 4
+    int i = height * width;
+    assert(width == 4);
+    do {
+      __m128i p, r, r_0, r_1, r_2, r_3;
+      p = __lsx_vld(pred, 0);
+
+      if (width == ref_stride) {
+        r = __lsx_vld(ref, 0);
+        ref += 16;
+      } else {
+        r_0 = __lsx_vld(ref, 0);
+        ref += ref_stride;
+        r_1 = __lsx_vld(ref, 0);
+        ref += ref_stride;
+        r_2 = __lsx_vld(ref, 0);
+        ref += ref_stride;
+        r_3 = __lsx_vld(ref, 0);
+        ref += ref_stride;
+        DUP2_ARG2(__lsx_vilvl_w, r_1, r_0, r_3, r_2, r_0, r_2);
+        r = __lsx_vilvl_d(r_2, r_0);
+      }
+      r = __lsx_vavgr_bu(p, r);
+
+      __lsx_vst(r, comp_pred, 0);
+      comp_pred += 16;
+      pred += 16;
+      i -= 16;
+    } while (i);
+  }
+}
diff --git a/vpx_dsp/loongarch/bitdepth_conversion_lsx.h b/vpx_dsp/loongarch/bitdepth_conversion_lsx.h
new file mode 100644
index 0000000000..b0db1e99c5
--- /dev/null
+++ b/vpx_dsp/loongarch/bitdepth_conversion_lsx.h
@@ -0,0 +1,41 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_
+#define VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+static INLINE __m128i load_tran_low(const tran_low_t *s) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  __m128i v0_m = __lsx_vld(s, 0);
+  __m128i v1_m = __lsx_vld(s + 4, 0);
+  return __lsx_vsrlni_h_w(v0_m, v1_m, 0);
+#else
+  return __lsx_vld(s, 0);
+#endif
+}
+
+static INLINE void store_tran_low(__m128i v, tran_low_t *s, int32_t c) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  __m128i v0_m, v1_m;
+  v1_m = __lsx_vexth_w_h(v);
+  v0_m = __lsx_vsllwil_w_h(v, 0);
+  __lsx_vst(v0_m, s + c, 0);
+  __lsx_vst(v1_m, s + c + 4, 0);
+#else
+  __lsx_vst(v, s + c, 0);
+#endif
+}
+
+#endif  // VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_
diff --git a/vpx_dsp/loongarch/fwd_dct32x32_lsx.c b/vpx_dsp/loongarch/fwd_dct32x32_lsx.c
new file mode 100644
index 0000000000..9bb3877212
--- /dev/null
+++ b/vpx_dsp/loongarch/fwd_dct32x32_lsx.c
@@ -0,0 +1,1176 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/fwd_txfm_lsx.h"
+#include "vpx_dsp/fwd_txfm.h"
+
+#define UNPCK_SH_SW(in, out0, out1)  \
+  do {                               \
+    out0 = __lsx_vsllwil_w_h(in, 0); \
+    out1 = __lsx_vexth_w_h(in);      \
+  } while (0)
+
+static void fdct8x32_1d_column_load_butterfly(const int16_t *input,
+                                              int32_t src_stride,
+                                              int16_t *temp_buff) {
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i step0, step1, step2, step3;
+  __m128i in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;
+  __m128i step0_1, step1_1, step2_1, step3_1;
+
+  int32_t stride = src_stride << 1;
+  int32_t stride2 = stride << 1;
+  int32_t stride3 = stride2 + stride;
+  const int16_t *input_tmp = (int16_t *)input;
+
+  in0 = __lsx_vld(input_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1, in2);
+  in3 = __lsx_vldx(input_tmp, stride3);
+
+  input_tmp += stride2;
+  in0_1 = __lsx_vld(input_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1_1, in2_1);
+  in3_1 = __lsx_vldx(input_tmp, stride3);
+
+  input_tmp = input + (src_stride * 24);
+  in4_1 = __lsx_vld(input_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5_1, in6_1);
+  in7_1 = __lsx_vldx(input_tmp, stride3);
+
+  input_tmp += stride2;
+  in4 = __lsx_vld(input_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5, in6);
+  in7 = __lsx_vldx(input_tmp, stride3);
+
+  DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+  DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7);
+  DUP4_ARG2(__lsx_vslli_h, in0_1, 2, in1_1, 2, in2_1, 2, in3_1, 2, in0_1, in1_1,
+            in2_1, in3_1);
+  DUP4_ARG2(__lsx_vslli_h, in4_1, 2, in5_1, 2, in6_1, 2, in7_1, 2, in4_1, in5_1,
+            in6_1, in7_1);
+  LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2,
+                    step3, in4, in5, in6, in7);
+  LSX_BUTTERFLY_8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+                    step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1,
+                    in7_1);
+
+  __lsx_vst(step0, temp_buff, 0);
+  __lsx_vst(step1, temp_buff, 16);
+  __lsx_vst(step2, temp_buff, 32);
+  __lsx_vst(step3, temp_buff, 48);
+
+  __lsx_vst(in4, temp_buff, 448);
+  __lsx_vst(in5, temp_buff, 464);
+  __lsx_vst(in6, temp_buff, 480);
+  __lsx_vst(in7, temp_buff, 496);
+
+  __lsx_vst(step0_1, temp_buff, 64);
+  __lsx_vst(step1_1, temp_buff, 80);
+  __lsx_vst(step2_1, temp_buff, 96);
+  __lsx_vst(step3_1, temp_buff, 112);
+
+  __lsx_vst(in4_1, temp_buff, 384);
+  __lsx_vst(in5_1, temp_buff, 400);
+  __lsx_vst(in6_1, temp_buff, 416);
+  __lsx_vst(in7_1, temp_buff, 432);
+
+  /* 3rd and 4th set */
+  input_tmp = input + (src_stride * 8);
+  in0 = __lsx_vld(input_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1, in2);
+  in3 = __lsx_vldx(input_tmp, stride3);
+
+  input_tmp += stride2;
+  in0_1 = __lsx_vld(input_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1_1, in2_1);
+  in3_1 = __lsx_vldx(input_tmp, stride3);
+
+  input_tmp += stride2;
+  in4_1 = __lsx_vld(input_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5_1, in6_1);
+  in7_1 = __lsx_vldx(input_tmp, stride3);
+
+  input_tmp += stride2;
+  in4 = __lsx_vld(input_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5, in6);
+  in7 = __lsx_vldx(input_tmp, stride3);
+  DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+  DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7);
+  DUP4_ARG2(__lsx_vslli_h, in0_1, 2, in1_1, 2, in2_1, 2, in3_1, 2, in0_1, in1_1,
+            in2_1, in3_1);
+  DUP4_ARG2(__lsx_vslli_h, in4_1, 2, in5_1, 2, in6_1, 2, in7_1, 2, in4_1, in5_1,
+            in6_1, in7_1);
+
+  LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2,
+                    step3, in4, in5, in6, in7);
+  LSX_BUTTERFLY_8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+                    step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1,
+                    in7_1);
+
+  __lsx_vst(step0, temp_buff, 128);
+  __lsx_vst(step1, temp_buff, 144);
+  __lsx_vst(step2, temp_buff, 160);
+  __lsx_vst(step3, temp_buff, 176);
+
+  __lsx_vst(in4, temp_buff, 320);
+  __lsx_vst(in5, temp_buff, 336);
+  __lsx_vst(in6, temp_buff, 352);
+  __lsx_vst(in7, temp_buff, 368);
+
+  __lsx_vst(step0_1, temp_buff, 192);
+  __lsx_vst(step1_1, temp_buff, 208);
+  __lsx_vst(step2_1, temp_buff, 224);
+  __lsx_vst(step3_1, temp_buff, 240);
+
+  __lsx_vst(in4_1, temp_buff, 256);
+  __lsx_vst(in5_1, temp_buff, 272);
+  __lsx_vst(in6_1, temp_buff, 288);
+  __lsx_vst(in7_1, temp_buff, 304);
+}
+
+static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) {
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i temp0, temp1;
+
+  /* fdct even */
+  DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48, in0, in1, in2,
+            in3);
+  DUP4_ARG2(__lsx_vld, input, 192, input, 208, input, 224, input, 240, in12,
+            in13, in14, in15);
+  LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in12, in13, in14, in15, vec0, vec1,
+                    vec2, vec3, in12, in13, in14, in15);
+  DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 96, input, 112, in4, in5,
+            in6, in7);
+  DUP4_ARG2(__lsx_vld, input, 128, input, 144, input, 160, input, 176, in8, in9,
+            in10, in11);
+  LSX_BUTTERFLY_8_H(in4, in5, in6, in7, in8, in9, in10, in11, vec4, vec5, vec6,
+                    vec7, in8, in9, in10, in11);
+
+  /* Stage 3 */
+  DUP4_ARG2(__lsx_vadd_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0,
+            in1, in2, in3);
+  LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, in4, in1, in0);
+  DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  __lsx_vst(temp0, temp, 0);
+  __lsx_vst(temp1, temp, 1024);
+
+  DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  __lsx_vst(temp0, temp, 512);
+  __lsx_vst(temp1, temp, 1536);
+
+  DUP4_ARG2(__lsx_vsub_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7,
+            vec6, vec5, vec4);
+  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+  DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  __lsx_vst(temp0, temp, 256);
+  __lsx_vst(temp1, temp, 1792);
+
+  DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7);
+  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  __lsx_vst(temp0, temp, 1280);
+  __lsx_vst(temp1, temp, 768);
+
+  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+  DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0,
+            vec1, vec6, in2);
+  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+  DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7);
+  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  __lsx_vst(temp0, temp, 128);
+  __lsx_vst(temp1, temp, 1920);
+
+  DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2);
+  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  __lsx_vst(temp0, temp, 1152);
+  __lsx_vst(temp1, temp, 896);
+
+  DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5);
+  temp0 = __lsx_vneg_h(vec2);
+  DOTP_CONST_PAIR(temp0, vec5, cospi_24_64, cospi_8_64, in2, in1);
+  DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0,
+            vec2, vec5);
+  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  __lsx_vst(temp0, temp, 640);
+  __lsx_vst(temp1, temp, 1408);
+
+  DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4);
+  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  __lsx_vst(temp0, temp, 384);
+  __lsx_vst(temp1, temp, 1664);
+}
+
+static void fdct8x32_1d_column_odd_store(int16_t *input, int16_t *temp_ptr) {
+  __m128i in16, in17, in18, in19, in20, in21, in22, in23;
+  __m128i in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;
+  __m128i tmp0, tmp1;
+
+  DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 160, input, 176, in20, in21,
+            in26, in27);
+
+  DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
+  DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
+
+  DUP4_ARG2(__lsx_vld, input, 32, input, 48, input, 192, input, 208, in18, in19,
+            in28, in29);
+
+  vec4 = __lsx_vsub_h(in19, in20);
+  __lsx_vst(vec4, input, 64);
+  vec4 = __lsx_vsub_h(in18, in21);
+  __lsx_vst(vec4, input, 80);
+  vec4 = __lsx_vsub_h(in29, in26);
+  __lsx_vst(vec4, input, 160);
+  vec4 = __lsx_vsub_h(in28, in27);
+  __lsx_vst(vec4, input, 176);
+
+  in21 = __lsx_vadd_h(in18, in21);
+  in20 = __lsx_vadd_h(in19, in20);
+  in27 = __lsx_vadd_h(in28, in27);
+  in26 = __lsx_vadd_h(in29, in26);
+
+  DUP4_ARG2(__lsx_vld, input, 96, input, 112, input, 128, input, 144, in22,
+            in23, in24, in25);
+  DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
+  DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
+
+  DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 224, input, 240, in16, in17,
+            in30, in31);
+
+  vec4 = __lsx_vsub_h(in17, in22);
+  __lsx_vst(vec4, input, 32);
+  vec4 = __lsx_vsub_h(in16, in23);
+  __lsx_vst(vec4, input, 48);
+  vec4 = __lsx_vsub_h(in31, in24);
+  __lsx_vst(vec4, input, 192);
+  vec4 = __lsx_vsub_h(in30, in25);
+  __lsx_vst(vec4, input, 208);
+
+  DUP4_ARG2(__lsx_vadd_h, in16, in23, in17, in22, in30, in25, in31, in24, in16,
+            in17, in30, in31);
+  DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
+  DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
+  DUP4_ARG2(__lsx_vadd_h, in16, in19, in17, in18, in30, in29, in31, in28, in27,
+            in22, in21, in25);
+  DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
+  DUP2_ARG2(__lsx_vadd_h, in27, in26, in25, in24, in23, in20);
+  DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  __lsx_vst(vec5, temp_ptr, 0);
+  __lsx_vst(vec4, temp_ptr, 1920);
+
+  DUP2_ARG2(__lsx_vsub_h, in27, in26, in25, in24, in22, in21);
+  DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  __lsx_vst(vec5, temp_ptr, 896);
+  __lsx_vst(vec4, temp_ptr, 1024);
+
+  DUP4_ARG2(__lsx_vsub_h, in17, in18, in16, in19, in31, in28, in30, in29, in23,
+            in26, in24, in20);
+  tmp0 = __lsx_vneg_h(in23);
+  DOTP_CONST_PAIR(tmp0, in20, cospi_28_64, cospi_4_64, in27, in25);
+  DUP2_ARG2(__lsx_vsub_h, in26, in27, in24, in25, in23, in20);
+  DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  __lsx_vst(vec4, temp_ptr, 1408);
+  __lsx_vst(vec5, temp_ptr, 512);
+
+  DUP2_ARG2(__lsx_vadd_h, in26, in27, in24, in25, in22, in21);
+  DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  __lsx_vst(vec4, temp_ptr, 384);
+  __lsx_vst(vec5, temp_ptr, 1536);
+
+  DUP4_ARG2(__lsx_vld, input, 32, input, 48, input, 64, input, 80, in22, in23,
+            in20, in21);
+  DUP4_ARG2(__lsx_vld, input, 160, input, 176, input, 192, input, 208, in26,
+            in27, in24, in25);
+  in16 = in20;
+  in17 = in21;
+  DUP2_ARG1(__lsx_vneg_h, in16, in17, tmp0, tmp1);
+  DOTP_CONST_PAIR(tmp0, in27, cospi_24_64, cospi_8_64, in20, in27);
+  DOTP_CONST_PAIR(tmp1, in26, cospi_24_64, cospi_8_64, in21, in26);
+  DUP4_ARG2(__lsx_vsub_h, in23, in20, in22, in21, in25, in26, in24, in27, in28,
+            in17, in18, in31);
+  DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
+  DUP2_ARG2(__lsx_vadd_h, in28, in29, in31, in30, in16, in19);
+  DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  __lsx_vst(vec5, temp_ptr, 1664);
+  __lsx_vst(vec4, temp_ptr, 256);
+
+  DUP2_ARG2(__lsx_vsub_h, in28, in29, in31, in30, in17, in18);
+  DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  __lsx_vst(vec5, temp_ptr, 640);
+  __lsx_vst(vec4, temp_ptr, 1280);
+
+  DUP4_ARG2(__lsx_vadd_h, in22, in21, in23, in20, in24, in27, in25, in26, in16,
+            in29, in30, in19);
+  tmp0 = __lsx_vneg_h(in16);
+  DOTP_CONST_PAIR(tmp0, in19, cospi_12_64, cospi_20_64, in28, in31);
+  DUP2_ARG2(__lsx_vsub_h, in29, in28, in30, in31, in16, in19);
+  DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  __lsx_vst(vec5, temp_ptr, 1152);
+  __lsx_vst(vec4, temp_ptr, 768);
+
+  DUP2_ARG2(__lsx_vadd_h, in29, in28, in30, in31, in17, in18);
+  DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  __lsx_vst(vec5, temp_ptr, 128);
+  __lsx_vst(vec4, temp_ptr, 1792);
+}
+
+static void fdct8x32_1d_column(const int16_t *input, int32_t src_stride,
+                               int16_t *tmp_buf, int16_t *tmp_buf_big) {
+  fdct8x32_1d_column_load_butterfly(input, src_stride, tmp_buf);
+  fdct8x32_1d_column_even_store(tmp_buf, tmp_buf_big);
+  fdct8x32_1d_column_odd_store(tmp_buf + 128, (tmp_buf_big + 32));
+}
+
+static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff,
+                                           int16_t *output) {
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+  __m128i step0, step1, step2, step3, step4, step5, step6, step7;
+
+  DUP4_ARG2(__lsx_vld, temp_buff, 0, temp_buff, 64, temp_buff, 128, temp_buff,
+            192, in0, in1, in2, in3);
+  DUP4_ARG2(__lsx_vld, temp_buff, 256, temp_buff, 320, temp_buff, 384,
+            temp_buff, 448, in4, in5, in6, in7);
+  DUP4_ARG2(__lsx_vld, temp_buff, 48, temp_buff, 112, temp_buff, 176, temp_buff,
+            240, in8, in9, in10, in11);
+  DUP4_ARG2(__lsx_vld, temp_buff, 304, temp_buff, 368, temp_buff, 432,
+            temp_buff, 496, in12, in13, in14, in15);
+  LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+                     in4, in5, in6, in7);
+  LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
+                     in10, in11, in12, in13, in14, in15);
+  LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
+                     in11, in12, in13, in14, in15, step0, step1, step2, step3,
+                     step4, step5, step6, step7, in8, in9, in10, in11, in12,
+                     in13, in14, in15);
+
+  __lsx_vst(step0, output, 0);
+  __lsx_vst(step1, output, 16);
+  __lsx_vst(step2, output, 32);
+  __lsx_vst(step3, output, 48);
+  __lsx_vst(step4, output, 64);
+  __lsx_vst(step5, output, 80);
+  __lsx_vst(step6, output, 96);
+  __lsx_vst(step7, output, 112);
+
+  __lsx_vst(in8, output, 384);
+  __lsx_vst(in9, output, 400);
+  __lsx_vst(in10, output, 416);
+  __lsx_vst(in11, output, 432);
+  __lsx_vst(in12, output, 448);
+  __lsx_vst(in13, output, 464);
+  __lsx_vst(in14, output, 480);
+  __lsx_vst(in15, output, 496);
+
+  /* 2nd set */
+  DUP4_ARG2(__lsx_vld, temp_buff, 16, temp_buff, 80, temp_buff, 144, temp_buff,
+            208, in0, in1, in2, in3);
+  DUP4_ARG2(__lsx_vld, temp_buff, 272, temp_buff, 336, temp_buff, 400,
+            temp_buff, 464, in4, in5, in6, in7);
+  DUP4_ARG2(__lsx_vld, temp_buff, 32, temp_buff, 96, temp_buff, 160, temp_buff,
+            224, in8, in9, in10, in11);
+  DUP4_ARG2(__lsx_vld, temp_buff, 288, temp_buff, 352, temp_buff, 416,
+            temp_buff, 480, in12, in13, in14, in15);
+  LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+                     in4, in5, in6, in7);
+  LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
+                     in10, in11, in12, in13, in14, in15);
+  LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
+                     in11, in12, in13, in14, in15, step0, step1, step2, step3,
+                     step4, step5, step6, step7, in8, in9, in10, in11, in12,
+                     in13, in14, in15);
+
+  __lsx_vst(step0, output, 128);
+  __lsx_vst(step1, output, 144);
+  __lsx_vst(step2, output, 160);
+  __lsx_vst(step3, output, 176);
+  __lsx_vst(step4, output, 192);
+  __lsx_vst(step5, output, 208);
+  __lsx_vst(step6, output, 224);
+  __lsx_vst(step7, output, 240);
+
+  __lsx_vst(in8, output, 256);
+  __lsx_vst(in9, output, 272);
+  __lsx_vst(in10, output, 288);
+  __lsx_vst(in11, output, 304);
+  __lsx_vst(in12, output, 320);
+  __lsx_vst(in13, output, 336);
+  __lsx_vst(in14, output, 352);
+  __lsx_vst(in15, output, 368);
+}
+
+static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr,
+                                    int16_t *out) {
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l;
+  __m128i vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r;
+  __m128i tmp0_w, tmp1_w, tmp2_w, tmp3_w;
+
+  /* fdct32 even */
+  /* stage 2 */
+  DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48, in0, in1, in2,
+            in3);
+  DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 96, input, 112, in4, in5,
+            in6, in7);
+  DUP4_ARG2(__lsx_vld, input, 128, input, 144, input, 160, input, 176, in8, in9,
+            in10, in11);
+  DUP4_ARG2(__lsx_vld, input, 192, input, 208, input, 224, input, 240, in12,
+            in13, in14, in15);
+
+  LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
+                     in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4,
+                     vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14,
+                     in15);
+
+  __lsx_vst(vec0, interm_ptr, 0);
+  __lsx_vst(vec1, interm_ptr, 16);
+  __lsx_vst(vec2, interm_ptr, 32);
+  __lsx_vst(vec3, interm_ptr, 48);
+  __lsx_vst(vec4, interm_ptr, 64);
+  __lsx_vst(vec5, interm_ptr, 80);
+  __lsx_vst(vec6, interm_ptr, 96);
+  __lsx_vst(vec7, interm_ptr, 112);
+
+  __lsx_vst(in8, interm_ptr, 128);
+  __lsx_vst(in9, interm_ptr, 144);
+  __lsx_vst(in10, interm_ptr, 160);
+  __lsx_vst(in11, interm_ptr, 176);
+  __lsx_vst(in12, interm_ptr, 192);
+  __lsx_vst(in13, interm_ptr, 208);
+  __lsx_vst(in14, interm_ptr, 224);
+  __lsx_vst(in15, interm_ptr, 240);
+
+  /* Stage 3 */
+  UNPCK_SH_SW(vec0, vec0_l, vec0_r);
+  UNPCK_SH_SW(vec1, vec1_l, vec1_r);
+  UNPCK_SH_SW(vec2, vec2_l, vec2_r);
+  UNPCK_SH_SW(vec3, vec3_l, vec3_r);
+  UNPCK_SH_SW(vec4, vec4_l, vec4_r);
+  UNPCK_SH_SW(vec5, vec5_l, vec5_r);
+  UNPCK_SH_SW(vec6, vec6_l, vec6_r);
+  UNPCK_SH_SW(vec7, vec7_l, vec7_r);
+  DUP4_ARG2(__lsx_vadd_w, vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r,
+            vec3_r, vec4_r, tmp0_w, tmp1_w, tmp2_w, tmp3_w);
+  LSX_BUTTERFLY_4_W(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r,
+                    vec5_r);
+  DUP4_ARG2(__lsx_vadd_w, vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l,
+            vec3_l, vec4_l, vec0_r, vec1_r, vec2_r, vec3_r);
+
+  tmp3_w = __lsx_vadd_w(vec0_r, vec3_r);
+  vec0_r = __lsx_vsub_w(vec0_r, vec3_r);
+  vec3_r = __lsx_vadd_w(vec1_r, vec2_r);
+  vec1_r = __lsx_vsub_w(vec1_r, vec2_r);
+
+  DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, cospi_16_64,
+                    vec4_r, tmp3_w, vec6_r, vec3_r);
+  FDCT32_POSTPROC_NEG_W(vec4_r);
+  FDCT32_POSTPROC_NEG_W(tmp3_w);
+  FDCT32_POSTPROC_NEG_W(vec6_r);
+  FDCT32_POSTPROC_NEG_W(vec3_r);
+  DUP2_ARG2(__lsx_vpickev_h, vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
+  __lsx_vst(vec5, out, 0);
+  __lsx_vst(vec4, out, 16);
+
+  DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, cospi_8_64,
+                    vec4_r, tmp3_w, vec6_r, vec3_r);
+  FDCT32_POSTPROC_NEG_W(vec4_r);
+  FDCT32_POSTPROC_NEG_W(tmp3_w);
+  FDCT32_POSTPROC_NEG_W(vec6_r);
+  FDCT32_POSTPROC_NEG_W(vec3_r);
+  DUP2_ARG2(__lsx_vpickev_h, vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
+  __lsx_vst(vec5, out, 32);
+  __lsx_vst(vec4, out, 48);
+
+  DUP4_ARG2(__lsx_vld, interm_ptr, 0, interm_ptr, 16, interm_ptr, 32,
+            interm_ptr, 48, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vld, interm_ptr, 64, interm_ptr, 80, interm_ptr, 96,
+            interm_ptr, 112, vec4, vec5, vec6, vec7);
+  DUP4_ARG2(__lsx_vsub_h, vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4,
+            vec5, vec6, vec7);
+  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+  DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4);
+  FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  __lsx_vst(in4, out, 64);
+  __lsx_vst(in5, out, 112);
+
+  DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7);
+  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4);
+  FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  __lsx_vst(in4, out, 80);
+  __lsx_vst(in5, out, 96);
+
+  DUP4_ARG2(__lsx_vld, interm_ptr, 128, interm_ptr, 144, interm_ptr, 160,
+            interm_ptr, 176, in8, in9, in10, in11);
+  DUP4_ARG2(__lsx_vld, interm_ptr, 192, interm_ptr, 208, interm_ptr, 224,
+            interm_ptr, 240, in12, in13, in14, in15);
+  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+  DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0,
+            vec1, vec6, in2);
+  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+  DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7);
+  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4);
+  FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  __lsx_vst(in4, out, 128);
+  __lsx_vst(in5, out, 240);
+
+  DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2);
+  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4);
+  FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  __lsx_vst(in4, out, 144);
+  __lsx_vst(in5, out, 224);
+
+  DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5);
+  tmp0_w = __lsx_vneg_h(vec2);
+  DOTP_CONST_PAIR(tmp0_w, vec5, cospi_24_64, cospi_8_64, in2, in1);
+  DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0,
+            vec2, vec5);
+  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4);
+  FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  __lsx_vst(in4, out, 160);
+  __lsx_vst(in5, out, 208);
+
+  DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4);
+  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5);
+  FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  __lsx_vst(in4, out, 192);
+  __lsx_vst(in5, out, 176);
+}
+
+static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) {
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;
+
+  /* fdct32 even */
+  /* stage 2 */
+  DUP4_ARG2(__lsx_vld, temp, 0, temp, 16, temp, 32, temp, 48, in0, in1, in2,
+            in3);
+  DUP4_ARG2(__lsx_vld, temp, 64, temp, 80, temp, 96, temp, 112, in4, in5, in6,
+            in7);
+  DUP4_ARG2(__lsx_vld, temp, 128, temp, 144, temp, 160, temp, 176, in8, in9,
+            in10, in11);
+  DUP4_ARG2(__lsx_vld, temp, 192, temp, 208, temp, 224, temp, 240, in12, in13,
+            in14, in15);
+
+  LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
+                     in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4,
+                     vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14,
+                     in15);
+  /* Stage 3 */
+  DUP4_ARG2(__lsx_vadd_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0,
+            in1, in2, in3);
+  LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, in4, in1, in0);
+  DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  __lsx_vst(temp0, out, 0);
+  __lsx_vst(temp1, out, 16);
+
+  DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  __lsx_vst(temp0, out, 32);
+  __lsx_vst(temp1, out, 48);
+
+  DUP4_ARG2(__lsx_vsub_h, vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4,
+            vec5, vec6, vec7);
+  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+  DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  __lsx_vst(temp0, out, 64);
+  __lsx_vst(temp1, out, 112);
+
+  DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7);
+  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  __lsx_vst(temp0, out, 80);
+  __lsx_vst(temp1, out, 96);
+
+  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+  DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0,
+            vec1, vec6, in2);
+  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+  DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7);
+  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  __lsx_vst(temp0, out, 128);
+  __lsx_vst(temp1, out, 240);
+
+  DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2);
+  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  __lsx_vst(temp0, out, 144);
+  __lsx_vst(temp1, out, 224);
+
+  DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5);
+  temp0 = __lsx_vneg_h(vec2);
+  DOTP_CONST_PAIR(temp0, vec5, cospi_24_64, cospi_8_64, in2, in1);
+  DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0,
+            vec2, vec5)
+  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  __lsx_vst(temp0, out, 160);
+  __lsx_vst(temp1, out, 208);
+
+  DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4);
+  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  __lsx_vst(temp0, out, 192);
+  __lsx_vst(temp1, out, 176);
+}
+
+static void fdct8x32_1d_row_odd(int16_t *temp, int16_t *interm_ptr,
+                                int16_t *out) {
+  __m128i in16, in17, in18, in19, in20, in21, in22, in23;
+  __m128i in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;
+  __m128i tmp0, tmp1;
+
+  in20 = __lsx_vld(temp, 64);
+  in21 = __lsx_vld(temp, 80);
+  in26 = __lsx_vld(temp, 160);
+  in27 = __lsx_vld(temp, 176);
+
+  DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
+  DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
+
+  in18 = __lsx_vld(temp, 32);
+  in19 = __lsx_vld(temp, 48);
+  in28 = __lsx_vld(temp, 192);
+  in29 = __lsx_vld(temp, 208);
+
+  vec4 = __lsx_vsub_h(in19, in20);
+  __lsx_vst(vec4, interm_ptr, 64);
+  vec4 = __lsx_vsub_h(in18, in21);
+  __lsx_vst(vec4, interm_ptr, 176);
+  vec4 = __lsx_vsub_h(in28, in27);
+  __lsx_vst(vec4, interm_ptr, 112);
+  vec4 = __lsx_vsub_h(in29, in26);
+  __lsx_vst(vec4, interm_ptr, 128);
+
+  DUP4_ARG2(__lsx_vadd_h, in18, in21, in19, in20, in28, in27, in29, in26, in21,
+            in20, in27, in26);
+
+  in22 = __lsx_vld(temp, 96);
+  in23 = __lsx_vld(temp, 112);
+  in24 = __lsx_vld(temp, 128);
+  in25 = __lsx_vld(temp, 144);
+
+  DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
+  DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
+
+  in16 = __lsx_vld(temp, 0);
+  in17 = __lsx_vld(temp, 16);
+  in30 = __lsx_vld(temp, 224);
+  in31 = __lsx_vld(temp, 240);
+
+  vec4 = __lsx_vsub_h(in17, in22);
+  __lsx_vst(vec4, interm_ptr, 80);
+  vec4 = __lsx_vsub_h(in30, in25);
+  __lsx_vst(vec4, interm_ptr, 96);
+  vec4 = __lsx_vsub_h(in31, in24);
+  __lsx_vst(vec4, interm_ptr, 144);
+  vec4 = __lsx_vsub_h(in16, in23);
+  __lsx_vst(vec4, interm_ptr, 160);
+
+  DUP4_ARG2(__lsx_vadd_h, in16, in23, in17, in22, in30, in25, in31, in24, in16,
+            in17, in30, in31);
+  DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
+  DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
+
+  DUP4_ARG2(__lsx_vadd_h, in16, in19, in17, in18, in30, in29, in31, in28, in27,
+            in22, in21, in25);
+  DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
+  DUP2_ARG2(__lsx_vadd_h, in27, in26, in25, in24, in23, in20);
+
+  DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  __lsx_vst(vec5, out, 0);
+  __lsx_vst(vec4, out, 240);
+
+  DUP2_ARG2(__lsx_vsub_h, in27, in26, in25, in24, in22, in21);
+
+  DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  __lsx_vst(vec5, out, 224);
+  __lsx_vst(vec4, out, 16);
+
+  DUP4_ARG2(__lsx_vsub_h, in17, in18, in16, in19, in31, in28, in30, in29, in23,
+            in26, in24, in20);
+  tmp0 = __lsx_vneg_h(in23);
+  DOTP_CONST_PAIR(tmp0, in20, cospi_28_64, cospi_4_64, in27, in25);
+  DUP2_ARG2(__lsx_vsub_h, in26, in27, in24, in25, in23, in20);
+
+  DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  __lsx_vst(vec4, out, 32);
+  __lsx_vst(vec5, out, 208);
+
+  DUP2_ARG2(__lsx_vadd_h, in26, in27, in24, in25, in22, in21);
+  DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  __lsx_vst(vec4, out, 48);
+  __lsx_vst(vec5, out, 192);
+
+  in20 = __lsx_vld(interm_ptr, 64);
+  in21 = __lsx_vld(interm_ptr, 176);
+  in27 = __lsx_vld(interm_ptr, 112);
+  in26 = __lsx_vld(interm_ptr, 128);
+
+  in16 = in20;
+  in17 = in21;
+  DUP2_ARG1(__lsx_vneg_h, in16, in17, tmp0, tmp1);
+  DOTP_CONST_PAIR(tmp0, in27, cospi_24_64, cospi_8_64, in20, in27);
+  DOTP_CONST_PAIR(tmp1, in26, cospi_24_64, cospi_8_64, in21, in26);
+
+  in22 = __lsx_vld(interm_ptr, 80);
+  in25 = __lsx_vld(interm_ptr, 96);
+  in24 = __lsx_vld(interm_ptr, 144);
+  in23 = __lsx_vld(interm_ptr, 160);
+
+  DUP4_ARG2(__lsx_vsub_h, in23, in20, in22, in21, in25, in26, in24, in27, in28,
+            in17, in18, in31);
+  DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
+  DUP2_ARG2(__lsx_vadd_h, in28, in29, in31, in30, in16, in19);
+  DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  __lsx_vst(vec5, out, 64);
+  __lsx_vst(vec4, out, 176);
+
+  DUP2_ARG2(__lsx_vsub_h, in28, in29, in31, in30, in17, in18);
+  DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  __lsx_vst(vec5, out, 80);
+  __lsx_vst(vec4, out, 160);
+
+  DUP4_ARG2(__lsx_vadd_h, in22, in21, in23, in20, in24, in27, in25, in26, in16,
+            in29, in30, in19);
+  tmp0 = __lsx_vneg_h(in16);
+  DOTP_CONST_PAIR(tmp0, in19, cospi_12_64, cospi_20_64, in28, in31);
+  DUP2_ARG2(__lsx_vsub_h, in29, in28, in30, in31, in16, in19);
+
+  DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  __lsx_vst(vec5, out, 144);
+  __lsx_vst(vec4, out, 96);
+
+  DUP2_ARG2(__lsx_vadd_h, in29, in28, in30, in31, in17, in18);
+
+  DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  __lsx_vst(vec4, out, 112);
+  __lsx_vst(vec5, out, 128);
+}
+
+static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) {
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;
+
+  /* 1st set */
+  in0 = __lsx_vld(temp, 0);
+  in4 = __lsx_vld(temp, 64);
+  in2 = __lsx_vld(temp, 128);
+  in6 = __lsx_vld(temp, 192);
+  in1 = __lsx_vld(temp, 256);
+  in7 = __lsx_vld(temp, 304);
+  in3 = __lsx_vld(temp, 384);
+  in5 = __lsx_vld(temp, 432);
+
+  LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+                     in4, in5, in6, in7);
+
+  /* 2nd set */
+  in0_1 = __lsx_vld(temp, 32);
+  in1_1 = __lsx_vld(temp, 464);
+  in2_1 = __lsx_vld(temp, 160);
+  in3_1 = __lsx_vld(temp, 336);
+  in4_1 = __lsx_vld(temp, 96);
+  in5_1 = __lsx_vld(temp, 352);
+  in6_1 = __lsx_vld(temp, 224);
+  in7_1 = __lsx_vld(temp, 480);
+
+  __lsx_vst(in0, output, 0);
+  __lsx_vst(in1, output, 64);
+  __lsx_vst(in2, output, 128);
+  __lsx_vst(in3, output, 192);
+  __lsx_vst(in4, output, 256);
+  __lsx_vst(in5, output, 320);
+  __lsx_vst(in6, output, 384);
+  __lsx_vst(in7, output, 448);
+
+  LSX_TRANSPOSE8x8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+                     in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);
+
+  /* 3rd set */
+  in0 = __lsx_vld(temp, 16);
+  in1 = __lsx_vld(temp, 272);
+  in2 = __lsx_vld(temp, 144);
+  in3 = __lsx_vld(temp, 400);
+  in4 = __lsx_vld(temp, 80);
+  in5 = __lsx_vld(temp, 416);
+  in6 = __lsx_vld(temp, 208);
+  in7 = __lsx_vld(temp, 288);
+
+  __lsx_vst(in0_1, output, 16);
+  __lsx_vst(in1_1, output, 80);
+  __lsx_vst(in2_1, output, 144);
+  __lsx_vst(in3_1, output, 208);
+  __lsx_vst(in4_1, output, 272);
+  __lsx_vst(in5_1, output, 336);
+  __lsx_vst(in6_1, output, 400);
+  __lsx_vst(in7_1, output, 464);
+
+  LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+                     in4, in5, in6, in7);
+
+  __lsx_vst(in0, output, 32);
+  __lsx_vst(in1, output, 96);
+  __lsx_vst(in2, output, 160);
+  __lsx_vst(in3, output, 224);
+  __lsx_vst(in4, output, 288);
+  __lsx_vst(in5, output, 352);
+  __lsx_vst(in6, output, 416);
+  __lsx_vst(in7, output, 480);
+
+  /* 4th set */
+  in0_1 = __lsx_vld(temp, 48);
+  in1_1 = __lsx_vld(temp, 448);
+  in2_1 = __lsx_vld(temp, 176);
+  in3_1 = __lsx_vld(temp, 320);
+  in4_1 = __lsx_vld(temp, 112);
+  in5_1 = __lsx_vld(temp, 368);
+  in6_1 = __lsx_vld(temp, 240);
+  in7_1 = __lsx_vld(temp, 496);
+
+  LSX_TRANSPOSE8x8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+                     in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);
+
+  __lsx_vst(in0_1, output, 48);
+  __lsx_vst(in1_1, output, 112);
+  __lsx_vst(in2_1, output, 176);
+  __lsx_vst(in3_1, output, 240);
+  __lsx_vst(in4_1, output, 304);
+  __lsx_vst(in5_1, output, 368);
+  __lsx_vst(in6_1, output, 432);
+  __lsx_vst(in7_1, output, 496);
+}
+
+static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf, int16_t *output) {
+  fdct8x32_1d_row_load_butterfly(temp, temp_buf);
+  fdct8x32_1d_row_even(temp_buf, temp_buf);
+  fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128);
+  fdct8x32_1d_row_transpose_store(temp_buf, output);
+}
+
+static void fdct32x8_1d_row_4x(int16_t *tmp_buf_big, int16_t *tmp_buf,
+                               int16_t *output) {
+  fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf);
+  fdct8x32_1d_row_even_4x(tmp_buf, tmp_buf_big, tmp_buf);
+  fdct8x32_1d_row_odd(tmp_buf + 128, tmp_buf_big, tmp_buf + 128);
+  fdct8x32_1d_row_transpose_store(tmp_buf, output);
+}
+
+void vpx_fdct32x32_lsx(const int16_t *input, int16_t *output,
+                       int32_t src_stride) {
+  int i;
+  DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
+  DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);
+
+  /* column transform */
+  for (i = 0; i < 4; ++i) {
+    fdct8x32_1d_column(input + (8 * i), src_stride, tmp_buf,
+                       tmp_buf_big + (8 * i));
+  }
+
+  /* row transform */
+  fdct32x8_1d_row_4x(tmp_buf_big, tmp_buf, output);
+
+  /* row transform */
+  for (i = 1; i < 4; ++i) {
+    fdct32x8_1d_row(tmp_buf_big + (i * 256), tmp_buf, output + (i * 256));
+  }
+}
+
+static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) {
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;
+
+  /* fdct32 even */
+  /* stage 2 */
+  DUP4_ARG2(__lsx_vld, temp, 0, temp, 16, temp, 32, temp, 48, in0, in1, in2,
+            in3);
+  DUP4_ARG2(__lsx_vld, temp, 64, temp, 80, temp, 96, temp, 112, in4, in5, in6,
+            in7);
+  DUP4_ARG2(__lsx_vld, temp, 128, temp, 144, temp, 160, temp, 176, in8, in9,
+            in10, in11);
+  DUP4_ARG2(__lsx_vld, temp, 192, temp, 208, temp, 224, temp, 240, in12, in13,
+            in14, in15);
+  LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
+                     in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4,
+                     vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14,
+                     in15);
+
+  FDCT_POSTPROC_2V_NEG_H(vec0, vec1);
+  FDCT_POSTPROC_2V_NEG_H(vec2, vec3);
+  FDCT_POSTPROC_2V_NEG_H(vec4, vec5);
+  FDCT_POSTPROC_2V_NEG_H(vec6, vec7);
+  FDCT_POSTPROC_2V_NEG_H(in8, in9);
+  FDCT_POSTPROC_2V_NEG_H(in10, in11);
+  FDCT_POSTPROC_2V_NEG_H(in12, in13);
+  FDCT_POSTPROC_2V_NEG_H(in14, in15);
+
+  /* Stage 3 */
+  DUP4_ARG2(__lsx_vadd_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0,
+            in1, in2, in3);
+
+  temp0 = __lsx_vadd_h(in0, in3);
+  in0 = __lsx_vsub_h(in0, in3);
+  in3 = __lsx_vadd_h(in1, in2);
+  in1 = __lsx_vsub_h(in1, in2);
+
+  DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0);
+  __lsx_vst(temp0, out, 0);
+  __lsx_vst(temp1, out, 16);
+
+  DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
+  __lsx_vst(temp0, out, 32);
+  __lsx_vst(temp1, out, 48);
+
+  DUP4_ARG2(__lsx_vsub_h, vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4,
+            vec5, vec6, vec7);
+  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+  DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
+  __lsx_vst(temp0, out, 64);
+  __lsx_vst(temp1, out, 112);
+
+  DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7);
+  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
+  __lsx_vst(temp0, out, 80);
+  __lsx_vst(temp1, out, 96);
+
+  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+  DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0,
+            vec1, vec6, in2);
+  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+  DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7);
+  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
+  __lsx_vst(temp0, out, 128);
+  __lsx_vst(temp1, out, 240);
+
+  DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2);
+  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
+  __lsx_vst(temp0, out, 144);
+  __lsx_vst(temp1, out, 224);
+
+  DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5);
+  temp0 = __lsx_vneg_h(vec2);
+  DOTP_CONST_PAIR(temp0, vec5, cospi_24_64, cospi_8_64, in2, in1);
+  DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0,
+            vec2, vec5);
+  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
+  __lsx_vst(temp0, out, 160);
+  __lsx_vst(temp1, out, 208);
+
+  DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4);
+  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
+  __lsx_vst(temp0, out, 192);
+  __lsx_vst(temp1, out, 176);
+}
+
+static void fdct8x32_1d_row_odd_rd(int16_t *temp, int16_t *interm_ptr,
+                                   int16_t *out) {
+  __m128i in16, in17, in18, in19, in20, in21, in22, in23;
+  __m128i in24, in25, in26, in27, in28, in29, in30, in31;
+  __m128i vec4, vec5, tmp0, tmp1;
+
+  in20 = __lsx_vld(temp, 64);
+  in21 = __lsx_vld(temp, 80);
+  in26 = __lsx_vld(temp, 160);
+  in27 = __lsx_vld(temp, 176);
+
+  DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
+  DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
+
+  FDCT_POSTPROC_2V_NEG_H(in20, in21);
+  FDCT_POSTPROC_2V_NEG_H(in26, in27);
+
+  in18 = __lsx_vld(temp, 32);
+  in19 = __lsx_vld(temp, 48);
+  in28 = __lsx_vld(temp, 192);
+  in29 = __lsx_vld(temp, 208);
+
+  FDCT_POSTPROC_2V_NEG_H(in18, in19);
+  FDCT_POSTPROC_2V_NEG_H(in28, in29);
+
+  vec4 = __lsx_vsub_h(in19, in20);
+  __lsx_vst(vec4, interm_ptr, 64);
+  vec4 = __lsx_vsub_h(in18, in21);
+  __lsx_vst(vec4, interm_ptr, 176);
+  vec4 = __lsx_vsub_h(in29, in26);
+  __lsx_vst(vec4, interm_ptr, 128);
+  vec4 = __lsx_vsub_h(in28, in27);
+  __lsx_vst(vec4, interm_ptr, 112);
+
+  DUP4_ARG2(__lsx_vadd_h, in18, in21, in19, in20, in28, in27, in29, in26, in21,
+            in20, in27, in26);
+
+  in22 = __lsx_vld(temp, 96);
+  in23 = __lsx_vld(temp, 112);
+  in24 = __lsx_vld(temp, 128);
+  in25 = __lsx_vld(temp, 144);
+
+  DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
+  DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
+  FDCT_POSTPROC_2V_NEG_H(in22, in23);
+  FDCT_POSTPROC_2V_NEG_H(in24, in25);
+
+  in16 = __lsx_vld(temp, 0);
+  in17 = __lsx_vld(temp, 16);
+  in30 = __lsx_vld(temp, 224);
+  in31 = __lsx_vld(temp, 240);
+
+  FDCT_POSTPROC_2V_NEG_H(in16, in17);
+  FDCT_POSTPROC_2V_NEG_H(in30, in31);
+
+  vec4 = __lsx_vsub_h(in17, in22);
+  __lsx_vst(vec4, interm_ptr, 80);
+  vec4 = __lsx_vsub_h(in30, in25);
+  __lsx_vst(vec4, interm_ptr, 96);
+  vec4 = __lsx_vsub_h(in31, in24);
+  __lsx_vst(vec4, interm_ptr, 144);
+  vec4 = __lsx_vsub_h(in16, in23);
+  __lsx_vst(vec4, interm_ptr, 160);
+
+  DUP4_ARG2(__lsx_vadd_h, in16, in23, in17, in22, in30, in25, in31, in24, in16,
+            in17, in30, in31);
+  DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
+  DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
+  DUP4_ARG2(__lsx_vadd_h, in16, in19, in17, in18, in30, in29, in31, in28, in27,
+            in22, in21, in25);
+  DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
+  DUP2_ARG2(__lsx_vadd_h, in27, in26, in25, in24, in23, in20);
+  DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
+  __lsx_vst(vec5, out, 0);
+  __lsx_vst(vec4, out, 240);
+
+  DUP2_ARG2(__lsx_vsub_h, in27, in26, in25, in24, in22, in21);
+  DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
+  __lsx_vst(vec5, out, 224);
+  __lsx_vst(vec4, out, 16);
+
+  DUP4_ARG2(__lsx_vsub_h, in17, in18, in16, in19, in31, in28, in30, in29, in23,
+            in26, in24, in20);
+  tmp0 = __lsx_vneg_h(in23);
+  DOTP_CONST_PAIR(tmp0, in20, cospi_28_64, cospi_4_64, in27, in25);
+  DUP2_ARG2(__lsx_vsub_h, in26, in27, in24, in25, in23, in20);
+  DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
+  __lsx_vst(vec4, out, 32);
+  __lsx_vst(vec5, out, 208);
+
+  DUP2_ARG2(__lsx_vadd_h, in26, in27, in24, in25, in22, in21);
+  DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
+  __lsx_vst(vec4, out, 48);
+  __lsx_vst(vec5, out, 192);
+
+  in20 = __lsx_vld(interm_ptr, 64);
+  in21 = __lsx_vld(interm_ptr, 176);
+  in27 = __lsx_vld(interm_ptr, 112);
+  in26 = __lsx_vld(interm_ptr, 128);
+
+  in16 = in20;
+  in17 = in21;
+  DUP2_ARG1(__lsx_vneg_h, in16, in17, tmp0, tmp1);
+  DOTP_CONST_PAIR(tmp0, in27, cospi_24_64, cospi_8_64, in20, in27);
+  DOTP_CONST_PAIR(tmp1, in26, cospi_24_64, cospi_8_64, in21, in26);
+
+  in22 = __lsx_vld(interm_ptr, 80);
+  in25 = __lsx_vld(interm_ptr, 96);
+  in24 = __lsx_vld(interm_ptr, 144);
+  in23 = __lsx_vld(interm_ptr, 160);
+
+  DUP4_ARG2(__lsx_vsub_h, in23, in20, in22, in21, in25, in26, in24, in27, in28,
+            in17, in18, in31);
+  DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
+  in16 = __lsx_vadd_h(in28, in29);
+  in19 = __lsx_vadd_h(in31, in30);
+  DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
+  __lsx_vst(vec5, out, 64);
+  __lsx_vst(vec4, out, 176);
+
+  DUP2_ARG2(__lsx_vsub_h, in28, in29, in31, in30, in17, in18);
+  DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
+  __lsx_vst(vec5, out, 80);
+  __lsx_vst(vec4, out, 160);
+
+  DUP4_ARG2(__lsx_vadd_h, in22, in21, in23, in20, in24, in27, in25, in26, in16,
+            in29, in30, in19);
+  tmp0 = __lsx_vneg_h(in16);
+  DOTP_CONST_PAIR(tmp0, in19, cospi_12_64, cospi_20_64, in28, in31);
+  DUP2_ARG2(__lsx_vsub_h, in29, in28, in30, in31, in16, in19);
+  DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
+  __lsx_vst(vec5, out, 144);
+  __lsx_vst(vec4, out, 96);
+
+  DUP2_ARG2(__lsx_vadd_h, in29, in28, in30, in31, in17, in18);
+  DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
+  __lsx_vst(vec4, out, 112);
+  __lsx_vst(vec5, out, 128);
+}
+
+static void fdct32x8_1d_row_rd(int16_t *tmp_buf_big, int16_t *tmp_buf,
+                               int16_t *output) {
+  fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf);
+  fdct8x32_1d_row_even_rd(tmp_buf, tmp_buf);
+  fdct8x32_1d_row_odd_rd((tmp_buf + 128), tmp_buf_big, (tmp_buf + 128));
+  fdct8x32_1d_row_transpose_store(tmp_buf, output);
+}
+
+void vpx_fdct32x32_rd_lsx(const int16_t *input, int16_t *out,
+                          int32_t src_stride) {
+  int32_t i;
+  DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
+  DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);
+
+  /* column transform */
+  for (i = 0; i < 4; ++i) {
+    fdct8x32_1d_column(input + (8 * i), src_stride, &tmp_buf[0],
+                       &tmp_buf_big[0] + (8 * i));
+  }
+  /* row transform */
+  for (i = 0; i < 4; ++i) {
+    fdct32x8_1d_row_rd(&tmp_buf_big[0] + (8 * i * 32), &tmp_buf[0],
+                       out + (8 * i * 32));
+  }
+}
diff --git a/vpx_dsp/loongarch/fwd_txfm_lsx.c b/vpx_dsp/loongarch/fwd_txfm_lsx.c
new file mode 100644
index 0000000000..508532b9d8
--- /dev/null
+++ b/vpx_dsp/loongarch/fwd_txfm_lsx.c
@@ -0,0 +1,350 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/fwd_txfm_lsx.h"
+
+#define LSX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  do {                                                                         \
+    __m128i _s0, _s1, _s2, _s3, _t0, _t1, _t2, _t3;                            \
+                                                                               \
+    DUP2_ARG2(__lsx_vilvl_h, _in2, _in0, _in3, _in1, _s0, _s1);                \
+    DUP2_ARG2(__lsx_vilvh_h, _in2, _in0, _in3, _in1, _s2, _s3);                \
+    _t0 = __lsx_vilvl_h(_s1, _s0);                                             \
+    _t1 = __lsx_vilvh_h(_s1, _s0);                                             \
+    _t2 = __lsx_vilvl_h(_s3, _s2);                                             \
+    _t3 = __lsx_vilvh_h(_s3, _s2);                                             \
+    DUP2_ARG2(__lsx_vpickev_d, _t2, _t0, _t3, _t1, _out0, _out2);              \
+    DUP2_ARG2(__lsx_vpickod_d, _t2, _t0, _t3, _t1, _out1, _out3);              \
+  } while (0)
+
+#if !CONFIG_VP9_HIGHBITDEPTH
+void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
+                        int32_t src_stride) {
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+  __m128i stp21, stp22, stp23, stp24, stp25, stp26, stp30;
+  __m128i stp31, stp32, stp33, stp34, stp35, stp36, stp37;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5;
+  __m128i coeff = { 0x187e3b21d2bf2d41, 0x238e3537e782c4df };
+  __m128i coeff1 = { 0x289a317906463fb1, 0x12943d3f1e2b3871 };
+  __m128i coeff2 = { 0xed6cd766c78fc04f, 0x0 };
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t src_stride6 = src_stride4 + src_stride2;
+  int32_t src_stride8 = src_stride4 << 1;
+  int16_t *input_tmp = (int16_t *)input;
+  in0 = __lsx_vld(input_tmp, 0);
+  DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4,
+            input_tmp, src_stride6, input_tmp, src_stride8, in1, in2, in3, in4);
+  input_tmp += src_stride4;
+  DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4,
+            input_tmp, src_stride6, input_tmp, src_stride8, in5, in6, in7, in8);
+  input_tmp += src_stride4;
+  DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4,
+            input_tmp, src_stride6, input_tmp, src_stride8, in9, in10, in11,
+            in12);
+  input_tmp += src_stride4;
+  DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in13,
+            in14);
+  input_tmp += src_stride2;
+  in15 = __lsx_vldx(input_tmp, src_stride2);
+
+  DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+  DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7);
+  DUP4_ARG2(__lsx_vslli_h, in8, 2, in9, 2, in10, 2, in11, 2, in8, in9, in10,
+            in11);
+  DUP4_ARG2(__lsx_vslli_h, in12, 2, in13, 2, in14, 2, in15, 2, in12, in13, in14,
+            in15);
+  DUP4_ARG2(__lsx_vadd_h, in0, in15, in1, in14, in2, in13, in3, in12, tmp0,
+            tmp1, tmp2, tmp3);
+  DUP4_ARG2(__lsx_vadd_h, in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5,
+            tmp6, tmp7);
+  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
+                tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+  __lsx_vst(tmp0, tmp_ptr, 0);
+  __lsx_vst(tmp1, tmp_ptr, 64);
+  __lsx_vst(tmp2, tmp_ptr, 128);
+  __lsx_vst(tmp3, tmp_ptr, 192);
+  __lsx_vst(tmp4, tmp_ptr, 256);
+  __lsx_vst(tmp5, tmp_ptr, 320);
+  __lsx_vst(tmp6, tmp_ptr, 384);
+  __lsx_vst(tmp7, tmp_ptr, 448);
+  DUP4_ARG2(__lsx_vsub_h, in0, in15, in1, in14, in2, in13, in3, in12, in15,
+            in14, in13, in12);
+  DUP4_ARG2(__lsx_vsub_h, in4, in11, in5, in10, in6, in9, in7, in8, in11, in10,
+            in9, in8);
+
+  tmp_ptr += 16;
+
+  /* stp 1 */
+  DUP2_ARG2(__lsx_vilvh_h, in10, in13, in11, in12, vec2, vec4);
+  DUP2_ARG2(__lsx_vilvl_h, in10, in13, in11, in12, vec3, vec5);
+
+  cnst4 = __lsx_vreplvei_h(coeff, 0);
+  DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4, stp25);
+
+  cnst5 = __lsx_vreplvei_h(coeff, 1);
+  cnst5 = __lsx_vpackev_h(cnst5, cnst4);
+  DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5, stp22);
+  DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4, stp24);
+  DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5, stp23);
+
+  /* stp2 */
+  LSX_BUTTERFLY_4_H(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33);
+  LSX_BUTTERFLY_4_H(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34);
+  DUP2_ARG2(__lsx_vilvh_h, stp36, stp31, stp35, stp32, vec2, vec4);
+  DUP2_ARG2(__lsx_vilvl_h, stp36, stp31, stp35, stp32, vec3, vec5);
+  DUP2_ARG2(__lsx_vreplvei_h, coeff, 2, coeff, 3, cnst0, cnst1);
+  cnst0 = __lsx_vpackev_h(cnst0, cnst1);
+  DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0, stp26);
+
+  cnst0 = __lsx_vreplvei_h(coeff, 4);
+  cnst1 = __lsx_vpackev_h(cnst1, cnst0);
+  DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1, stp21);
+
+  LSX_BUTTERFLY_4_H(stp30, stp37, stp26, stp21, in8, in15, in14, in9);
+  vec1 = __lsx_vilvl_h(in15, in8);
+  vec0 = __lsx_vilvh_h(in15, in8);
+
+  DUP2_ARG2(__lsx_vreplvei_h, coeff1, 0, coeff1, 1, cnst0, cnst1);
+  cnst0 = __lsx_vpackev_h(cnst0, cnst1);
+
+  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
+  __lsx_vst(in8, tmp_ptr, 0);
+
+  cnst0 = __lsx_vreplvei_h(coeff2, 0);
+  cnst0 = __lsx_vpackev_h(cnst1, cnst0);
+  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
+  __lsx_vst(in8, tmp_ptr, 448);
+
+  vec1 = __lsx_vilvl_h(in14, in9);
+  vec0 = __lsx_vilvh_h(in14, in9);
+  DUP2_ARG2(__lsx_vreplvei_h, coeff1, 2, coeff1, 3, cnst0, cnst1);
+  cnst1 = __lsx_vpackev_h(cnst1, cnst0);
+
+  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1, in8);
+  __lsx_vst(in8, tmp_ptr, 256);
+
+  cnst1 = __lsx_vreplvei_h(coeff2, 2);
+  cnst0 = __lsx_vpackev_h(cnst0, cnst1);
+  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
+  __lsx_vst(in8, tmp_ptr, 192);
+
+  DUP2_ARG2(__lsx_vreplvei_h, coeff, 2, coeff, 5, cnst0, cnst1);
+  cnst1 = __lsx_vpackev_h(cnst1, cnst0);
+  DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1, stp25);
+
+  cnst1 = __lsx_vreplvei_h(coeff, 3);
+  cnst1 = __lsx_vpackev_h(cnst0, cnst1);
+  DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1, stp22);
+
+  /* stp4 */
+  DUP2_ARG2(__lsx_vadd_h, stp34, stp25, stp33, stp22, in13, in10);
+
+  vec1 = __lsx_vilvl_h(in13, in10);
+  vec0 = __lsx_vilvh_h(in13, in10);
+  DUP2_ARG2(__lsx_vreplvei_h, coeff1, 4, coeff1, 5, cnst0, cnst1);
+  cnst0 = __lsx_vpackev_h(cnst0, cnst1);
+  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
+  __lsx_vst(in8, tmp_ptr, 128);
+
+  cnst0 = __lsx_vreplvei_h(coeff2, 1);
+  cnst0 = __lsx_vpackev_h(cnst1, cnst0);
+  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
+  __lsx_vst(in8, tmp_ptr, 320);
+
+  DUP2_ARG2(__lsx_vsub_h, stp34, stp25, stp33, stp22, in12, in11);
+  vec1 = __lsx_vilvl_h(in12, in11);
+  vec0 = __lsx_vilvh_h(in12, in11);
+  DUP2_ARG2(__lsx_vreplvei_h, coeff1, 6, coeff1, 7, cnst0, cnst1);
+  cnst1 = __lsx_vpackev_h(cnst1, cnst0);
+
+  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1, in8);
+  __lsx_vst(in8, tmp_ptr, 384);
+
+  cnst1 = __lsx_vreplvei_h(coeff2, 3);
+  cnst0 = __lsx_vpackev_h(cnst0, cnst1);
+  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
+  __lsx_vst(in8, tmp_ptr, 64);
+}
+
+void fdct16x8_1d_row(int16_t *input, int16_t *output) {
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+  int16_t *input_tmp = input;
+
+  DUP4_ARG2(__lsx_vld, input, 0, input, 32, input, 64, input, 96, in0, in1, in2,
+            in3);
+  DUP4_ARG2(__lsx_vld, input, 128, input, 160, input, 192, input, 224, in4, in5,
+            in6, in7);
+  DUP4_ARG2(__lsx_vld, input_tmp, 16, input_tmp, 48, input_tmp, 80, input_tmp,
+            112, in8, in9, in10, in11);
+  DUP4_ARG2(__lsx_vld, input_tmp, 144, input_tmp, 176, input_tmp, 208,
+            input_tmp, 240, in12, in13, in14, in15);
+
+  LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+                     in4, in5, in6, in7);
+  LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
+                     in10, in11, in12, in13, in14, in15);
+  DUP4_ARG2(__lsx_vaddi_hu, in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
+  DUP4_ARG2(__lsx_vaddi_hu, in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7);
+  DUP4_ARG2(__lsx_vaddi_hu, in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10,
+            in11);
+  DUP4_ARG2(__lsx_vaddi_hu, in12, 1, in13, 1, in14, 1, in15, 1, in12, in13,
+            in14, in15);
+
+  DUP4_ARG2(__lsx_vsrai_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+  DUP4_ARG2(__lsx_vsrai_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7);
+  DUP4_ARG2(__lsx_vsrai_h, in8, 2, in9, 2, in10, 2, in11, 2, in8, in9, in10,
+            in11);
+  DUP4_ARG2(__lsx_vsrai_h, in12, 2, in13, 2, in14, 2, in15, 2, in12, in13, in14,
+            in15);
+  LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
+                     in11, in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4,
+                     tmp5, tmp6, tmp7, in8, in9, in10, in11, in12, in13, in14,
+                     in15);
+  __lsx_vst(in8, input, 0);
+  __lsx_vst(in9, input, 32);
+  __lsx_vst(in10, input, 64);
+  __lsx_vst(in11, input, 96);
+  __lsx_vst(in12, input, 128);
+  __lsx_vst(in13, input, 160);
+  __lsx_vst(in14, input, 192);
+  __lsx_vst(in15, input, 224);
+
+  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
+                tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+  DUP4_ARG2(__lsx_vld, input, 0, input, 32, input, 64, input, 96, in8, in9,
+            in10, in11);
+  DUP4_ARG2(__lsx_vld, input, 128, input, 160, input, 192, input, 224, in12,
+            in13, in14, in15);
+  FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3,
+               in4, in5, in6, in7);
+  LSX_TRANSPOSE8x8_H(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0,
+                     tmp1, in1, tmp2, in2, tmp3, in3);
+  __lsx_vst(tmp0, output, 0);
+  __lsx_vst(in0, output, 32);
+  __lsx_vst(tmp1, output, 64);
+  __lsx_vst(in1, output, 96);
+  __lsx_vst(tmp2, output, 128);
+  __lsx_vst(in2, output, 160);
+  __lsx_vst(tmp3, output, 192);
+  __lsx_vst(in3, output, 224);
+
+  LSX_TRANSPOSE8x8_H(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4,
+                     tmp5, in5, tmp6, in6, tmp7, in7);
+  __lsx_vst(tmp4, output, 16);
+  __lsx_vst(in4, output, 48);
+  __lsx_vst(tmp5, output, 80);
+  __lsx_vst(in5, output, 112);
+  __lsx_vst(tmp6, output, 144);
+  __lsx_vst(in6, output, 176);
+  __lsx_vst(tmp7, output, 208);
+  __lsx_vst(in7, output, 240);
+}
+
+void vpx_fdct4x4_lsx(const int16_t *input, int16_t *output,
+                     int32_t src_stride) {
+  __m128i in0, in1, in2, in3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t src_stride6 = src_stride4 + src_stride2;
+
+  in0 = __lsx_vld(input, 0);
+  DUP2_ARG2(__lsx_vldx, input, src_stride2, input, src_stride4, in1, in2);
+  in3 = __lsx_vldx(input, src_stride6);
+
+  /* fdct4 pre-process */
+  {
+    __m128i vec, mask;
+    __m128i zero = __lsx_vldi(0);
+
+    mask = __lsx_vinsgr2vr_b(zero, 1, 0);
+    DUP4_ARG2(__lsx_vslli_h, in0, 4, in1, 4, in2, 4, in3, 4, in0, in1, in2,
+              in3);
+    vec = __lsx_vseqi_h(in0, 0);
+    vec = __lsx_vxori_b(vec, 255);
+    vec = __lsx_vand_v(mask, vec);
+    in0 = __lsx_vadd_h(in0, vec);
+  }
+
+  VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+  LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+  VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+  LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+  DUP4_ARG2(__lsx_vaddi_hu, in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
+  DUP4_ARG2(__lsx_vsrai_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+  DUP2_ARG2(__lsx_vpickev_d, in1, in0, in3, in2, in0, in2);
+  __lsx_vst(in0, output, 0);
+  __lsx_vst(in2, output, 16);
+}
+
+void vpx_fdct8x8_lsx(const int16_t *input, int16_t *output,
+                     int32_t src_stride) {
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t src_stride6 = src_stride4 + src_stride2;
+  int16_t *input_tmp = (int16_t *)input;
+
+  in0 = __lsx_vld(input_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in1,
+            in2);
+  in3 = __lsx_vldx(input_tmp, src_stride6);
+  input_tmp += src_stride4;
+  in4 = __lsx_vld(input_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in5,
+            in6);
+  in7 = __lsx_vldx(input_tmp, src_stride6);
+
+  DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+  DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7);
+
+  VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+            in5, in6, in7);
+  LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+                     in4, in5, in6, in7);
+  VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+            in5, in6, in7);
+  LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+                     in4, in5, in6, in7);
+  SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7);
+
+  __lsx_vst(in0, output, 0);
+  __lsx_vst(in1, output, 16);
+  __lsx_vst(in2, output, 32);
+  __lsx_vst(in3, output, 48);
+  __lsx_vst(in4, output, 64);
+  __lsx_vst(in5, output, 80);
+  __lsx_vst(in6, output, 96);
+  __lsx_vst(in7, output, 112);
+}
+
+void vpx_fdct16x16_lsx(const int16_t *input, int16_t *output,
+                       int32_t src_stride) {
+  int32_t i;
+  DECLARE_ALIGNED(32, int16_t, tmp_buf[16 * 16]);
+
+  /* column transform */
+  for (i = 0; i < 2; ++i) {
+    fdct8x16_1d_column((input + 8 * i), (&tmp_buf[0] + 8 * i), src_stride);
+  }
+
+  /* row transform */
+  for (i = 0; i < 2; ++i) {
+    fdct16x8_1d_row((&tmp_buf[0] + (128 * i)), (output + (128 * i)));
+  }
+}
+#endif  // !CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/loongarch/fwd_txfm_lsx.h b/vpx_dsp/loongarch/fwd_txfm_lsx.h
new file mode 100644
index 0000000000..4a9fce9a3d
--- /dev/null
+++ b/vpx_dsp/loongarch/fwd_txfm_lsx.h
@@ -0,0 +1,381 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_
+#define VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_
+
+#include "vpx_dsp/loongarch/txfm_macros_lsx.h"
+#include "vpx_dsp/txfm_common.h"
+
+#define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3)                 \
+  do {                                                                        \
+    __m128i cnst0_m, cnst1_m, cnst2_m, cnst3_m;                               \
+    __m128i vec0_m, vec1_m, vec2_m, vec3_m;                                   \
+    __m128i vec4_m, vec5_m, vec6_m, vec7_m;                                   \
+    __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x000000000000c4df };             \
+                                                                              \
+    LSX_BUTTERFLY_4_H(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m);    \
+    DUP2_ARG2(__lsx_vilvl_h, vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, cnst0_m, cnst1_m);    \
+    cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m);                              \
+    vec5_m = __lsx_vdp2_w_h(vec0_m, cnst1_m);                                 \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 3, cnst2_m, cnst3_m);    \
+    cnst2_m = __lsx_vpackev_h(cnst3_m, cnst2_m);                              \
+    vec7_m = __lsx_vdp2_w_h(vec2_m, cnst2_m);                                 \
+                                                                              \
+    vec4_m = __lsx_vdp2_w_h(vec0_m, cnst0_m);                                 \
+    cnst2_m = __lsx_vreplvei_h(coeff_m, 2);                                   \
+    cnst2_m = __lsx_vpackev_h(cnst2_m, cnst3_m);                              \
+    vec6_m = __lsx_vdp2_w_h(vec2_m, cnst2_m);                                 \
+                                                                              \
+    DUP4_ARG3(__lsx_vssrarni_h_w, vec4_m, vec4_m, DCT_CONST_BITS, vec5_m,     \
+              vec5_m, DCT_CONST_BITS, vec6_m, vec6_m, DCT_CONST_BITS, vec7_m, \
+              vec7_m, DCT_CONST_BITS, out0, out2, out1, out3);                \
+  } while (0)
+
+#define VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
+                  out3, out4, out5, out6, out7)                             \
+  do {                                                                      \
+    __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m;                       \
+    __m128i s7_m, x0_m, x1_m, x2_m, x3_m;                                   \
+    __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 };           \
+                                                                            \
+    /* FDCT stage1 */                                                       \
+    LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m,   \
+                      s2_m, s3_m, s4_m, s5_m, s6_m, s7_m);                  \
+    LSX_BUTTERFLY_4_H(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m);      \
+    DUP2_ARG2(__lsx_vilvh_h, x1_m, x0_m, x3_m, x2_m, s0_m, s2_m);           \
+    DUP2_ARG2(__lsx_vilvl_h, x1_m, x0_m, x3_m, x2_m, s1_m, s3_m);           \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, x0_m, x1_m);        \
+    x1_m = __lsx_vpackev_h(x1_m, x0_m);                                     \
+    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, out4);                          \
+                                                                            \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, x2_m, x3_m);        \
+    x2_m = __lsx_vneg_h(x2_m);                                              \
+    x2_m = __lsx_vpackev_h(x3_m, x2_m);                                     \
+    DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out6);                          \
+                                                                            \
+    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, out0);                          \
+    x2_m = __lsx_vreplvei_h(coeff_m, 2);                                    \
+    x2_m = __lsx_vpackev_h(x2_m, x3_m);                                     \
+    DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out2);                          \
+                                                                            \
+    /* stage2 */                                                            \
+    s1_m = __lsx_vilvl_h(s5_m, s6_m);                                       \
+    s0_m = __lsx_vilvh_h(s5_m, s6_m);                                       \
+                                                                            \
+    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, s6_m);                          \
+    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, s5_m);                          \
+                                                                            \
+    /* stage3 */                                                            \
+    LSX_BUTTERFLY_4_H(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m);      \
+                                                                            \
+    /* stage4 */                                                            \
+    DUP2_ARG2(__lsx_vilvh_h, x3_m, x0_m, x2_m, x1_m, s4_m, s6_m);           \
+    DUP2_ARG2(__lsx_vilvl_h, x3_m, x0_m, x2_m, x1_m, s5_m, s7_m);           \
+                                                                            \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 5, x0_m, x1_m);        \
+    x1_m = __lsx_vpackev_h(x0_m, x1_m);                                     \
+    DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m, out1);                          \
+                                                                            \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 6, coeff_m, 7, x2_m, x3_m);        \
+    x2_m = __lsx_vpackev_h(x3_m, x2_m);                                     \
+    DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out5);                          \
+                                                                            \
+    x1_m = __lsx_vreplvei_h(coeff_m, 5);                                    \
+    x0_m = __lsx_vneg_h(x0_m);                                              \
+    x0_m = __lsx_vpackev_h(x1_m, x0_m);                                     \
+    DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m, out7);                          \
+    x2_m = __lsx_vreplvei_h(coeff_m, 6);                                    \
+    x3_m = __lsx_vneg_h(x3_m);                                              \
+    x2_m = __lsx_vpackev_h(x2_m, x3_m);                                     \
+    DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3);                          \
+  } while (0)
+
+#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7)             \
+  do {                                                                      \
+    __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+                                                                            \
+    DUP4_ARG2(__lsx_vsrli_h, in0, 15, in1, 15, in2, 15, in3, 15, vec0_m,    \
+              vec1_m, vec2_m, vec3_m);                                      \
+    DUP4_ARG2(__lsx_vsrli_h, in4, 15, in5, 15, in6, 15, in7, 15, vec4_m,    \
+              vec5_m, vec6_m, vec7_m);                                      \
+    DUP4_ARG2(__lsx_vavg_h, vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m,  \
+              in3, in0, in1, in2, in3);                                     \
+    DUP4_ARG2(__lsx_vavg_h, vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m,  \
+              in7, in4, in5, in6, in7);                                     \
+  } while (0)
+
+#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) \
+  do {                                       \
+    __m128i tp0_m, tp1_m;                    \
+    __m128i one = __lsx_vreplgr2vr_h(1);     \
+                                             \
+    tp0_m = __lsx_vslei_h(vec0, 0);          \
+    tp1_m = __lsx_vslei_h(vec1, 0);          \
+    tp0_m = __lsx_vxori_b(tp0_m, 255);       \
+    tp1_m = __lsx_vxori_b(tp1_m, 255);       \
+    vec0 = __lsx_vadd_h(vec0, one);          \
+    vec1 = __lsx_vadd_h(vec1, one);          \
+    tp0_m = __lsx_vand_v(one, tp0_m);        \
+    tp1_m = __lsx_vand_v(one, tp1_m);        \
+    vec0 = __lsx_vadd_h(vec0, tp0_m);        \
+    vec1 = __lsx_vadd_h(vec1, tp1_m);        \
+    vec0 = __lsx_vsrai_h(vec0, 2);           \
+    vec1 = __lsx_vsrai_h(vec1, 2);           \
+  } while (0)
+
+#define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) \
+  do {                                     \
+    __m128i tp0_m, tp1_m;                  \
+    __m128i one_m = __lsx_vldi(0x401);     \
+                                           \
+    tp0_m = __lsx_vslti_h(vec0, 0);        \
+    tp1_m = __lsx_vslti_h(vec1, 0);        \
+    vec0 = __lsx_vadd_h(vec0, one_m);      \
+    vec1 = __lsx_vadd_h(vec1, one_m);      \
+    tp0_m = __lsx_vand_v(one_m, tp0_m);    \
+    tp1_m = __lsx_vand_v(one_m, tp1_m);    \
+    vec0 = __lsx_vadd_h(vec0, tp0_m);      \
+    vec1 = __lsx_vadd_h(vec1, tp1_m);      \
+    vec0 = __lsx_vsrai_h(vec0, 2);         \
+    vec1 = __lsx_vsrai_h(vec1, 2);         \
+  } while (0)
+
+#define FDCT32_POSTPROC_NEG_W(vec)         \
+  do {                                     \
+    __m128i temp_m;                        \
+    __m128i one_m = __lsx_vreplgr2vr_w(1); \
+                                           \
+    temp_m = __lsx_vslti_w(vec, 0);        \
+    vec = __lsx_vadd_w(vec, one_m);        \
+    temp_m = __lsx_vand_v(one_m, temp_m);  \
+    vec = __lsx_vadd_w(vec, temp_m);       \
+    vec = __lsx_vsrai_w(vec, 2);           \
+  } while (0)
+
+#define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, reg1_right,       \
+                          const0, const1, out0, out1, out2, out3)             \
+  do {                                                                        \
+    __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m;                   \
+    __m128i tp0_m, tp1_m, tp2_m, tp3_m, _tmp0, _tmp1;                         \
+    __m128i k0_m = __lsx_vreplgr2vr_w((int32_t)const0);                       \
+                                                                              \
+    s0_m = __lsx_vreplgr2vr_w((int32_t)const1);                               \
+    k0_m = __lsx_vpackev_w(s0_m, k0_m);                                       \
+                                                                              \
+    DUP2_ARG1(__lsx_vneg_w, reg1_left, reg1_right, _tmp0, _tmp1);             \
+    s1_m = __lsx_vilvl_w(_tmp0, reg0_left);                                   \
+    s0_m = __lsx_vilvh_w(_tmp0, reg0_left);                                   \
+    s3_m = __lsx_vilvl_w(reg0_left, reg1_left);                               \
+    s2_m = __lsx_vilvh_w(reg0_left, reg1_left);                               \
+    s5_m = __lsx_vilvl_w(_tmp1, reg0_right);                                  \
+    s4_m = __lsx_vilvh_w(_tmp1, reg0_right);                                  \
+    s7_m = __lsx_vilvl_w(reg0_right, reg1_right);                             \
+    s6_m = __lsx_vilvh_w(reg0_right, reg1_right);                             \
+    DUP2_ARG2(__lsx_vdp2_d_w, s0_m, k0_m, s1_m, k0_m, tp0_m, tp1_m);          \
+    DUP2_ARG2(__lsx_vdp2_d_w, s4_m, k0_m, s5_m, k0_m, tp2_m, tp3_m);          \
+    DUP2_ARG3(__lsx_vssrarni_w_d, tp0_m, tp1_m, DCT_CONST_BITS, tp2_m, tp3_m, \
+              DCT_CONST_BITS, out0, out1);                                    \
+    DUP2_ARG2(__lsx_vdp2_d_w, s2_m, k0_m, s3_m, k0_m, tp0_m, tp1_m);          \
+    DUP2_ARG2(__lsx_vdp2_d_w, s6_m, k0_m, s7_m, k0_m, tp2_m, tp3_m);          \
+    DUP2_ARG3(__lsx_vssrarni_w_d, tp0_m, tp1_m, DCT_CONST_BITS, tp2_m, tp3_m, \
+              DCT_CONST_BITS, out2, out3);                                    \
+  } while (0)
+
+#define VP9_ADDBLK_ST8x4_UB(dst, _stride, _stride2, _stride3, in0, in1, in2,   \
+                            in3)                                               \
+  do {                                                                         \
+    __m128i dst0_m, dst1_m, dst2_m, dst3_m;                                    \
+    __m128i tmp0_m, tmp1_m;                                                    \
+    __m128i res0_m, res1_m, res2_m, res3_m;                                    \
+                                                                               \
+    dst0_m = __lsx_vld(dst, 0);                                                \
+    DUP2_ARG2(__lsx_vldx, dst, _stride, dst, _stride2, dst1_m, dst2_m);        \
+    dst3_m = __lsx_vldx(dst, _stride3);                                        \
+    DUP4_ARG2(__lsx_vsllwil_hu_bu, dst0_m, 0, dst1_m, 0, dst2_m, 0, dst3_m, 0, \
+              res0_m, res1_m, res2_m, res3_m);                                 \
+    DUP4_ARG2(__lsx_vadd_h, res0_m, in0, res1_m, in1, res2_m, in2, res3_m,     \
+              in3, res0_m, res1_m, res2_m, res3_m);                            \
+    DUP2_ARG3(__lsx_vssrarni_bu_h, res1_m, res0_m, 0, res3_m, res2_m, 0,       \
+              tmp0_m, tmp1_m);                                                 \
+    __lsx_vstelm_d(tmp0_m, dst, 0, 0);                                         \
+    __lsx_vstelm_d(tmp0_m, dst + _stride, 0, 1);                               \
+    __lsx_vstelm_d(tmp1_m, dst + _stride2, 0, 0);                              \
+    __lsx_vstelm_d(tmp1_m, dst + _stride3, 0, 1);                              \
+  } while (0)
+
+#define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+                      out2, out3, out4, out5, out6, out7)                 \
+  do {                                                                    \
+    __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m;               \
+    __m128i x0_m, x1_m, x2_m, x3_m;                                       \
+    __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 };         \
+                                                                          \
+    /* FDCT stage1 */                                                     \
+    LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, \
+                      s2_m, s3_m, s4_m, s5_m, s6_m, s7_m);                \
+    LSX_BUTTERFLY_4_H(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m);    \
+    DUP2_ARG2(__lsx_vilvh_h, x1_m, x0_m, x3_m, x2_m, s0_m, s2_m);         \
+    DUP2_ARG2(__lsx_vilvl_h, x1_m, x0_m, x3_m, x2_m, s1_m, s3_m);         \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, x0_m, x1_m);      \
+    x1_m = __lsx_vpackev_h(x1_m, x0_m);                                   \
+    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, out4);                        \
+                                                                          \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, x2_m, x3_m);      \
+    x2_m = __lsx_vneg_h(x2_m);                                            \
+    x2_m = __lsx_vpackev_h(x3_m, x2_m);                                   \
+    DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out6);                        \
+                                                                          \
+    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, out0);                        \
+    x2_m = __lsx_vreplvei_h(coeff_m, 2);                                  \
+    x2_m = __lsx_vpackev_h(x2_m, x3_m);                                   \
+    DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out2);                        \
+                                                                          \
+    /* stage2 */                                                          \
+    s1_m = __lsx_vilvl_h(s5_m, s6_m);                                     \
+    s0_m = __lsx_vilvh_h(s5_m, s6_m);                                     \
+                                                                          \
+    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, s6_m);                        \
+    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, s5_m);                        \
+                                                                          \
+    /* stage3 */                                                          \
+    LSX_BUTTERFLY_4_H(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m);    \
+                                                                          \
+    /* stage4 */                                                          \
+    DUP2_ARG2(__lsx_vilvh_h, x3_m, x0_m, x2_m, x1_m, s4_m, s6_m);         \
+    DUP2_ARG2(__lsx_vilvl_h, x3_m, x0_m, x2_m, x1_m, s5_m, s7_m);         \
+                                                                          \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 5, x0_m, x1_m);      \
+    x1_m = __lsx_vpackev_h(x0_m, x1_m);                                   \
+    DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m, out1);                        \
+                                                                          \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 6, coeff_m, 7, x2_m, x3_m);      \
+    x2_m = __lsx_vpackev_h(x3_m, x2_m);                                   \
+    DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out5);                        \
+                                                                          \
+    x1_m = __lsx_vreplvei_h(coeff_m, 5);                                  \
+    x0_m = __lsx_vneg_h(x0_m);                                            \
+    x0_m = __lsx_vpackev_h(x1_m, x0_m);                                   \
+    DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m, out7);                        \
+                                                                          \
+    x2_m = __lsx_vreplvei_h(coeff_m, 6);                                  \
+    x3_m = __lsx_vneg_h(x3_m);                                            \
+    x2_m = __lsx_vpackev_h(x2_m, x3_m);                                   \
+    DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3);                        \
+  } while (0)
+
+#define FDCT8x16_ODD(input0, input1, input2, input3, input4, input5, input6,  \
+                     input7, out1, out3, out5, out7, out9, out11, out13,      \
+                     out15)                                                   \
+  do {                                                                        \
+    __m128i stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m;             \
+    __m128i stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m;             \
+    __m128i stp36_m, stp37_m, vec0_m, vec1_m;                                 \
+    __m128i vec2_m, vec3_m, vec4_m, vec5_m, vec6_m;                           \
+    __m128i cnst0_m, cnst1_m, cnst4_m, cnst5_m;                               \
+    __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e3537e782c4df };             \
+    __m128i coeff1_m = { 0x289a317906463fb1, 0x12943d3f1e2b3871 };            \
+    __m128i coeff2_m = { 0xed6cd766c78fc04f, 0x0 };                           \
+                                                                              \
+    /* stp 1 */                                                               \
+    DUP2_ARG2(__lsx_vilvh_h, input2, input5, input3, input4, vec2_m, vec4_m); \
+    DUP2_ARG2(__lsx_vilvl_h, input2, input5, input3, input4, vec3_m, vec5_m); \
+                                                                              \
+    cnst4_m = __lsx_vreplvei_h(coeff_m, 0);                                   \
+    DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst4_m, stp25_m);                  \
+                                                                              \
+    cnst5_m = __lsx_vreplvei_h(coeff_m, 1);                                   \
+    cnst5_m = __lsx_vpackev_h(cnst5_m, cnst4_m);                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst5_m, stp22_m);                  \
+    DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst4_m, stp24_m);                  \
+    DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m, stp23_m);                  \
+                                                                              \
+    /* stp2 */                                                                \
+    LSX_BUTTERFLY_4_H(input0, input1, stp22_m, stp23_m, stp30_m, stp31_m,     \
+                      stp32_m, stp33_m);                                      \
+    LSX_BUTTERFLY_4_H(input7, input6, stp25_m, stp24_m, stp37_m, stp36_m,     \
+                      stp35_m, stp34_m);                                      \
+                                                                              \
+    DUP2_ARG2(__lsx_vilvh_h, stp36_m, stp31_m, stp35_m, stp32_m, vec2_m,      \
+              vec4_m);                                                        \
+    DUP2_ARG2(__lsx_vilvl_h, stp36_m, stp31_m, stp35_m, stp32_m, vec3_m,      \
+              vec5_m);                                                        \
+                                                                              \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, cnst0_m, cnst1_m);    \
+    cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m);                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m, stp26_m);                  \
+                                                                              \
+    cnst0_m = __lsx_vreplvei_h(coeff_m, 4);                                   \
+    cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m);                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m, stp21_m);                  \
+                                                                              \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 5, coeff_m, 2, cnst0_m, cnst1_m);    \
+    cnst1_m = __lsx_vpackev_h(cnst0_m, cnst1_m);                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m, stp25_m);                  \
+                                                                              \
+    cnst0_m = __lsx_vreplvei_h(coeff_m, 3);                                   \
+    cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m);                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m, stp22_m);                  \
+                                                                              \
+    /* stp4 */                                                                \
+    LSX_BUTTERFLY_4_H(stp30_m, stp37_m, stp26_m, stp21_m, vec6_m, vec2_m,     \
+                      vec4_m, vec5_m);                                        \
+    LSX_BUTTERFLY_4_H(stp33_m, stp34_m, stp25_m, stp22_m, stp21_m, stp23_m,   \
+                      stp24_m, stp31_m);                                      \
+                                                                              \
+    vec1_m = __lsx_vilvl_h(vec2_m, vec6_m);                                   \
+    vec0_m = __lsx_vilvh_h(vec2_m, vec6_m);                                   \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 0, coeff1_m, 1, cnst0_m, cnst1_m);  \
+    cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m);                              \
+                                                                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out1);                     \
+                                                                              \
+    cnst0_m = __lsx_vreplvei_h(coeff2_m, 0);                                  \
+    cnst0_m = __lsx_vpackev_h(cnst1_m, cnst0_m);                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out15);                    \
+                                                                              \
+    vec1_m = __lsx_vilvl_h(vec4_m, vec5_m);                                   \
+    vec0_m = __lsx_vilvh_h(vec4_m, vec5_m);                                   \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 2, coeff1_m, 3, cnst0_m, cnst1_m);  \
+    cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m);                              \
+                                                                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m, out9);                     \
+                                                                              \
+    cnst1_m = __lsx_vreplvei_h(coeff2_m, 2);                                  \
+    cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m);                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out7);                     \
+                                                                              \
+    vec1_m = __lsx_vilvl_h(stp23_m, stp21_m);                                 \
+    vec0_m = __lsx_vilvh_h(stp23_m, stp21_m);                                 \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 4, coeff1_m, 5, cnst0_m, cnst1_m);  \
+    cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m);                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out5);                     \
+                                                                              \
+    cnst0_m = __lsx_vreplvei_h(coeff2_m, 1);                                  \
+    cnst0_m = __lsx_vpackev_h(cnst1_m, cnst0_m);                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out11);                    \
+                                                                              \
+    vec1_m = __lsx_vilvl_h(stp24_m, stp31_m);                                 \
+    vec0_m = __lsx_vilvh_h(stp24_m, stp31_m);                                 \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 6, coeff1_m, 7, cnst0_m, cnst1_m);  \
+    cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m);                              \
+                                                                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m, out13);                    \
+                                                                              \
+    cnst1_m = __lsx_vreplvei_h(coeff2_m, 3);                                  \
+    cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m);                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out3);                     \
+  } while (0)
+
+void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
+                        int32_t src_stride);
+void fdct16x8_1d_row(int16_t *input, int16_t *output);
+#endif  // VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_
diff --git a/vpx_dsp/loongarch/idct32x32_lsx.c b/vpx_dsp/loongarch/idct32x32_lsx.c
new file mode 100644
index 0000000000..ec07f57d90
--- /dev/null
+++ b/vpx_dsp/loongarch/idct32x32_lsx.c
@@ -0,0 +1,834 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/fwd_txfm_lsx.h"
+
+#define UNPCK_UB_SH(_in, _out0, _out1)   \
+  do {                                   \
+    _out0 = __lsx_vsllwil_hu_bu(_in, 0); \
+    _out1 = __lsx_vexth_hu_bu(_in);      \
+  } while (0)
+
+static void idct32x8_row_transpose_store(const int16_t *input,
+                                         int16_t *tmp_buf) {
+  __m128i m0, m1, m2, m3, m4, m5, m6, m7;
+  __m128i n0, n1, n2, n3, n4, n5, n6, n7;
+
+  /* 1st & 2nd 8x8 */
+  DUP4_ARG2(__lsx_vld, input, 0, input, 64, input, 128, input, 192, m0, n0, m1,
+            n1);
+  DUP4_ARG2(__lsx_vld, input, 256, input, 320, input, 384, input, 448, m2, n2,
+            m3, n3);
+  DUP4_ARG2(__lsx_vld, input, 16, input, 80, input, 144, input, 208, m4, n4, m5,
+            n5);
+  DUP4_ARG2(__lsx_vld, input, 272, input, 336, input, 400, input, 464, m6, n6,
+            m7, n7);
+
+  LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
+                     n3);
+  LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
+                     n7);
+
+  __lsx_vst(m0, tmp_buf, 0);
+  __lsx_vst(n0, tmp_buf, 16);
+  __lsx_vst(m1, tmp_buf, 32);
+  __lsx_vst(n1, tmp_buf, 48);
+  __lsx_vst(m2, tmp_buf, 64);
+  __lsx_vst(n2, tmp_buf, 80);
+  __lsx_vst(m3, tmp_buf, 96);
+  __lsx_vst(n3, tmp_buf, 112);
+  __lsx_vst(m4, tmp_buf, 128);
+  __lsx_vst(n4, tmp_buf, 144);
+  __lsx_vst(m5, tmp_buf, 160);
+  __lsx_vst(n5, tmp_buf, 176);
+  __lsx_vst(m6, tmp_buf, 192);
+  __lsx_vst(n6, tmp_buf, 208);
+  __lsx_vst(m7, tmp_buf, 224);
+  __lsx_vst(n7, tmp_buf, 240);
+
+  /* 3rd & 4th 8x8 */
+  DUP4_ARG2(__lsx_vld, input, 32, input, 96, input, 160, input, 224, m0, n0, m1,
+            n1);
+  DUP4_ARG2(__lsx_vld, input, 288, input, 352, input, 416, input, 480, m2, n2,
+            m3, n3);
+  DUP4_ARG2(__lsx_vld, input, 48, input, 112, input, 176, input, 240, m4, n4,
+            m5, n5);
+  DUP4_ARG2(__lsx_vld, input, 304, input, 368, input, 432, input, 496, m6, n6,
+            m7, n7);
+
+  LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
+                     n3);
+  LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
+                     n7);
+
+  __lsx_vst(m0, tmp_buf, 256);
+  __lsx_vst(n0, tmp_buf, 272);
+  __lsx_vst(m1, tmp_buf, 288);
+  __lsx_vst(n1, tmp_buf, 304);
+  __lsx_vst(m2, tmp_buf, 320);
+  __lsx_vst(n2, tmp_buf, 336);
+  __lsx_vst(m3, tmp_buf, 352);
+  __lsx_vst(n3, tmp_buf, 368);
+  __lsx_vst(m4, tmp_buf, 384);
+  __lsx_vst(n4, tmp_buf, 400);
+  __lsx_vst(m5, tmp_buf, 416);
+  __lsx_vst(n5, tmp_buf, 432);
+  __lsx_vst(m6, tmp_buf, 448);
+  __lsx_vst(n6, tmp_buf, 464);
+  __lsx_vst(m7, tmp_buf, 480);
+  __lsx_vst(n7, tmp_buf, 496);
+}
+
+static void idct32x8_row_even_process_store(int16_t *tmp_buf,
+                                            int16_t *tmp_eve_buf) {
+  __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  __m128i stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;
+  __m128i tmp0;
+
+  /* Even stage 1 */
+  DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 64, tmp_buf, 128, tmp_buf, 192,
+            reg0, reg1, reg2, reg3);
+  DUP4_ARG2(__lsx_vld, tmp_buf, 256, tmp_buf, 320, tmp_buf, 384, tmp_buf, 448,
+            reg4, reg5, reg6, reg7);
+
+  DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
+  DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
+  LSX_BUTTERFLY_4_H(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
+  DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+
+  loc1 = vec3;
+  loc0 = vec1;
+
+  DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
+  DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
+  LSX_BUTTERFLY_4_H(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
+  LSX_BUTTERFLY_4_H(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
+  LSX_BUTTERFLY_4_H(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);
+
+  /* Even stage 2 */
+  DUP4_ARG2(__lsx_vld, tmp_buf, 32, tmp_buf, 96, tmp_buf, 160, tmp_buf, 224,
+            reg0, reg1, reg2, reg3);
+  DUP4_ARG2(__lsx_vld, tmp_buf, 288, tmp_buf, 352, tmp_buf, 416, tmp_buf, 480,
+            reg4, reg5, reg6, reg7);
+  DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
+  DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
+  DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
+  DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
+
+  vec0 = __lsx_vadd_h(reg0, reg4);
+  reg0 = __lsx_vsub_h(reg0, reg4);
+  reg4 = __lsx_vadd_h(reg6, reg2);
+  reg6 = __lsx_vsub_h(reg6, reg2);
+  reg2 = __lsx_vadd_h(reg1, reg5);
+  reg1 = __lsx_vsub_h(reg1, reg5);
+  reg5 = __lsx_vadd_h(reg7, reg3);
+  reg7 = __lsx_vsub_h(reg7, reg3);
+  reg3 = vec0;
+
+  vec1 = reg2;
+  reg2 = __lsx_vadd_h(reg3, reg4);
+  reg3 = __lsx_vsub_h(reg3, reg4);
+  reg4 = __lsx_vsub_h(reg5, vec1);
+  reg5 = __lsx_vadd_h(reg5, vec1);
+
+  tmp0 = __lsx_vneg_h(reg6);
+  DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
+  DOTP_CONST_PAIR(tmp0, reg1, cospi_24_64, cospi_8_64, reg6, reg1);
+
+  vec0 = __lsx_vsub_h(reg0, reg6);
+  reg0 = __lsx_vadd_h(reg0, reg6);
+  vec1 = __lsx_vsub_h(reg7, reg1);
+  reg7 = __lsx_vadd_h(reg7, reg1);
+
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
+  DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
+
+  /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */
+  LSX_BUTTERFLY_4_H(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0);
+  __lsx_vst(loc0, tmp_eve_buf, 240);
+  __lsx_vst(loc1, tmp_eve_buf, 0);
+  __lsx_vst(loc2, tmp_eve_buf, 224);
+  __lsx_vst(loc3, tmp_eve_buf, 16);
+
+  LSX_BUTTERFLY_4_H(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0);
+  __lsx_vst(loc0, tmp_eve_buf, 208);
+  __lsx_vst(loc1, tmp_eve_buf, 32);
+  __lsx_vst(loc2, tmp_eve_buf, 192);
+  __lsx_vst(loc3, tmp_eve_buf, 48);
+
+  /* Store 8 */
+  LSX_BUTTERFLY_4_H(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0);
+  __lsx_vst(loc0, tmp_eve_buf, 176);
+  __lsx_vst(loc1, tmp_eve_buf, 64);
+  __lsx_vst(loc2, tmp_eve_buf, 160);
+  __lsx_vst(loc3, tmp_eve_buf, 80);
+
+  LSX_BUTTERFLY_4_H(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0);
+  __lsx_vst(loc0, tmp_eve_buf, 144);
+  __lsx_vst(loc1, tmp_eve_buf, 96);
+  __lsx_vst(loc2, tmp_eve_buf, 128);
+  __lsx_vst(loc3, tmp_eve_buf, 112);
+}
+
+static void idct32x8_row_odd_process_store(int16_t *tmp_buf,
+                                           int16_t *tmp_odd_buf) {
+  __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+
+  /* Odd stage 1 */
+  DUP4_ARG2(__lsx_vld, tmp_buf, 16, tmp_buf, 112, tmp_buf, 144, tmp_buf, 240,
+            reg0, reg1, reg2, reg3);
+  DUP4_ARG2(__lsx_vld, tmp_buf, 272, tmp_buf, 368, tmp_buf, 400, tmp_buf, 496,
+            reg4, reg5, reg6, reg7);
+
+  DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
+  DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
+  DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
+  DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
+
+  vec0 = __lsx_vadd_h(reg0, reg3);
+  reg0 = __lsx_vsub_h(reg0, reg3);
+  reg3 = __lsx_vadd_h(reg7, reg4);
+  reg7 = __lsx_vsub_h(reg7, reg4);
+  reg4 = __lsx_vadd_h(reg1, reg2);
+  reg1 = __lsx_vsub_h(reg1, reg2);
+  reg2 = __lsx_vadd_h(reg6, reg5);
+  reg6 = __lsx_vsub_h(reg6, reg5);
+  reg5 = vec0;
+
+  /* 4 Stores */
+  DUP2_ARG2(__lsx_vadd_h, reg5, reg4, reg3, reg2, vec0, vec1);
+  __lsx_vst(vec0, tmp_odd_buf, 64);
+  __lsx_vst(vec1, tmp_odd_buf, 80);
+
+  DUP2_ARG2(__lsx_vsub_h, reg5, reg4, reg3, reg2, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
+  __lsx_vst(vec0, tmp_odd_buf, 0);
+  __lsx_vst(vec1, tmp_odd_buf, 16);
+
+  /* 4 Stores */
+  DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
+  DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
+  LSX_BUTTERFLY_4_H(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
+  __lsx_vst(vec0, tmp_odd_buf, 96);
+  __lsx_vst(vec1, tmp_odd_buf, 112);
+
+  DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
+  __lsx_vst(vec2, tmp_odd_buf, 32);
+  __lsx_vst(vec3, tmp_odd_buf, 48);
+
+  /* Odd stage 2 */
+  /* 8 loads */
+  DUP4_ARG2(__lsx_vld, tmp_buf, 48, tmp_buf, 80, tmp_buf, 176, tmp_buf, 208,
+            reg0, reg1, reg2, reg3);
+  DUP4_ARG2(__lsx_vld, tmp_buf, 304, tmp_buf, 336, tmp_buf, 432, tmp_buf, 464,
+            reg4, reg5, reg6, reg7);
+
+  DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
+  DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
+  DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
+  DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
+
+  /* 4 Stores */
+  DUP4_ARG2(__lsx_vsub_h, reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0,
+            vec1, vec2, vec3);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
+  DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
+
+  LSX_BUTTERFLY_4_H(loc3, loc2, loc0, loc1, vec1, vec0, vec2, vec3);
+  __lsx_vst(vec0, tmp_odd_buf, 192);
+  __lsx_vst(vec1, tmp_odd_buf, 240);
+
+  DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
+  __lsx_vst(vec0, tmp_odd_buf, 160);
+  __lsx_vst(vec1, tmp_odd_buf, 176);
+
+  /* 4 Stores */
+  DUP4_ARG2(__lsx_vadd_h, reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec1,
+            vec2, vec0, vec3);
+  LSX_BUTTERFLY_4_H(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
+  __lsx_vst(reg0, tmp_odd_buf, 208);
+  __lsx_vst(reg1, tmp_odd_buf, 224);
+
+  DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
+  __lsx_vst(reg0, tmp_odd_buf, 128);
+  __lsx_vst(reg1, tmp_odd_buf, 144);
+
+  /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */
+
+  /* Load 8 & Store 8 */
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 16, tmp_odd_buf, 32,
+            tmp_odd_buf, 48, reg0, reg1, reg2, reg3);
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 128, tmp_odd_buf, 144, tmp_odd_buf, 160,
+            tmp_odd_buf, 176, reg4, reg5, reg6, reg7);
+  DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0,
+            loc1, loc2, loc3);
+  __lsx_vst(loc0, tmp_odd_buf, 0);
+  __lsx_vst(loc1, tmp_odd_buf, 16);
+  __lsx_vst(loc2, tmp_odd_buf, 32);
+  __lsx_vst(loc3, tmp_odd_buf, 48);
+
+  DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg1, reg5, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+
+  DUP2_ARG2(__lsx_vsub_h, reg2, reg6, reg3, reg7, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+  __lsx_vst(loc0, tmp_odd_buf, 128);
+  __lsx_vst(loc1, tmp_odd_buf, 144);
+  __lsx_vst(loc2, tmp_odd_buf, 160);
+  __lsx_vst(loc3, tmp_odd_buf, 176);
+
+  /* Load 8 & Store 8 */
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 80, tmp_odd_buf, 96,
+            tmp_odd_buf, 112, reg1, reg2, reg0, reg3);
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 192, tmp_odd_buf, 208, tmp_odd_buf, 224,
+            tmp_odd_buf, 240, reg4, reg5, reg6, reg7);
+
+  DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0,
+            loc1, loc2, loc3);
+  __lsx_vst(loc0, tmp_odd_buf, 64);
+  __lsx_vst(loc1, tmp_odd_buf, 80);
+  __lsx_vst(loc2, tmp_odd_buf, 96);
+  __lsx_vst(loc3, tmp_odd_buf, 112);
+
+  DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg3, reg7, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+  DUP2_ARG2(__lsx_vsub_h, reg1, reg5, reg2, reg6, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+  __lsx_vst(loc0, tmp_odd_buf, 192);
+  __lsx_vst(loc1, tmp_odd_buf, 208);
+  __lsx_vst(loc2, tmp_odd_buf, 224);
+  __lsx_vst(loc3, tmp_odd_buf, 240);
+}
+
+static void idct_butterfly_transpose_store(int16_t *tmp_buf,
+                                           int16_t *tmp_eve_buf,
+                                           int16_t *tmp_odd_buf, int16_t *dst) {
+  __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+  __m128i m0, m1, m2, m3, m4, m5, m6, m7;
+  __m128i n0, n1, n2, n3, n4, n5, n6, n7;
+  __m128i reg0, reg1, reg2, reg3;
+
+  /* FINAL BUTTERFLY : Dependency on Even & Odd */
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 144, tmp_odd_buf, 224,
+            tmp_odd_buf, 96, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vld, tmp_eve_buf, 0, tmp_eve_buf, 128, tmp_eve_buf, 64,
+            tmp_eve_buf, 192, loc0, loc1, loc2, loc3);
+
+  DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0,
+            m4, m2, m6);
+  DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0,
+            reg1, reg2, reg3);
+  __lsx_vst(reg0, tmp_buf, 496);
+  __lsx_vst(reg1, tmp_buf, 368);
+  __lsx_vst(reg2, tmp_buf, 432);
+  __lsx_vst(reg3, tmp_buf, 304);
+
+  /* Load 8 & Store 8 */
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 208, tmp_odd_buf, 160,
+            tmp_odd_buf, 48, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vld, tmp_eve_buf, 32, tmp_eve_buf, 160, tmp_eve_buf, 96,
+            tmp_eve_buf, 224, loc0, loc1, loc2, loc3);
+
+  DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1,
+            m5, m3, m7);
+  DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0,
+            reg1, reg2, reg3);
+  __lsx_vst(reg0, tmp_buf, 464);
+  __lsx_vst(reg1, tmp_buf, 336);
+  __lsx_vst(reg2, tmp_buf, 400);
+  __lsx_vst(reg3, tmp_buf, 272);
+
+  /* Load 8 & Store 8 */
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 32, tmp_odd_buf, 176, tmp_odd_buf, 192,
+            tmp_odd_buf, 112, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vld, tmp_eve_buf, 16, tmp_eve_buf, 144, tmp_eve_buf, 80,
+            tmp_eve_buf, 208, loc0, loc1, loc2, loc3);
+
+  DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0,
+            n4, n2, n6);
+  DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0,
+            reg1, reg2, reg3);
+  __lsx_vst(reg0, tmp_buf, 480);
+  __lsx_vst(reg1, tmp_buf, 352);
+  __lsx_vst(reg2, tmp_buf, 416);
+  __lsx_vst(reg3, tmp_buf, 288);
+
+  /* Load 8 & Store 8 */
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 80, tmp_odd_buf, 240, tmp_odd_buf, 128,
+            tmp_odd_buf, 16, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vld, tmp_eve_buf, 48, tmp_eve_buf, 176, tmp_eve_buf, 112,
+            tmp_eve_buf, 240, loc0, loc1, loc2, loc3);
+  DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1,
+            n5, n3, n7);
+  DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0,
+            reg1, reg2, reg3);
+  __lsx_vst(reg0, tmp_buf, 448);
+  __lsx_vst(reg1, tmp_buf, 320);
+  __lsx_vst(reg2, tmp_buf, 384);
+  __lsx_vst(reg3, tmp_buf, 256);
+
+  /* Transpose : 16 vectors */
+  /* 1st & 2nd 8x8 */
+  LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
+                     n3);
+  __lsx_vst(m0, dst, 0);
+  __lsx_vst(n0, dst, 64);
+  __lsx_vst(m1, dst, 128);
+  __lsx_vst(n1, dst, 192);
+  __lsx_vst(m2, dst, 256);
+  __lsx_vst(n2, dst, 320);
+  __lsx_vst(m3, dst, 384);
+  __lsx_vst(n3, dst, 448);
+
+  LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
+                     n7);
+  __lsx_vst(m4, dst, 16);
+  __lsx_vst(n4, dst, 80);
+  __lsx_vst(m5, dst, 144);
+  __lsx_vst(n5, dst, 208);
+  __lsx_vst(m6, dst, 272);
+  __lsx_vst(n6, dst, 336);
+  __lsx_vst(m7, dst, 400);
+  __lsx_vst(n7, dst, 464);
+
+  /* 3rd & 4th 8x8 */
+  DUP4_ARG2(__lsx_vld, tmp_buf, 256, tmp_buf, 272, tmp_buf, 288, tmp_buf, 304,
+            m0, n0, m1, n1);
+  DUP4_ARG2(__lsx_vld, tmp_buf, 320, tmp_buf, 336, tmp_buf, 352, tmp_buf, 368,
+            m2, n2, m3, n3);
+  DUP4_ARG2(__lsx_vld, tmp_buf, 384, tmp_buf, 400, tmp_buf, 416, tmp_buf, 432,
+            m4, n4, m5, n5);
+  DUP4_ARG2(__lsx_vld, tmp_buf, 448, tmp_buf, 464, tmp_buf, 480, tmp_buf, 496,
+            m6, n6, m7, n7);
+  LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
+                     n3);
+  LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
+                     n7);
+  __lsx_vst(m0, dst, 32);
+  __lsx_vst(n0, dst, 96);
+  __lsx_vst(m1, dst, 160);
+  __lsx_vst(n1, dst, 224);
+  __lsx_vst(m2, dst, 288);
+  __lsx_vst(n2, dst, 352);
+  __lsx_vst(m3, dst, 416);
+  __lsx_vst(n3, dst, 480);
+  __lsx_vst(m4, dst, 48);
+  __lsx_vst(n4, dst, 112);
+  __lsx_vst(m5, dst, 176);
+  __lsx_vst(n5, dst, 240);
+  __lsx_vst(m6, dst, 304);
+  __lsx_vst(n6, dst, 368);
+  __lsx_vst(m7, dst, 432);
+  __lsx_vst(n7, dst, 496);
+}
+
+static void idct32x8_1d_rows_lsx(const int16_t *input, int16_t *output) {
+  DECLARE_ALIGNED(32, int16_t, tmp_buf[8 * 32]);
+  DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]);
+  DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]);
+
+  idct32x8_row_transpose_store(input, &tmp_buf[0]);
+  idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]);
+  idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]);
+  idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0], &tmp_odd_buf[0],
+                                 output);
+}
+
+static void idct8x32_column_even_process_store(int16_t *tmp_buf,
+                                               int16_t *tmp_eve_buf) {
+  __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  __m128i stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;
+  __m128i tmp0;
+
+  /* Even stage 1 */
+  DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 256, tmp_buf, 512, tmp_buf, 768,
+            reg0, reg1, reg2, reg3);
+  DUP4_ARG2(__lsx_vld, tmp_buf, 1024, tmp_buf, 1280, tmp_buf, 1536, tmp_buf,
+            1792, reg4, reg5, reg6, reg7);
+  tmp_buf += 64;
+
+  DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
+  DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
+  LSX_BUTTERFLY_4_H(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
+  DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+
+  loc1 = vec3;
+  loc0 = vec1;
+
+  DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
+  DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
+  LSX_BUTTERFLY_4_H(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
+  LSX_BUTTERFLY_4_H(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
+  LSX_BUTTERFLY_4_H(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);
+
+  /* Even stage 2 */
+  /* Load 8 */
+  DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 256, tmp_buf, 512, tmp_buf, 768,
+            reg0, reg1, reg2, reg3);
+  DUP4_ARG2(__lsx_vld, tmp_buf, 1024, tmp_buf, 1280, tmp_buf, 1536, tmp_buf,
+            1792, reg4, reg5, reg6, reg7);
+  DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
+  DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
+  DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
+  DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
+
+  vec0 = __lsx_vadd_h(reg0, reg4);
+  reg0 = __lsx_vsub_h(reg0, reg4);
+  reg4 = __lsx_vadd_h(reg6, reg2);
+  reg6 = __lsx_vsub_h(reg6, reg2);
+  reg2 = __lsx_vadd_h(reg1, reg5);
+  reg1 = __lsx_vsub_h(reg1, reg5);
+  reg5 = __lsx_vadd_h(reg7, reg3);
+  reg7 = __lsx_vsub_h(reg7, reg3);
+  reg3 = vec0;
+
+  vec1 = reg2;
+  reg2 = __lsx_vadd_h(reg3, reg4);
+  reg3 = __lsx_vsub_h(reg3, reg4);
+  reg4 = __lsx_vsub_h(reg5, vec1);
+  reg5 = __lsx_vadd_h(reg5, vec1);
+
+  tmp0 = __lsx_vneg_h(reg6);
+  DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
+  DOTP_CONST_PAIR(tmp0, reg1, cospi_24_64, cospi_8_64, reg6, reg1);
+
+  vec0 = __lsx_vsub_h(reg0, reg6);
+  reg0 = __lsx_vadd_h(reg0, reg6);
+  vec1 = __lsx_vsub_h(reg7, reg1);
+  reg7 = __lsx_vadd_h(reg7, reg1);
+
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
+  DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
+
+  /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */
+  /* Store 8 */
+  LSX_BUTTERFLY_4_H(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0);
+  __lsx_vst(loc1, tmp_eve_buf, 0);
+  __lsx_vst(loc3, tmp_eve_buf, 16);
+  __lsx_vst(loc2, tmp_eve_buf, 224);
+  __lsx_vst(loc0, tmp_eve_buf, 240);
+
+  LSX_BUTTERFLY_4_H(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0);
+  __lsx_vst(loc1, tmp_eve_buf, 32);
+  __lsx_vst(loc3, tmp_eve_buf, 48);
+  __lsx_vst(loc2, tmp_eve_buf, 192);
+  __lsx_vst(loc0, tmp_eve_buf, 208);
+
+  /* Store 8 */
+  LSX_BUTTERFLY_4_H(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0);
+  __lsx_vst(loc1, tmp_eve_buf, 64);
+  __lsx_vst(loc3, tmp_eve_buf, 80);
+  __lsx_vst(loc2, tmp_eve_buf, 160);
+  __lsx_vst(loc0, tmp_eve_buf, 176);
+
+  LSX_BUTTERFLY_4_H(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0);
+  __lsx_vst(loc1, tmp_eve_buf, 96);
+  __lsx_vst(loc3, tmp_eve_buf, 112);
+  __lsx_vst(loc2, tmp_eve_buf, 128);
+  __lsx_vst(loc0, tmp_eve_buf, 144);
+}
+
+static void idct8x32_column_odd_process_store(int16_t *tmp_buf,
+                                              int16_t *tmp_odd_buf) {
+  __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+
+  /* Odd stage 1 */
+  DUP4_ARG2(__lsx_vld, tmp_buf, 64, tmp_buf, 448, tmp_buf, 576, tmp_buf, 960,
+            reg0, reg1, reg2, reg3);
+  DUP4_ARG2(__lsx_vld, tmp_buf, 1088, tmp_buf, 1472, tmp_buf, 1600, tmp_buf,
+            1984, reg4, reg5, reg6, reg7);
+
+  DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
+  DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
+  DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
+  DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
+
+  vec0 = __lsx_vadd_h(reg0, reg3);
+  reg0 = __lsx_vsub_h(reg0, reg3);
+  reg3 = __lsx_vadd_h(reg7, reg4);
+  reg7 = __lsx_vsub_h(reg7, reg4);
+  reg4 = __lsx_vadd_h(reg1, reg2);
+  reg1 = __lsx_vsub_h(reg1, reg2);
+  reg2 = __lsx_vadd_h(reg6, reg5);
+  reg6 = __lsx_vsub_h(reg6, reg5);
+  reg5 = vec0;
+
+  /* 4 Stores */
+  DUP2_ARG2(__lsx_vadd_h, reg5, reg4, reg3, reg2, vec0, vec1);
+  __lsx_vst(vec0, tmp_odd_buf, 64);
+  __lsx_vst(vec1, tmp_odd_buf, 80);
+  DUP2_ARG2(__lsx_vsub_h, reg5, reg4, reg3, reg2, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
+  __lsx_vst(vec0, tmp_odd_buf, 0);
+  __lsx_vst(vec1, tmp_odd_buf, 16);
+
+  /* 4 Stores */
+  DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
+  DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
+  LSX_BUTTERFLY_4_H(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
+  DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
+  __lsx_vst(vec0, tmp_odd_buf, 96);
+  __lsx_vst(vec1, tmp_odd_buf, 112);
+  __lsx_vst(vec2, tmp_odd_buf, 32);
+  __lsx_vst(vec3, tmp_odd_buf, 48);
+
+  /* Odd stage 2 */
+  /* 8 loads */
+  DUP4_ARG2(__lsx_vld, tmp_buf, 192, tmp_buf, 320, tmp_buf, 704, tmp_buf, 832,
+            reg0, reg1, reg2, reg3);
+  DUP4_ARG2(__lsx_vld, tmp_buf, 1216, tmp_buf, 1344, tmp_buf, 1728, tmp_buf,
+            1856, reg4, reg5, reg6, reg7);
+  DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
+  DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
+  DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
+  DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
+
+  /* 4 Stores */
+  DUP4_ARG2(__lsx_vsub_h, reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0,
+            vec1, vec2, vec3);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
+  DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
+  LSX_BUTTERFLY_4_H(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2);
+  __lsx_vst(vec0, tmp_odd_buf, 192);
+  __lsx_vst(vec1, tmp_odd_buf, 240);
+  DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
+  __lsx_vst(vec0, tmp_odd_buf, 160);
+  __lsx_vst(vec1, tmp_odd_buf, 176);
+
+  /* 4 Stores */
+  DUP4_ARG2(__lsx_vadd_h, reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7, vec0,
+            vec1, vec2, vec3);
+  LSX_BUTTERFLY_4_H(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
+  __lsx_vst(reg0, tmp_odd_buf, 208);
+  __lsx_vst(reg1, tmp_odd_buf, 224);
+  DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
+  __lsx_vst(reg0, tmp_odd_buf, 128);
+  __lsx_vst(reg1, tmp_odd_buf, 144);
+
+  /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */
+  /* Load 8 & Store 8 */
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 16, tmp_odd_buf, 32,
+            tmp_odd_buf, 48, reg0, reg1, reg2, reg3);
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 128, tmp_odd_buf, 144, tmp_odd_buf, 160,
+            tmp_odd_buf, 176, reg4, reg5, reg6, reg7);
+  DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0,
+            loc1, loc2, loc3);
+  __lsx_vst(loc0, tmp_odd_buf, 0);
+  __lsx_vst(loc1, tmp_odd_buf, 16);
+  __lsx_vst(loc2, tmp_odd_buf, 32);
+  __lsx_vst(loc3, tmp_odd_buf, 48);
+
+  DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg1, reg5, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+  DUP2_ARG2(__lsx_vsub_h, reg2, reg6, reg3, reg7, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+  __lsx_vst(loc0, tmp_odd_buf, 128);
+  __lsx_vst(loc1, tmp_odd_buf, 144);
+  __lsx_vst(loc2, tmp_odd_buf, 160);
+  __lsx_vst(loc3, tmp_odd_buf, 176);
+
+  /* Load 8 & Store 8 */
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 80, tmp_odd_buf, 96,
+            tmp_odd_buf, 112, reg1, reg2, reg0, reg3);
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 192, tmp_odd_buf, 208, tmp_odd_buf, 224,
+            tmp_odd_buf, 240, reg4, reg5, reg6, reg7);
+  DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0,
+            loc1, loc2, loc3);
+  __lsx_vst(loc0, tmp_odd_buf, 64);
+  __lsx_vst(loc1, tmp_odd_buf, 80);
+  __lsx_vst(loc2, tmp_odd_buf, 96);
+  __lsx_vst(loc3, tmp_odd_buf, 112);
+
+  DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg3, reg7, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+  DUP2_ARG2(__lsx_vsub_h, reg1, reg5, reg2, reg6, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+  __lsx_vst(loc0, tmp_odd_buf, 192);
+  __lsx_vst(loc1, tmp_odd_buf, 208);
+  __lsx_vst(loc2, tmp_odd_buf, 224);
+  __lsx_vst(loc3, tmp_odd_buf, 240);
+}
+
+static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
+                                             int16_t *tmp_odd_buf, uint8_t *dst,
+                                             int32_t dst_stride) {
+  __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+  __m128i m0, m1, m2, m3, m4, m5, m6, m7;
+  __m128i n0, n1, n2, n3, n4, n5, n6, n7;
+  int32_t stride = dst_stride << 2;
+  int32_t stride2 = stride << 1;
+  int32_t stride3 = stride + stride2;
+
+  /* FINAL BUTTERFLY : Dependency on Even & Odd */
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 144, tmp_odd_buf, 224,
+            tmp_odd_buf, 96, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vld, tmp_eve_buf, 0, tmp_eve_buf, 128, tmp_eve_buf, 64,
+            tmp_eve_buf, 192, loc0, loc1, loc2, loc3);
+
+  DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0,
+            m4, m2, m6);
+  DUP4_ARG2(__lsx_vsrari_h, m0, 6, m2, 6, m4, 6, m6, 6, m0, m2, m4, m6);
+  VP9_ADDBLK_ST8x4_UB(dst, stride, stride2, stride3, m0, m2, m4, m6);
+  DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6,
+            m2, m4, m0);
+  DUP4_ARG2(__lsx_vsrari_h, m0, 6, m2, 6, m4, 6, m6, 6, m0, m2, m4, m6);
+  VP9_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), stride, stride2, stride3, m0, m2,
+                      m4, m6);
+
+  /* Load 8 & Store 8 */
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 208, tmp_odd_buf, 160,
+            tmp_odd_buf, 48, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vld, tmp_eve_buf, 32, tmp_eve_buf, 160, tmp_eve_buf, 96,
+            tmp_eve_buf, 224, loc0, loc1, loc2, loc3);
+
+  DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1,
+            m5, m3, m7);
+  DUP4_ARG2(__lsx_vsrari_h, m1, 6, m3, 6, m5, 6, m7, 6, m1, m3, m5, m7);
+  VP9_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), stride, stride2, stride3, m1, m3,
+                      m5, m7);
+  DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7,
+            m3, m5, m1);
+  DUP4_ARG2(__lsx_vsrari_h, m1, 6, m3, 6, m5, 6, m7, 6, m1, m3, m5, m7);
+  VP9_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), stride, stride2, stride3, m1, m3,
+                      m5, m7);
+
+  /* Load 8 & Store 8 */
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 32, tmp_odd_buf, 176, tmp_odd_buf, 192,
+            tmp_odd_buf, 112, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vld, tmp_eve_buf, 16, tmp_eve_buf, 144, tmp_eve_buf, 80,
+            tmp_eve_buf, 208, loc0, loc1, loc2, loc3);
+  DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0,
+            n4, n2, n6);
+  DUP4_ARG2(__lsx_vsrari_h, n0, 6, n2, 6, n4, 6, n6, 6, n0, n2, n4, n6);
+  VP9_ADDBLK_ST8x4_UB((dst + dst_stride), stride, stride2, stride3, n0, n2, n4,
+                      n6);
+  DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6,
+            n2, n4, n0);
+  DUP4_ARG2(__lsx_vsrari_h, n0, 6, n2, 6, n4, 6, n6, 6, n0, n2, n4, n6);
+  VP9_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), stride, stride2, stride3, n0, n2,
+                      n4, n6);
+
+  /* Load 8 & Store 8 */
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 80, tmp_odd_buf, 240, tmp_odd_buf, 128,
+            tmp_odd_buf, 16, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vld, tmp_eve_buf, 48, tmp_eve_buf, 176, tmp_eve_buf, 112,
+            tmp_eve_buf, 240, loc0, loc1, loc2, loc3);
+  DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1,
+            n5, n3, n7);
+  DUP4_ARG2(__lsx_vsrari_h, n1, 6, n3, 6, n5, 6, n7, 6, n1, n3, n5, n7);
+  VP9_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), stride, stride2, stride3, n1, n3,
+                      n5, n7);
+  DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7,
+            n3, n5, n1);
+  DUP4_ARG2(__lsx_vsrari_h, n1, 6, n3, 6, n5, 6, n7, 6, n1, n3, n5, n7);
+  VP9_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), stride, stride2, stride3, n1, n3,
+                      n5, n7);
+}
+
+static void idct8x32_1d_columns_addblk_lsx(int16_t *input, uint8_t *dst,
+                                           int32_t dst_stride) {
+  DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]);
+  DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]);
+
+  idct8x32_column_even_process_store(input, &tmp_eve_buf[0]);
+  idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]);
+  idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], dst,
+                                   dst_stride);
+}
+
+void vpx_idct32x32_1024_add_lsx(const int16_t *input, uint8_t *dst,
+                                int32_t dst_stride) {
+  int32_t i;
+  DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]);
+  int16_t *out_ptr = out_arr;
+
+  /* transform rows */
+  for (i = 0; i < 4; ++i) {
+    /* process 32 * 8 block */
+    idct32x8_1d_rows_lsx((input + (i << 8)), (out_ptr + (i << 8)));
+  }
+
+  for (i = 0; i < 4; ++i) {
+    /* process 8 * 32 block */
+    idct8x32_1d_columns_addblk_lsx((out_ptr + (i << 3)), (dst + (i << 3)),
+                                   dst_stride);
+  }
+}
+
+void vpx_idct32x32_34_add_lsx(const int16_t *input, uint8_t *dst,
+                              int32_t dst_stride) {
+  int32_t i;
+  DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]);
+  int16_t *out_ptr = out_arr;
+  __m128i zero = __lsx_vldi(0);
+
+  for (i = 32; i--;) {
+    __lsx_vst(zero, out_ptr, 0);
+    __lsx_vst(zero, out_ptr, 16);
+    __lsx_vst(zero, out_ptr, 32);
+    __lsx_vst(zero, out_ptr, 48);
+    out_ptr += 32;
+  }
+
+  out_ptr = out_arr;
+
+  /* rows: only upper-left 8x8 has non-zero coeff */
+  idct32x8_1d_rows_lsx(input, out_ptr);
+
+  /* transform columns */
+  for (i = 0; i < 4; ++i) {
+    /* process 8 * 32 block */
+    idct8x32_1d_columns_addblk_lsx((out_ptr + (i << 3)), (dst + (i << 3)),
+                                   dst_stride);
+  }
+}
+
+void vpx_idct32x32_1_add_lsx(const int16_t *input, uint8_t *dst,
+                             int32_t dst_stride) {
+  int32_t i;
+  int16_t out;
+  __m128i dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
+  __m128i res0, res1, res2, res3, res4, res5, res6, res7, vec;
+
+  out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
+  out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS);
+  out = ROUND_POWER_OF_TWO(out, 6);
+
+  vec = __lsx_vreplgr2vr_h(out);
+
+  for (i = 16; i--;) {
+    DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1);
+    dst2 = __lsx_vldx(dst, dst_stride);
+    dst3 = __lsx_vldx(dst + 16, dst_stride);
+
+    UNPCK_UB_SH(dst0, res0, res4);
+    UNPCK_UB_SH(dst1, res1, res5);
+    UNPCK_UB_SH(dst2, res2, res6);
+    UNPCK_UB_SH(dst3, res3, res7);
+
+    DUP4_ARG2(__lsx_vadd_h, res0, vec, res1, vec, res2, vec, res3, vec, res0,
+              res1, res2, res3);
+    DUP4_ARG2(__lsx_vadd_h, res4, vec, res5, vec, res6, vec, res7, vec, res4,
+              res5, res6, res7);
+    DUP4_ARG3(__lsx_vssrarni_bu_h, res4, res0, 0, res5, res1, 0, res6, res2, 0,
+              res7, res3, 0, tmp0, tmp1, tmp2, tmp3);
+    __lsx_vst(tmp0, dst, 0);
+    __lsx_vst(tmp1, dst, 16);
+    dst += dst_stride;
+    __lsx_vst(tmp2, dst, 0);
+    __lsx_vst(tmp3, dst, 16);
+    dst += dst_stride;
+  }
+}
diff --git a/vpx_dsp/loongarch/intrapred_lsx.c b/vpx_dsp/loongarch/intrapred_lsx.c
new file mode 100644
index 0000000000..f990211791
--- /dev/null
+++ b/vpx_dsp/loongarch/intrapred_lsx.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS.  All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+static inline void intra_predict_dc_8x8_lsx(const uint8_t *src_top,
+                                            const uint8_t *src_left,
+                                            uint8_t *dst, int32_t dst_stride) {
+  uint64_t val0, val1;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i store, sum_h, sum_w, sum_d;
+  __m128i src = { 0 };
+
+  val0 = *(const uint64_t *)src_top;
+  val1 = *(const uint64_t *)src_left;
+  DUP2_ARG3(__lsx_vinsgr2vr_d, src, val0, 0, src, val1, 1, src, src);
+  sum_h = __lsx_vhaddw_hu_bu(src, src);
+  sum_w = __lsx_vhaddw_wu_hu(sum_h, sum_h);
+  sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w);
+  sum_w = __lsx_vpickev_w(sum_d, sum_d);
+  sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w);
+  sum_w = __lsx_vsrari_w(sum_d, 4);
+  store = __lsx_vreplvei_b(sum_w, 0);
+
+  __lsx_vstelm_d(store, dst, 0, 0);
+  __lsx_vstelm_d(store, dst + dst_stride, 0, 0);
+  __lsx_vstelm_d(store, dst + dst_stride_x2, 0, 0);
+  __lsx_vstelm_d(store, dst + dst_stride_x3, 0, 0);
+  dst += dst_stride_x4;
+  __lsx_vstelm_d(store, dst, 0, 0);
+  __lsx_vstelm_d(store, dst + dst_stride, 0, 0);
+  __lsx_vstelm_d(store, dst + dst_stride_x2, 0, 0);
+  __lsx_vstelm_d(store, dst + dst_stride_x3, 0, 0);
+}
+
+static inline void intra_predict_dc_16x16_lsx(const uint8_t *src_top,
+                                              const uint8_t *src_left,
+                                              uint8_t *dst,
+                                              int32_t dst_stride) {
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i top, left, out;
+  __m128i sum_h, sum_top, sum_left;
+  __m128i sum_w;
+  __m128i sum_d;
+
+  DUP2_ARG2(__lsx_vld, src_top, 0, src_left, 0, top, left);
+  DUP2_ARG2(__lsx_vhaddw_hu_bu, top, top, left, left, sum_top, sum_left);
+  sum_h = __lsx_vadd_h(sum_top, sum_left);
+  sum_w = __lsx_vhaddw_wu_hu(sum_h, sum_h);
+  sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w);
+  sum_w = __lsx_vpickev_w(sum_d, sum_d);
+  sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w);
+  sum_w = __lsx_vsrari_w(sum_d, 5);
+  out = __lsx_vreplvei_b(sum_w, 0);
+
+  __lsx_vstx(out, dst, 0);
+  __lsx_vstx(out, dst, dst_stride);
+  __lsx_vstx(out, dst, dst_stride_x2);
+  __lsx_vstx(out, dst, dst_stride_x3);
+  dst += dst_stride_x4;
+  __lsx_vstx(out, dst, 0);
+  __lsx_vstx(out, dst, dst_stride);
+  __lsx_vstx(out, dst, dst_stride_x2);
+  __lsx_vstx(out, dst, dst_stride_x3);
+  dst += dst_stride_x4;
+  __lsx_vstx(out, dst, 0);
+  __lsx_vstx(out, dst, dst_stride);
+  __lsx_vstx(out, dst, dst_stride_x2);
+  __lsx_vstx(out, dst, dst_stride_x3);
+  dst += dst_stride_x4;
+  __lsx_vstx(out, dst, 0);
+  __lsx_vstx(out, dst, dst_stride);
+  __lsx_vstx(out, dst, dst_stride_x2);
+  __lsx_vstx(out, dst, dst_stride_x3);
+}
+
+void vpx_dc_predictor_8x8_lsx(uint8_t *dst, ptrdiff_t y_stride,
+                              const uint8_t *above, const uint8_t *left) {
+  intra_predict_dc_8x8_lsx(above, left, dst, y_stride);
+}
+
+void vpx_dc_predictor_16x16_lsx(uint8_t *dst, ptrdiff_t y_stride,
+                                const uint8_t *above, const uint8_t *left) {
+  intra_predict_dc_16x16_lsx(above, left, dst, y_stride);
+}
diff --git a/vpx_dsp/loongarch/loopfilter_16_lsx.c b/vpx_dsp/loongarch/loopfilter_16_lsx.c
new file mode 100644
index 0000000000..0503df9966
--- /dev/null
+++ b/vpx_dsp/loongarch/loopfilter_16_lsx.c
@@ -0,0 +1,1320 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Hecai Yuan <yuanhecai@loongson.cn>
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS.  All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/loopfilter_lsx.h"
+#include "vpx_ports/mem.h"
+
+#define LSX_LD_8(_src, _stride, _stride2, _stride3, _stride4, _in0, _in1, \
+                 _in2, _in3, _in4, _in5, _in6, _in7)                      \
+  do {                                                                    \
+    _in0 = __lsx_vld(_src, 0);                                            \
+    _in1 = __lsx_vldx(_src, _stride);                                     \
+    _in2 = __lsx_vldx(_src, _stride2);                                    \
+    _in3 = __lsx_vldx(_src, _stride3);                                    \
+    _src += _stride4;                                                     \
+    _in4 = __lsx_vld(_src, 0);                                            \
+    _in5 = __lsx_vldx(_src, _stride);                                     \
+    _in6 = __lsx_vldx(_src, _stride2);                                    \
+    _in7 = __lsx_vldx(_src, _stride3);                                    \
+  } while (0)
+
+#define LSX_ST_8(_dst0, _dst1, _dst2, _dst3, _dst4, _dst5, _dst6, _dst7, _dst, \
+                 _stride, _stride2, _stride3, _stride4)                        \
+  do {                                                                         \
+    __lsx_vst(_dst0, _dst, 0);                                                 \
+    __lsx_vstx(_dst1, _dst, _stride);                                          \
+    __lsx_vstx(_dst2, _dst, _stride2);                                         \
+    __lsx_vstx(_dst3, _dst, _stride3);                                         \
+    _dst += _stride4;                                                          \
+    __lsx_vst(_dst4, _dst, 0);                                                 \
+    __lsx_vstx(_dst5, _dst, _stride);                                          \
+    __lsx_vstx(_dst6, _dst, _stride2);                                         \
+    __lsx_vstx(_dst7, _dst, _stride3);                                         \
+  } while (0)
+
+static int32_t hz_lpf_t4_and_t8_16w(uint8_t *dst, int32_t stride,
+                                    uint8_t *filter48,
+                                    const uint8_t *b_limit_ptr,
+                                    const uint8_t *limit_ptr,
+                                    const uint8_t *thresh_ptr) {
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+  __m128i flat, mask, hev, thresh, b_limit, limit;
+  __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+  __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
+  __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
+  __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
+  __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
+  __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
+
+  int32_t stride2 = stride << 1;
+  int32_t stride3 = stride2 + stride;
+  int32_t stride4 = stride2 << 1;
+
+  /* load vector elements */
+  DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst,
+            -stride, p3, p2, p1, p0);
+
+  q0 = __lsx_vld(dst, 0);
+  DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
+  q3 = __lsx_vldx(dst, stride3);
+
+  thresh = __lsx_vldrepl_b(thresh_ptr, 0);
+  b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
+  limit = __lsx_vldrepl_b(limit_ptr, 0);
+  /* mask and hev */
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+  if (__lsx_bz_v(flat)) {
+    __lsx_vstx(p1_out, dst, -stride2);
+    __lsx_vstx(p0_out, dst, -stride);
+    __lsx_vst(q0_out, dst, 0);
+    __lsx_vstx(q1_out, dst, stride);
+
+    return 1;
+  }
+
+  DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l,
+            p0_l);
+  DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l,
+            q3_l);
+
+  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+              p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+  DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h);
+  DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h);
+  VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
+              p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
+
+  /* convert 16 bit output data into 8 bit */
+  DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l,
+            p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l,
+            p1_filt8_l, p0_filt8_l, q0_filt8_l);
+  DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l,
+            q1_filt8_l, q2_filt8_l);
+
+  /* store pixel values */
+  DUP4_ARG3(__lsx_vbitsel_v, p2, p2_filt8_l, flat, p1_out, p1_filt8_l, flat,
+            p0_out, p0_filt8_l, flat, q0_out, q0_filt8_l, flat, p2_out, p1_out,
+            p0_out, q0_out);
+  DUP2_ARG3(__lsx_vbitsel_v, q1_out, q1_filt8_l, flat, q2, q2_filt8_l, flat,
+            q1_out, q2_out);
+
+  __lsx_vst(p2_out, filter48, 0);
+  __lsx_vst(p1_out, filter48, 16);
+  __lsx_vst(p0_out, filter48, 32);
+  __lsx_vst(q0_out, filter48, 48);
+  __lsx_vst(q1_out, filter48, 64);
+  __lsx_vst(q2_out, filter48, 80);
+  __lsx_vst(flat, filter48, 96);
+
+  return 0;
+}
+
+static void hz_lpf_t16_16w(uint8_t *dst, int32_t stride, uint8_t *filter48) {
+  int32_t stride2 = stride << 1;
+  int32_t stride3 = stride2 + stride;
+  int32_t stride4 = stride2 << 1;
+  uint8_t *dst_tmp0 = dst - stride4;
+  uint8_t *dst_tmp1 = dst + stride4;
+
+  __m128i flat, flat2, filter8;
+  __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+  __m128i out_h, out_l;
+  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
+  v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
+  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
+  v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
+  v8u16 p7_h_in, p6_h_in, p5_h_in, p4_h_in;
+  v8u16 p3_h_in, p2_h_in, p1_h_in, p0_h_in;
+  v8u16 q7_h_in, q6_h_in, q5_h_in, q4_h_in;
+  v8u16 q3_h_in, q2_h_in, q1_h_in, q0_h_in;
+  v8u16 tmp0_l, tmp1_l, tmp0_h, tmp1_h;
+
+  flat = __lsx_vld(filter48, 96);
+
+  DUP4_ARG2(__lsx_vldx, dst_tmp0, -stride4, dst_tmp0, -stride3, dst_tmp0,
+            -stride2, dst_tmp0, -stride, p7, p6, p5, p4);
+
+  p3 = __lsx_vld(dst_tmp0, 0);
+  DUP2_ARG2(__lsx_vldx, dst_tmp0, stride, dst_tmp0, stride2, p2, p1);
+  p0 = __lsx_vldx(dst_tmp0, stride3);
+
+  q0 = __lsx_vld(dst, 0);
+  DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
+  q3 = __lsx_vldx(dst, stride3);
+
+  q4 = __lsx_vld(dst_tmp1, 0);
+  DUP2_ARG2(__lsx_vldx, dst_tmp1, stride, dst_tmp1, stride2, q5, q6);
+  q7 = __lsx_vldx(dst_tmp1, stride3);
+
+  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+  if (__lsx_bz_v(flat2)) {
+    DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32, filter48, 48,
+              p2, p1, p0, q0);
+    DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2);
+    __lsx_vstx(p2, dst, -stride3);
+    __lsx_vstx(p1, dst, -stride2);
+    __lsx_vstx(p0, dst, -stride);
+    __lsx_vst(q0, dst, 0);
+    __lsx_vstx(q1, dst, stride);
+    __lsx_vstx(q2, dst, stride2);
+  } else {
+    dst = dst_tmp0 - stride3;
+
+    p7_l_in = (v8u16)__lsx_vsllwil_hu_bu(p7, 0);
+    p6_l_in = (v8u16)__lsx_vsllwil_hu_bu(p6, 0);
+    p5_l_in = (v8u16)__lsx_vsllwil_hu_bu(p5, 0);
+    p4_l_in = (v8u16)__lsx_vsllwil_hu_bu(p4, 0);
+    p3_l_in = (v8u16)__lsx_vsllwil_hu_bu(p3, 0);
+    p2_l_in = (v8u16)__lsx_vsllwil_hu_bu(p2, 0);
+    p1_l_in = (v8u16)__lsx_vsllwil_hu_bu(p1, 0);
+    p0_l_in = (v8u16)__lsx_vsllwil_hu_bu(p0, 0);
+    q0_l_in = (v8u16)__lsx_vsllwil_hu_bu(q0, 0);
+
+    tmp0_l = p7_l_in << 3;
+    tmp0_l -= p7_l_in;
+    tmp0_l += p6_l_in;
+    tmp0_l += q0_l_in;
+    tmp1_l = p6_l_in + p5_l_in;
+    tmp1_l += p4_l_in;
+    tmp1_l += p3_l_in;
+    tmp1_l += p2_l_in;
+    tmp1_l += p1_l_in;
+    tmp1_l += p0_l_in;
+    tmp1_l += tmp0_l;
+
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    p7_h_in = (v8u16)__lsx_vexth_hu_bu(p7);
+    p6_h_in = (v8u16)__lsx_vexth_hu_bu(p6);
+    p5_h_in = (v8u16)__lsx_vexth_hu_bu(p5);
+    p4_h_in = (v8u16)__lsx_vexth_hu_bu(p4);
+    p3_h_in = (v8u16)__lsx_vexth_hu_bu(p3);
+    p2_h_in = (v8u16)__lsx_vexth_hu_bu(p2);
+    p1_h_in = (v8u16)__lsx_vexth_hu_bu(p1);
+    p0_h_in = (v8u16)__lsx_vexth_hu_bu(p0);
+    q0_h_in = (v8u16)__lsx_vexth_hu_bu(q0);
+
+    tmp0_h = p7_h_in << 3;
+    tmp0_h -= p7_h_in;
+    tmp0_h += p6_h_in;
+    tmp0_h += q0_h_in;
+    tmp1_h = p6_h_in + p5_h_in;
+    tmp1_h += p4_h_in;
+    tmp1_h += p3_h_in;
+    tmp1_h += p2_h_in;
+    tmp1_h += p1_h_in;
+    tmp1_h += p0_h_in;
+    tmp1_h += tmp0_h;
+
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    p6 = __lsx_vbitsel_v(p6, out_l, flat2);
+    __lsx_vst(p6, dst, 0);
+    dst += stride;
+
+    /* p5 */
+    q1_l_in = (v8u16)__lsx_vsllwil_hu_bu(q1, 0);
+    tmp0_l = p5_l_in - p6_l_in;
+    tmp0_l += q1_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    q1_h_in = (v8u16)__lsx_vexth_hu_bu(q1);
+    tmp0_h = p5_h_in - p6_h_in;
+    tmp0_h += q1_h_in;
+    tmp0_h -= p7_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    p5 = __lsx_vbitsel_v(p5, out_l, flat2);
+    __lsx_vst(p5, dst, 0);
+    dst += stride;
+
+    /* p4 */
+    q2_l_in = (v8u16)__lsx_vsllwil_hu_bu(q2, 0);
+    tmp0_l = p4_l_in - p5_l_in;
+    tmp0_l += q2_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    q2_h_in = (v8u16)__lsx_vexth_hu_bu(q2);
+    tmp0_h = p4_h_in - p5_h_in;
+    tmp0_h += q2_h_in;
+    tmp0_h -= p7_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    p4 = __lsx_vbitsel_v(p4, out_l, flat2);
+    __lsx_vst(p4, dst, 0);
+    dst += stride;
+
+    /* p3 */
+    q3_l_in = (v8u16)__lsx_vsllwil_hu_bu(q3, 0);
+    tmp0_l = p3_l_in - p4_l_in;
+    tmp0_l += q3_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    q3_h_in = (v8u16)__lsx_vexth_hu_bu(q3);
+    tmp0_h = p3_h_in - p4_h_in;
+    tmp0_h += q3_h_in;
+    tmp0_h -= p7_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    p3 = __lsx_vbitsel_v(p3, out_l, flat2);
+    __lsx_vst(p3, dst, 0);
+    dst += stride;
+
+    /* p2 */
+    q4_l_in = (v8u16)__lsx_vsllwil_hu_bu(q4, 0);
+    filter8 = __lsx_vld(filter48, 0);
+    tmp0_l = p2_l_in - p3_l_in;
+    tmp0_l += q4_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    q4_h_in = (v8u16)__lsx_vexth_hu_bu(q4);
+    tmp0_h = p2_h_in - p3_h_in;
+    tmp0_h += q4_h_in;
+    tmp0_h -= p7_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+    __lsx_vst(filter8, dst, 0);
+    dst += stride;
+
+    /* p1 */
+    q5_l_in = (v8u16)__lsx_vsllwil_hu_bu(q5, 0);
+    filter8 = __lsx_vld(filter48, 16);
+    tmp0_l = p1_l_in - p2_l_in;
+    tmp0_l += q5_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    q5_h_in = (v8u16)__lsx_vexth_hu_bu(q5);
+    tmp0_h = p1_h_in - p2_h_in;
+    tmp0_h += q5_h_in;
+    tmp0_h -= p7_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+    __lsx_vst(filter8, dst, 0);
+    dst += stride;
+
+    /* p0 */
+    q6_l_in = (v8u16)__lsx_vsllwil_hu_bu(q6, 0);
+    filter8 = __lsx_vld(filter48, 32);
+    tmp0_l = p0_l_in - p1_l_in;
+    tmp0_l += q6_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    q6_h_in = (v8u16)__lsx_vexth_hu_bu(q6);
+    tmp0_h = p0_h_in - p1_h_in;
+    tmp0_h += q6_h_in;
+    tmp0_h -= p7_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+    __lsx_vst(filter8, dst, 0);
+    dst += stride;
+
+    /* q0 */
+    q7_l_in = (v8u16)__lsx_vsllwil_hu_bu(q7, 0);
+    filter8 = __lsx_vld(filter48, 48);
+    tmp0_l = q7_l_in - p0_l_in;
+    tmp0_l += q0_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    q7_h_in = (v8u16)__lsx_vexth_hu_bu(q7);
+    tmp0_h = q7_h_in - p0_h_in;
+    tmp0_h += q0_h_in;
+    tmp0_h -= p7_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+    __lsx_vst(filter8, dst, 0);
+    dst += stride;
+
+    /* q1 */
+    filter8 = __lsx_vld(filter48, 64);
+    tmp0_l = q7_l_in - q0_l_in;
+    tmp0_l += q1_l_in;
+    tmp0_l -= p6_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    tmp0_h = q7_h_in - q0_h_in;
+    tmp0_h += q1_h_in;
+    tmp0_h -= p6_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+    __lsx_vst(filter8, dst, 0);
+    dst += stride;
+
+    /* q2 */
+    filter8 = __lsx_vld(filter48, 80);
+    tmp0_l = q7_l_in - q1_l_in;
+    tmp0_l += q2_l_in;
+    tmp0_l -= p5_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    tmp0_h = q7_h_in - q1_h_in;
+    tmp0_h += q2_h_in;
+    tmp0_h -= p5_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+    __lsx_vst(filter8, dst, 0);
+    dst += stride;
+
+    /* q3 */
+    tmp0_l = q7_l_in - q2_l_in;
+    tmp0_l += q3_l_in;
+    tmp0_l -= p4_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    tmp0_h = q7_h_in - q2_h_in;
+    tmp0_h += q3_h_in;
+    tmp0_h -= p4_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    q3 = __lsx_vbitsel_v(q3, out_l, flat2);
+    __lsx_vst(q3, dst, 0);
+    dst += stride;
+
+    /* q4 */
+    tmp0_l = q7_l_in - q3_l_in;
+    tmp0_l += q4_l_in;
+    tmp0_l -= p3_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    tmp0_h = q7_h_in - q3_h_in;
+    tmp0_h += q4_h_in;
+    tmp0_h -= p3_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    q4 = __lsx_vbitsel_v(q4, out_l, flat2);
+    __lsx_vst(q4, dst, 0);
+    dst += stride;
+
+    /* q5 */
+    tmp0_l = q7_l_in - q4_l_in;
+    tmp0_l += q5_l_in;
+    tmp0_l -= p2_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    tmp0_h = q7_h_in - q4_h_in;
+    tmp0_h += q5_h_in;
+    tmp0_h -= p2_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    q5 = __lsx_vbitsel_v(q5, out_l, flat2);
+    __lsx_vst(q5, dst, 0);
+    dst += stride;
+
+    /* q6 */
+    tmp0_l = q7_l_in - q5_l_in;
+    tmp0_l += q6_l_in;
+    tmp0_l -= p1_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    tmp0_h = q7_h_in - q5_h_in;
+    tmp0_h += q6_h_in;
+    tmp0_h -= p1_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    q6 = __lsx_vbitsel_v(q6, out_l, flat2);
+    __lsx_vst(q6, dst, 0);
+  }
+}
+
+static void mb_lpf_horizontal_edge_dual(uint8_t *dst, int32_t stride,
+                                        const uint8_t *b_limit_ptr,
+                                        const uint8_t *limit_ptr,
+                                        const uint8_t *thresh_ptr) {
+  DECLARE_ALIGNED(16, uint8_t, filter48[16 * 8]);
+  uint8_t early_exit = 0;
+
+  early_exit = hz_lpf_t4_and_t8_16w(dst, stride, &filter48[0], b_limit_ptr,
+                                    limit_ptr, thresh_ptr);
+
+  if (early_exit == 0) {
+    hz_lpf_t16_16w(dst, stride, filter48);
+  }
+}
+
+static void mb_lpf_horizontal_edge(uint8_t *dst, int32_t stride,
+                                   const uint8_t *b_limit_ptr,
+                                   const uint8_t *limit_ptr,
+                                   const uint8_t *thresh_ptr, int32_t count) {
+  if (count == 1) {
+    __m128i flat2, mask, hev, flat, thresh, b_limit, limit;
+    __m128i p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
+    __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+    __m128i p0_filter16, p1_filter16;
+    __m128i p2_filter8, p1_filter8, p0_filter8;
+    __m128i q0_filter8, q1_filter8, q2_filter8;
+    __m128i p7_l, p6_l, p5_l, p4_l, q7_l, q6_l, q5_l, q4_l;
+    __m128i p3_l, p2_l, p1_l, p0_l, q3_l, q2_l, q1_l, q0_l;
+    __m128i zero = __lsx_vldi(0);
+    __m128i tmp0, tmp1, tmp2;
+
+    int32_t stride2 = stride << 1;
+    int32_t stride3 = 2 + stride;
+    int32_t stride4 = stride << 2;
+    uint8_t *dst_tmp0 = dst - stride4;
+    uint8_t *dst_tmp1 = dst + stride4;
+
+    /* load vector elements */
+    DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst,
+              -stride, p3, p2, p1, p0);
+    q0 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
+    q3 = __lsx_vldx(dst, stride3);
+
+    thresh = __lsx_vldrepl_b(thresh_ptr, 0);
+    b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
+    limit = __lsx_vldrepl_b(limit_ptr, 0);
+
+    /* filter_mask* */
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+                 mask, flat);
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+    flat = __lsx_vilvl_d(zero, flat);
+    if (__lsx_bz_v(flat)) {
+      __lsx_vstelm_d(p1_out, dst - stride2, 0, 0);
+      __lsx_vstelm_d(p0_out, dst - stride, 0, 0);
+      __lsx_vstelm_d(q0_out, dst, 0, 0);
+      __lsx_vstelm_d(q1_out, dst + stride, 0, 0);
+    } else {
+      /* convert 8 bit input data into 16 bit */
+      DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l,
+                p2_l, p1_l, p0_l);
+      DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l,
+                q1_l, q2_l, q3_l);
+      VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filter8,
+                  p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
+
+      /* convert 16 bit output data into 8 bit */
+      DUP4_ARG2(__lsx_vpickev_b, zero, p2_filter8, zero, p1_filter8, zero,
+                p0_filter8, zero, q0_filter8, p2_filter8, p1_filter8,
+                p0_filter8, q0_filter8);
+      DUP2_ARG2(__lsx_vpickev_b, zero, q1_filter8, zero, q2_filter8, q1_filter8,
+                q2_filter8);
+
+      /* store pixel values */
+      p2_out = __lsx_vbitsel_v(p2, p2_filter8, flat);
+      p1_out = __lsx_vbitsel_v(p1_out, p1_filter8, flat);
+      p0_out = __lsx_vbitsel_v(p0_out, p0_filter8, flat);
+      q0_out = __lsx_vbitsel_v(q0_out, q0_filter8, flat);
+      q1_out = __lsx_vbitsel_v(q1_out, q1_filter8, flat);
+      q2_out = __lsx_vbitsel_v(q2, q2_filter8, flat);
+
+      /* load 16 vector elements */
+      DUP4_ARG2(__lsx_vldx, dst_tmp0, -stride4, dst_tmp0, -stride3, dst_tmp0,
+                -stride2, dst_tmp0, -stride, p7, p6, p5, p4);
+      q4 = __lsx_vld(dst_tmp1, 0);
+      DUP2_ARG2(__lsx_vldx, dst_tmp1, stride, dst_tmp1, stride2, q5, q6);
+      q7 = __lsx_vldx(dst_tmp1, stride3);
+
+      VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+      if (__lsx_bz_v(flat2)) {
+        dst -= stride3;
+        __lsx_vstelm_d(p2_out, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p1_out, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p0_out, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(q0_out, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(q1_out, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(q2_out, dst, 0, 0);
+      } else {
+        /* LSB(right) 8 pixel operation */
+        DUP4_ARG2(__lsx_vilvl_b, zero, p7, zero, p6, zero, p5, zero, p4, p7_l,
+                  p6_l, p5_l, p4_l);
+        DUP4_ARG2(__lsx_vilvl_b, zero, q4, zero, q5, zero, q6, zero, q7, q4_l,
+                  q5_l, q6_l, q7_l);
+
+        tmp0 = __lsx_vslli_h(p7_l, 3);
+        tmp0 = __lsx_vsub_h(tmp0, p7_l);
+        tmp0 = __lsx_vadd_h(tmp0, p6_l);
+        tmp0 = __lsx_vadd_h(tmp0, q0_l);
+
+        dst = dst_tmp0 - stride3;
+
+        /* calculation of p6 and p5 */
+        tmp1 = __lsx_vadd_h(p6_l, p5_l);
+        tmp1 = __lsx_vadd_h(tmp1, p4_l);
+        tmp1 = __lsx_vadd_h(tmp1, p3_l);
+        tmp1 = __lsx_vadd_h(tmp1, p2_l);
+        tmp1 = __lsx_vadd_h(tmp1, p1_l);
+        tmp1 = __lsx_vadd_h(tmp1, p0_l);
+        tmp1 = __lsx_vadd_h(tmp1, tmp0);
+        p0_filter16 = __lsx_vsrari_h(tmp1, 4);
+        tmp0 = __lsx_vsub_h(p5_l, p6_l);
+        tmp0 = __lsx_vadd_h(tmp0, q1_l);
+        tmp0 = __lsx_vsub_h(tmp0, p7_l);
+        tmp1 = __lsx_vadd_h(tmp1, tmp0);
+        p1_filter16 = __lsx_vsrari_h(tmp1, 4);
+        DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16,
+                  p0_filter16, p1_filter16);
+        DUP2_ARG3(__lsx_vbitsel_v, p6, p0_filter16, flat2, p5, p1_filter16,
+                  flat2, p0_filter16, p1_filter16);
+        __lsx_vstelm_d(p0_filter16, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p1_filter16, dst, 0, 0);
+        dst += stride;
+
+        /* calculation of p4 and p3 */
+        tmp0 = __lsx_vsub_h(p4_l, p5_l);
+        tmp0 = __lsx_vadd_h(tmp0, q2_l);
+        tmp0 = __lsx_vsub_h(tmp0, p7_l);
+        tmp2 = __lsx_vsub_h(p3_l, p4_l);
+        tmp2 = __lsx_vadd_h(tmp2, q3_l);
+        tmp2 = __lsx_vsub_h(tmp2, p7_l);
+        tmp1 = __lsx_vadd_h(tmp1, tmp0);
+        p0_filter16 = __lsx_vsrari_h(tmp1, 4);
+        tmp1 = __lsx_vadd_h(tmp1, tmp2);
+        p1_filter16 = __lsx_vsrari_h(tmp1, 4);
+        DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16,
+                  p0_filter16, p1_filter16);
+        DUP2_ARG3(__lsx_vbitsel_v, p4, p0_filter16, flat2, p3, p1_filter16,
+                  flat2, p0_filter16, p1_filter16);
+        __lsx_vstelm_d(p0_filter16, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p1_filter16, dst, 0, 0);
+        dst += stride;
+
+        /* calculation of p2 and p1 */
+        tmp0 = __lsx_vsub_h(p2_l, p3_l);
+        tmp0 = __lsx_vadd_h(tmp0, q4_l);
+        tmp0 = __lsx_vsub_h(tmp0, p7_l);
+        tmp2 = __lsx_vsub_h(p1_l, p2_l);
+        tmp2 = __lsx_vadd_h(tmp2, q5_l);
+        tmp2 = __lsx_vsub_h(tmp2, p7_l);
+        tmp1 = __lsx_vadd_h(tmp1, tmp0);
+        p0_filter16 = __lsx_vsrari_h(tmp1, 4);
+        tmp1 = __lsx_vadd_h(tmp1, tmp2);
+        p1_filter16 = __lsx_vsrari_h(tmp1, 4);
+        DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16,
+                  p0_filter16, p1_filter16);
+        DUP2_ARG3(__lsx_vbitsel_v, p2_out, p0_filter16, flat2, p1_out,
+                  p1_filter16, flat2, p0_filter16, p1_filter16);
+        __lsx_vstelm_d(p0_filter16, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p1_filter16, dst, 0, 0);
+        dst += stride;
+
+        /* calculation of p0 and q0 */
+        tmp0 = __lsx_vsub_h(p0_l, p1_l);
+        tmp0 = __lsx_vadd_h(tmp0, q6_l);
+        tmp0 = __lsx_vsub_h(tmp0, p7_l);
+        tmp2 = __lsx_vsub_h(q7_l, p0_l);
+        tmp2 = __lsx_vadd_h(tmp2, q0_l);
+        tmp2 = __lsx_vsub_h(tmp2, p7_l);
+        tmp1 = __lsx_vadd_h(tmp1, tmp0);
+        p0_filter16 = __lsx_vsrari_h(tmp1, 4);
+        tmp1 = __lsx_vadd_h(tmp1, tmp2);
+        p1_filter16 = __lsx_vsrari_h(tmp1, 4);
+        DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16,
+                  p0_filter16, p1_filter16);
+        DUP2_ARG3(__lsx_vbitsel_v, p0_out, p0_filter16, flat2, q0_out,
+                  p1_filter16, flat2, p0_filter16, p1_filter16);
+        __lsx_vstelm_d(p0_filter16, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p1_filter16, dst, 0, 0);
+        dst += stride;
+
+        /* calculation of q1 and q2 */
+        tmp0 = __lsx_vsub_h(q7_l, q0_l);
+        tmp0 = __lsx_vadd_h(tmp0, q1_l);
+        tmp0 = __lsx_vsub_h(tmp0, p6_l);
+        tmp2 = __lsx_vsub_h(q7_l, q1_l);
+        tmp2 = __lsx_vadd_h(tmp2, q2_l);
+        tmp2 = __lsx_vsub_h(tmp2, p5_l);
+        tmp1 = __lsx_vadd_h(tmp1, tmp0);
+        p0_filter16 = __lsx_vsrari_h(tmp1, 4);
+        tmp1 = __lsx_vadd_h(tmp1, tmp2);
+        p1_filter16 = __lsx_vsrari_h(tmp1, 4);
+        DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16,
+                  p0_filter16, p1_filter16);
+        DUP2_ARG3(__lsx_vbitsel_v, q1_out, p0_filter16, flat2, q2_out,
+                  p1_filter16, flat2, p0_filter16, p1_filter16);
+        __lsx_vstelm_d(p0_filter16, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p1_filter16, dst, 0, 0);
+        dst += stride;
+
+        /* calculation of q3 and q4 */
+        tmp0 = __lsx_vsub_h(q7_l, q2_l);
+        tmp0 = __lsx_vadd_h(tmp0, q3_l);
+        tmp0 = __lsx_vsub_h(tmp0, p4_l);
+        tmp2 = __lsx_vsub_h(q7_l, q3_l);
+        tmp2 = __lsx_vadd_h(tmp2, q4_l);
+        tmp2 = __lsx_vsub_h(tmp2, p3_l);
+        tmp1 = __lsx_vadd_h(tmp1, tmp0);
+        p0_filter16 = __lsx_vsrari_h(tmp1, 4);
+        tmp1 = __lsx_vadd_h(tmp1, tmp2);
+        p1_filter16 = __lsx_vsrari_h(tmp1, 4);
+        DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16,
+                  p0_filter16, p1_filter16);
+        DUP2_ARG3(__lsx_vbitsel_v, q3, p0_filter16, flat2, q4, p1_filter16,
+                  flat2, p0_filter16, p1_filter16);
+        __lsx_vstelm_d(p0_filter16, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p1_filter16, dst, 0, 0);
+        dst += stride;
+
+        /* calculation of q5 and q6 */
+        tmp0 = __lsx_vsub_h(q7_l, q4_l);
+        tmp0 = __lsx_vadd_h(tmp0, q5_l);
+        tmp0 = __lsx_vsub_h(tmp0, p2_l);
+        tmp2 = __lsx_vsub_h(q7_l, q5_l);
+        tmp2 = __lsx_vadd_h(tmp2, q6_l);
+        tmp2 = __lsx_vsub_h(tmp2, p1_l);
+        tmp1 = __lsx_vadd_h(tmp1, tmp0);
+        p0_filter16 = __lsx_vsrari_h(tmp1, 4);
+        tmp1 = __lsx_vadd_h(tmp1, tmp2);
+        p1_filter16 = __lsx_vsrari_h(tmp1, 4);
+        DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16,
+                  p0_filter16, p1_filter16);
+        DUP2_ARG3(__lsx_vbitsel_v, q5, p0_filter16, flat2, q6, p1_filter16,
+                  flat2, p0_filter16, p1_filter16);
+        __lsx_vstelm_d(p0_filter16, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p1_filter16, dst, 0, 0);
+      }
+    }
+  } else {
+    mb_lpf_horizontal_edge_dual(dst, stride, b_limit_ptr, limit_ptr,
+                                thresh_ptr);
+  }
+}
+
+void vpx_lpf_horizontal_16_dual_lsx(uint8_t *dst, int32_t stride,
+                                    const uint8_t *b_limit_ptr,
+                                    const uint8_t *limit_ptr,
+                                    const uint8_t *thresh_ptr) {
+  mb_lpf_horizontal_edge(dst, stride, b_limit_ptr, limit_ptr, thresh_ptr, 2);
+}
+
+static void transpose_16x16(uint8_t *input, int32_t in_stride, uint8_t *output,
+                            int32_t out_stride) {
+  __m128i row0, row1, row2, row3, row4, row5, row6, row7;
+  __m128i row8, row9, row10, row11, row12, row13, row14, row15;
+  __m128i tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
+  __m128i tmp2, tmp3;
+  __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+  int32_t in_stride2 = in_stride << 1;
+  int32_t in_stride3 = in_stride2 + in_stride;
+  int32_t in_stride4 = in_stride2 << 1;
+  int32_t out_stride2 = out_stride << 1;
+  int32_t out_stride3 = out_stride2 + out_stride;
+  int32_t out_stride4 = out_stride2 << 1;
+
+  LSX_LD_8(input, in_stride, in_stride2, in_stride3, in_stride4, row0, row1,
+           row2, row3, row4, row5, row6, row7);
+  input += in_stride4;
+  LSX_LD_8(input, in_stride, in_stride2, in_stride3, in_stride4, row8, row9,
+           row10, row11, row12, row13, row14, row15);
+
+  LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+                      row9, row10, row11, row12, row13, row14, row15, p7, p6,
+                      p5, p4, p3, p2, p1, p0);
+
+  /* transpose 16x8 matrix into 8x16 */
+  /* total 8 intermediate register and 32 instructions */
+  q7 = __lsx_vpackod_d(row8, row0);
+  q6 = __lsx_vpackod_d(row9, row1);
+  q5 = __lsx_vpackod_d(row10, row2);
+  q4 = __lsx_vpackod_d(row11, row3);
+  q3 = __lsx_vpackod_d(row12, row4);
+  q2 = __lsx_vpackod_d(row13, row5);
+  q1 = __lsx_vpackod_d(row14, row6);
+  q0 = __lsx_vpackod_d(row15, row7);
+
+  DUP2_ARG2(__lsx_vpackev_b, q6, q7, q4, q5, tmp0, tmp1);
+  DUP2_ARG2(__lsx_vpackod_b, q6, q7, q4, q5, tmp4, tmp5);
+
+  DUP2_ARG2(__lsx_vpackev_b, q2, q3, q0, q1, q5, q7);
+  DUP2_ARG2(__lsx_vpackod_b, q2, q3, q0, q1, tmp6, tmp7);
+
+  DUP2_ARG2(__lsx_vpackev_h, tmp1, tmp0, q7, q5, tmp2, tmp3);
+  q0 = __lsx_vpackev_w(tmp3, tmp2);
+  q4 = __lsx_vpackod_w(tmp3, tmp2);
+
+  tmp2 = __lsx_vpackod_h(tmp1, tmp0);
+  tmp3 = __lsx_vpackod_h(q7, q5);
+  q2 = __lsx_vpackev_w(tmp3, tmp2);
+  q6 = __lsx_vpackod_w(tmp3, tmp2);
+
+  DUP2_ARG2(__lsx_vpackev_h, tmp5, tmp4, tmp7, tmp6, tmp2, tmp3);
+  q1 = __lsx_vpackev_w(tmp3, tmp2);
+  q5 = __lsx_vpackod_w(tmp3, tmp2);
+
+  tmp2 = __lsx_vpackod_h(tmp5, tmp4);
+  tmp3 = __lsx_vpackod_h(tmp7, tmp6);
+  q3 = __lsx_vpackev_w(tmp3, tmp2);
+  q7 = __lsx_vpackod_w(tmp3, tmp2);
+
+  LSX_ST_8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_stride, out_stride2,
+           out_stride3, out_stride4);
+  output += out_stride4;
+  LSX_ST_8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_stride, out_stride2,
+           out_stride3, out_stride4);
+}
+
+static int32_t vt_lpf_t4_and_t8_16w(uint8_t *dst, uint8_t *filter48,
+                                    uint8_t *dst_org, int32_t stride,
+                                    const uint8_t *b_limit_ptr,
+                                    const uint8_t *limit_ptr,
+                                    const uint8_t *thresh_ptr) {
+  int32_t stride2 = stride << 1;
+  int32_t stride3 = stride2 + stride;
+  int32_t stride4 = stride2 << 1;
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+  __m128i flat, mask, hev, thresh, b_limit, limit;
+  __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+  __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
+  __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
+  __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
+  __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
+  __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5;
+
+  /* load vector elements */
+  DUP4_ARG2(__lsx_vld, dst, -64, dst, -48, dst, -32, dst, -16, p3, p2, p1, p0);
+  DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3);
+
+  thresh = __lsx_vldrepl_b(thresh_ptr, 0);
+  b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
+  limit = __lsx_vldrepl_b(limit_ptr, 0);
+
+  /* mask and hev */
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  /* flat4 */
+  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  /* filter4 */
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+  /* if flat is zero for all pixels, then no need to calculate other filter */
+  if (__lsx_bz_v(flat)) {
+    DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+    vec2 = __lsx_vilvl_h(vec1, vec0);
+    vec3 = __lsx_vilvh_h(vec1, vec0);
+    DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+    vec4 = __lsx_vilvl_h(vec1, vec0);
+    vec5 = __lsx_vilvh_h(vec1, vec0);
+
+    dst_org -= 2;
+    __lsx_vstelm_w(vec2, dst_org, 0, 0);
+    __lsx_vstelm_w(vec2, dst_org + stride, 0, 1);
+    __lsx_vstelm_w(vec2, dst_org + stride2, 0, 2);
+    __lsx_vstelm_w(vec2, dst_org + stride3, 0, 3);
+    dst_org += stride4;
+    __lsx_vstelm_w(vec3, dst_org, 0, 0);
+    __lsx_vstelm_w(vec3, dst_org + stride, 0, 1);
+    __lsx_vstelm_w(vec3, dst_org + stride2, 0, 2);
+    __lsx_vstelm_w(vec3, dst_org + stride3, 0, 3);
+    dst_org += stride4;
+    __lsx_vstelm_w(vec4, dst_org, 0, 0);
+    __lsx_vstelm_w(vec4, dst_org + stride, 0, 1);
+    __lsx_vstelm_w(vec4, dst_org + stride2, 0, 2);
+    __lsx_vstelm_w(vec4, dst_org + stride3, 0, 3);
+    dst_org += stride4;
+    __lsx_vstelm_w(vec5, dst_org, 0, 0);
+    __lsx_vstelm_w(vec5, dst_org + stride, 0, 1);
+    __lsx_vstelm_w(vec5, dst_org + stride2, 0, 2);
+    __lsx_vstelm_w(vec5, dst_org + stride3, 0, 3);
+
+    return 1;
+  }
+
+  DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l,
+            p0_l);
+  DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l,
+            q3_l);
+  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+              p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+  DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h);
+  DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h);
+  VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
+              p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
+
+  /* convert 16 bit output data into 8 bit */
+  DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l,
+            p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l,
+            p1_filt8_l, p0_filt8_l, q0_filt8_l);
+  DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l,
+            q1_filt8_l, q2_filt8_l);
+
+  /* store pixel values */
+  p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
+  p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
+  p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
+  q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
+  q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
+  q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
+
+  __lsx_vst(p2_out, filter48, 0);
+  __lsx_vst(p1_out, filter48, 16);
+  __lsx_vst(p0_out, filter48, 32);
+  __lsx_vst(q0_out, filter48, 48);
+  __lsx_vst(q1_out, filter48, 64);
+  __lsx_vst(q2_out, filter48, 80);
+  __lsx_vst(flat, filter48, 96);
+
+  return 0;
+}
+
+static int32_t vt_lpf_t16_16w(uint8_t *dst, uint8_t *dst_org, int32_t stride,
+                              uint8_t *filter48) {
+  __m128i flat, flat2, filter8;
+  __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+  __m128i out_l, out_h;
+  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
+  v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
+  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
+  v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
+  v8u16 p7_h_in, p6_h_in, p5_h_in, p4_h_in;
+  v8u16 p3_h_in, p2_h_in, p1_h_in, p0_h_in;
+  v8u16 q7_h_in, q6_h_in, q5_h_in, q4_h_in;
+  v8u16 q3_h_in, q2_h_in, q1_h_in, q0_h_in;
+  v8u16 tmp0_l, tmp1_l, tmp0_h, tmp1_h;
+  uint8_t *dst_tmp = dst - 128;
+
+  flat = __lsx_vld(filter48, 96);
+
+  DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48, p7,
+            p6, p5, p4);
+  DUP4_ARG2(__lsx_vld, dst_tmp, 64, dst_tmp, 80, dst_tmp, 96, dst_tmp, 112, p3,
+            p2, p1, p0);
+  DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3);
+  DUP4_ARG2(__lsx_vld, dst, 64, dst, 80, dst, 96, dst, 112, q4, q5, q6, q7);
+
+  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+  /* if flat2 is zero for all pixels, then no need to calculate other filter */
+  if (__lsx_bz_v(flat2)) {
+    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+
+    DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32, filter48, 48,
+              p2, p1, p0, q0);
+    DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2);
+
+    DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1);
+    vec3 = __lsx_vilvl_h(vec1, vec0);
+    vec4 = __lsx_vilvh_h(vec1, vec0);
+    DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, vec0, vec1);
+    vec6 = __lsx_vilvl_h(vec1, vec0);
+    vec7 = __lsx_vilvh_h(vec1, vec0);
+    vec2 = __lsx_vilvl_b(q2, q1);
+    vec5 = __lsx_vilvh_b(q2, q1);
+
+    dst_org -= 3;
+    __lsx_vstelm_w(vec3, dst_org, 0, 0);
+    __lsx_vstelm_h(vec2, dst_org, 4, 0);
+    dst_org += stride;
+    __lsx_vstelm_w(vec3, dst_org, 0, 1);
+    __lsx_vstelm_h(vec2, dst_org, 4, 1);
+    dst_org += stride;
+    __lsx_vstelm_w(vec3, dst_org, 0, 2);
+    __lsx_vstelm_h(vec2, dst_org, 4, 2);
+    dst_org += stride;
+    __lsx_vstelm_w(vec3, dst_org, 0, 3);
+    __lsx_vstelm_h(vec2, dst_org, 4, 3);
+    dst_org += stride;
+    __lsx_vstelm_w(vec4, dst_org, 0, 0);
+    __lsx_vstelm_h(vec2, dst_org, 4, 4);
+    dst_org += stride;
+    __lsx_vstelm_w(vec4, dst_org, 0, 1);
+    __lsx_vstelm_h(vec2, dst_org, 4, 5);
+    dst_org += stride;
+    __lsx_vstelm_w(vec4, dst_org, 0, 2);
+    __lsx_vstelm_h(vec2, dst_org, 4, 6);
+    dst_org += stride;
+    __lsx_vstelm_w(vec4, dst_org, 0, 3);
+    __lsx_vstelm_h(vec2, dst_org, 4, 7);
+    dst_org += stride;
+    __lsx_vstelm_w(vec6, dst_org, 0, 0);
+    __lsx_vstelm_h(vec5, dst_org, 4, 0);
+    dst_org += stride;
+    __lsx_vstelm_w(vec6, dst_org, 0, 1);
+    __lsx_vstelm_h(vec5, dst_org, 4, 1);
+    dst_org += stride;
+    __lsx_vstelm_w(vec6, dst_org, 0, 2);
+    __lsx_vstelm_h(vec5, dst_org, 4, 2);
+    dst_org += stride;
+    __lsx_vstelm_w(vec6, dst_org, 0, 3);
+    __lsx_vstelm_h(vec5, dst_org, 4, 3);
+    dst_org += stride;
+    __lsx_vstelm_w(vec7, dst_org, 0, 0);
+    __lsx_vstelm_h(vec5, dst_org, 4, 4);
+    dst_org += stride;
+    __lsx_vstelm_w(vec7, dst_org, 0, 1);
+    __lsx_vstelm_h(vec5, dst_org, 4, 5);
+    dst_org += stride;
+    __lsx_vstelm_w(vec7, dst_org, 0, 2);
+    __lsx_vstelm_h(vec5, dst_org, 4, 6);
+    dst_org += stride;
+    __lsx_vstelm_w(vec7, dst_org, 0, 3);
+    __lsx_vstelm_h(vec5, dst_org, 4, 7);
+
+    return 1;
+  }
+
+  dst -= 7 * 16;
+
+  p7_l_in = (v8u16)__lsx_vsllwil_hu_bu(p7, 0);
+  p6_l_in = (v8u16)__lsx_vsllwil_hu_bu(p6, 0);
+  p5_l_in = (v8u16)__lsx_vsllwil_hu_bu(p5, 0);
+  p4_l_in = (v8u16)__lsx_vsllwil_hu_bu(p4, 0);
+  p3_l_in = (v8u16)__lsx_vsllwil_hu_bu(p3, 0);
+  p2_l_in = (v8u16)__lsx_vsllwil_hu_bu(p2, 0);
+  p1_l_in = (v8u16)__lsx_vsllwil_hu_bu(p1, 0);
+  p0_l_in = (v8u16)__lsx_vsllwil_hu_bu(p0, 0);
+  q0_l_in = (v8u16)__lsx_vsllwil_hu_bu(q0, 0);
+
+  tmp0_l = p7_l_in << 3;
+  tmp0_l -= p7_l_in;
+  tmp0_l += p6_l_in;
+  tmp0_l += q0_l_in;
+  tmp1_l = p6_l_in + p5_l_in;
+  tmp1_l += p4_l_in;
+  tmp1_l += p3_l_in;
+  tmp1_l += p2_l_in;
+  tmp1_l += p1_l_in;
+  tmp1_l += p0_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  p7_h_in = (v8u16)__lsx_vexth_hu_bu(p7);
+  p6_h_in = (v8u16)__lsx_vexth_hu_bu(p6);
+  p5_h_in = (v8u16)__lsx_vexth_hu_bu(p5);
+  p4_h_in = (v8u16)__lsx_vexth_hu_bu(p4);
+  p3_h_in = (v8u16)__lsx_vexth_hu_bu(p3);
+  p2_h_in = (v8u16)__lsx_vexth_hu_bu(p2);
+  p1_h_in = (v8u16)__lsx_vexth_hu_bu(p1);
+  p0_h_in = (v8u16)__lsx_vexth_hu_bu(p0);
+  q0_h_in = (v8u16)__lsx_vexth_hu_bu(q0);
+
+  tmp0_h = p7_h_in << 3;
+  tmp0_h -= p7_h_in;
+  tmp0_h += p6_h_in;
+  tmp0_h += q0_h_in;
+  tmp1_h = p6_h_in + p5_h_in;
+  tmp1_h += p4_h_in;
+  tmp1_h += p3_h_in;
+  tmp1_h += p2_h_in;
+  tmp1_h += p1_h_in;
+  tmp1_h += p0_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  p6 = __lsx_vbitsel_v(p6, out_l, flat2);
+  __lsx_vst(p6, dst, 0);
+
+  /* p5 */
+  q1_l_in = (v8u16)__lsx_vsllwil_hu_bu(q1, 0);
+  tmp0_l = p5_l_in - p6_l_in;
+  tmp0_l += q1_l_in;
+  tmp0_l -= p7_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  q1_h_in = (v8u16)__lsx_vexth_hu_bu(q1);
+  tmp0_h = p5_h_in - p6_h_in;
+  tmp0_h += q1_h_in;
+  tmp0_h -= p7_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  p5 = __lsx_vbitsel_v(p5, out_l, flat2);
+  __lsx_vst(p5, dst, 16);
+
+  /* p4 */
+  q2_l_in = (v8u16)__lsx_vsllwil_hu_bu(q2, 0);
+  tmp0_l = p4_l_in - p5_l_in;
+  tmp0_l += q2_l_in;
+  tmp0_l -= p7_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  q2_h_in = (v8u16)__lsx_vexth_hu_bu(q2);
+  tmp0_h = p4_h_in - p5_h_in;
+  tmp0_h += q2_h_in;
+  tmp0_h -= p7_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  p4 = __lsx_vbitsel_v(p4, out_l, flat2);
+  __lsx_vst(p4, dst, 16 * 2);
+
+  /* p3 */
+  q3_l_in = (v8u16)__lsx_vsllwil_hu_bu(q3, 0);
+  tmp0_l = p3_l_in - p4_l_in;
+  tmp0_l += q3_l_in;
+  tmp0_l -= p7_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  q3_h_in = (v8u16)__lsx_vexth_hu_bu(q3);
+  tmp0_h = p3_h_in - p4_h_in;
+  tmp0_h += q3_h_in;
+  tmp0_h -= p7_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  p3 = __lsx_vbitsel_v(p3, out_l, flat2);
+  __lsx_vst(p3, dst, 16 * 3);
+
+  /* p2 */
+  q4_l_in = (v8u16)__lsx_vsllwil_hu_bu(q4, 0);
+  filter8 = __lsx_vld(filter48, 0);
+  tmp0_l = p2_l_in - p3_l_in;
+  tmp0_l += q4_l_in;
+  tmp0_l -= p7_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  q4_h_in = (v8u16)__lsx_vexth_hu_bu(q4);
+  tmp0_h = p2_h_in - p3_h_in;
+  tmp0_h += q4_h_in;
+  tmp0_h -= p7_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+  __lsx_vst(filter8, dst, 16 * 4);
+
+  /* p1 */
+  q5_l_in = (v8u16)__lsx_vsllwil_hu_bu(q5, 0);
+  filter8 = __lsx_vld(filter48, 16);
+  tmp0_l = p1_l_in - p2_l_in;
+  tmp0_l += q5_l_in;
+  tmp0_l -= p7_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  q5_h_in = (v8u16)__lsx_vexth_hu_bu(q5);
+  tmp0_h = p1_h_in - p2_h_in;
+  tmp0_h += q5_h_in;
+  tmp0_h -= p7_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)(tmp1_h), 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+  __lsx_vst(filter8, dst, 16 * 5);
+
+  /* p0 */
+  q6_l_in = (v8u16)__lsx_vsllwil_hu_bu(q6, 0);
+  filter8 = __lsx_vld(filter48, 32);
+  tmp0_l = p0_l_in - p1_l_in;
+  tmp0_l += q6_l_in;
+  tmp0_l -= p7_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  q6_h_in = (v8u16)__lsx_vexth_hu_bu(q6);
+  tmp0_h = p0_h_in - p1_h_in;
+  tmp0_h += q6_h_in;
+  tmp0_h -= p7_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+  __lsx_vst(filter8, dst, 16 * 6);
+
+  /* q0 */
+  q7_l_in = (v8u16)__lsx_vsllwil_hu_bu(q7, 0);
+  filter8 = __lsx_vld(filter48, 48);
+  tmp0_l = q7_l_in - p0_l_in;
+  tmp0_l += q0_l_in;
+  tmp0_l -= p7_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  q7_h_in = (v8u16)__lsx_vexth_hu_bu(q7);
+  tmp0_h = q7_h_in - p0_h_in;
+  tmp0_h += q0_h_in;
+  tmp0_h -= p7_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+  __lsx_vst(filter8, dst, 16 * 7);
+
+  /* q1 */
+  filter8 = __lsx_vld(filter48, 64);
+  tmp0_l = q7_l_in - q0_l_in;
+  tmp0_l += q1_l_in;
+  tmp0_l -= p6_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  tmp0_h = q7_h_in - q0_h_in;
+  tmp0_h += q1_h_in;
+  tmp0_h -= p6_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+  __lsx_vst(filter8, dst, 16 * 8);
+
+  /* q2 */
+  filter8 = __lsx_vld(filter48, 80);
+  tmp0_l = q7_l_in - q1_l_in;
+  tmp0_l += q2_l_in;
+  tmp0_l -= p5_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  tmp0_h = q7_h_in - q1_h_in;
+  tmp0_h += q2_h_in;
+  tmp0_h -= p5_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+  __lsx_vst(filter8, dst, 16 * 9);
+
+  /* q3 */
+  tmp0_l = q7_l_in - q2_l_in;
+  tmp0_l += q3_l_in;
+  tmp0_l -= p4_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  tmp0_h = q7_h_in - q2_h_in;
+  tmp0_h += q3_h_in;
+  tmp0_h -= p4_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  q3 = __lsx_vbitsel_v(q3, out_l, flat2);
+  __lsx_vst(q3, dst, 16 * 10);
+
+  /* q4 */
+  tmp0_l = q7_l_in - q3_l_in;
+  tmp0_l += q4_l_in;
+  tmp0_l -= p3_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  tmp0_h = q7_h_in - q3_h_in;
+  tmp0_h += q4_h_in;
+  tmp0_h -= p3_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  q4 = __lsx_vbitsel_v(q4, out_l, flat2);
+  __lsx_vst(q4, dst, 16 * 11);
+
+  /* q5 */
+  tmp0_l = q7_l_in - q4_l_in;
+  tmp0_l += q5_l_in;
+  tmp0_l -= p2_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  tmp0_h = q7_h_in - q4_h_in;
+  tmp0_h += q5_h_in;
+  tmp0_h -= p2_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  q5 = __lsx_vbitsel_v(q5, out_l, flat2);
+  __lsx_vst(q5, dst, 16 * 12);
+
+  /* q6 */
+  tmp0_l = q7_l_in - q5_l_in;
+  tmp0_l += q6_l_in;
+  tmp0_l -= p1_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  tmp0_h = q7_h_in - q5_h_in;
+  tmp0_h += q6_h_in;
+  tmp0_h -= p1_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  q6 = __lsx_vbitsel_v(q6, out_l, flat2);
+  __lsx_vst(q6, dst, 16 * 13);
+
+  return 0;
+}
+
+void vpx_lpf_vertical_16_dual_lsx(uint8_t *src, int32_t pitch,
+                                  const uint8_t *b_limit_ptr,
+                                  const uint8_t *limit_ptr,
+                                  const uint8_t *thresh_ptr) {
+  uint8_t early_exit = 0;
+  DECLARE_ALIGNED(16, uint8_t, transposed_input[16 * 24]);
+  uint8_t *filter48 = &transposed_input[16 * 16];
+
+  transpose_16x16((src - 8), pitch, &transposed_input[0], 16);
+
+  early_exit =
+      vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src,
+                           pitch, b_limit_ptr, limit_ptr, thresh_ptr);
+
+  if (early_exit == 0) {
+    early_exit =
+        vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, &filter48[0]);
+
+    if (early_exit == 0) {
+      transpose_16x16(transposed_input, 16, (src - 8), pitch);
+    }
+  }
+}
diff --git a/vpx_dsp/loongarch/loopfilter_4_lsx.c b/vpx_dsp/loongarch/loopfilter_4_lsx.c
new file mode 100644
index 0000000000..9300b5c5ae
--- /dev/null
+++ b/vpx_dsp/loongarch/loopfilter_4_lsx.c
@@ -0,0 +1,214 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/loopfilter_lsx.h"
+
+void vpx_lpf_horizontal_4_lsx(uint8_t *src, int32_t pitch,
+                              const uint8_t *b_limit_ptr,
+                              const uint8_t *limit_ptr,
+                              const uint8_t *thresh_ptr) {
+  __m128i mask, hev, flat, thresh, b_limit, limit;
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;
+  int32_t pitch2 = pitch << 1;
+  int32_t pitch3 = pitch2 + pitch;
+  int32_t pitch4 = pitch2 << 1;
+
+  DUP4_ARG2(__lsx_vldx, src, -pitch4, src, -pitch3, src, -pitch2, src, -pitch,
+            p3, p2, p1, p0);
+  q0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, pitch, src, pitch2, q1, q2);
+  q3 = __lsx_vldx(src, pitch3);
+
+  thresh = __lsx_vldrepl_b(thresh_ptr, 0);
+  b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
+  limit = __lsx_vldrepl_b(limit_ptr, 0);
+
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+  __lsx_vstelm_d(p1_out, src - pitch2, 0, 0);
+  __lsx_vstelm_d(p0_out, src - pitch, 0, 0);
+  __lsx_vstelm_d(q0_out, src, 0, 0);
+  __lsx_vstelm_d(q1_out, src + pitch, 0, 0);
+}
+
+void vpx_lpf_horizontal_4_dual_lsx(uint8_t *src, int32_t pitch,
+                                   const uint8_t *b_limit0_ptr,
+                                   const uint8_t *limit0_ptr,
+                                   const uint8_t *thresh0_ptr,
+                                   const uint8_t *b_limit1_ptr,
+                                   const uint8_t *limit1_ptr,
+                                   const uint8_t *thresh1_ptr) {
+  __m128i mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  int32_t pitch2 = pitch << 1;
+  int32_t pitch3 = pitch2 + pitch;
+  int32_t pitch4 = pitch2 << 1;
+
+  DUP4_ARG2(__lsx_vldx, src, -pitch4, src, -pitch3, src, -pitch2, src, -pitch,
+            p3, p2, p1, p0);
+  q0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, pitch, src, pitch2, q1, q2);
+  q3 = __lsx_vldx(src, pitch3);
+
+  thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0);
+  thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0);
+  thresh0 = __lsx_vilvl_d(thresh1, thresh0);
+
+  b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0);
+  b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0);
+  b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0);
+
+  limit0 = __lsx_vldrepl_b(limit0_ptr, 0);
+  limit1 = __lsx_vldrepl_b(limit1_ptr, 0);
+  limit0 = __lsx_vilvl_d(limit1, limit0);
+
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
+               mask, flat);
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
+
+  __lsx_vstx(p1, src, -pitch2);
+  __lsx_vstx(p0, src, -pitch);
+  __lsx_vst(q0, src, 0);
+  __lsx_vstx(q1, src, pitch);
+}
+
+void vpx_lpf_vertical_4_lsx(uint8_t *src, int32_t pitch,
+                            const uint8_t *b_limit_ptr,
+                            const uint8_t *limit_ptr,
+                            const uint8_t *thresh_ptr) {
+  __m128i mask, hev, flat, limit, thresh, b_limit;
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i vec0, vec1, vec2, vec3;
+  int32_t pitch2 = pitch << 1;
+  int32_t pitch3 = pitch2 + pitch;
+  int32_t pitch4 = pitch2 << 1;
+  uint8_t *src_tmp = src - 4;
+
+  p3 = __lsx_vld(src_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, p2, p1);
+  p0 = __lsx_vldx(src_tmp, pitch3);
+  src_tmp += pitch4;
+  q0 = __lsx_vld(src_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, q1, q2);
+  q3 = __lsx_vldx(src_tmp, pitch3);
+
+  thresh = __lsx_vldrepl_b(thresh_ptr, 0);
+  b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
+  limit = __lsx_vldrepl_b(limit_ptr, 0);
+
+  LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2,
+                     q3);
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
+  DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, vec0, vec1);
+  vec2 = __lsx_vilvl_h(vec1, vec0);
+  vec3 = __lsx_vilvh_h(vec1, vec0);
+
+  src -= 2;
+  __lsx_vstelm_w(vec2, src, 0, 0);
+  src += pitch;
+  __lsx_vstelm_w(vec2, src, 0, 1);
+  src += pitch;
+  __lsx_vstelm_w(vec2, src, 0, 2);
+  src += pitch;
+  __lsx_vstelm_w(vec2, src, 0, 3);
+  src += pitch;
+
+  __lsx_vstelm_w(vec3, src, 0, 0);
+  __lsx_vstelm_w(vec3, src + pitch, 0, 1);
+  __lsx_vstelm_w(vec3, src + pitch2, 0, 2);
+  __lsx_vstelm_w(vec3, src + pitch3, 0, 3);
+}
+
+void vpx_lpf_vertical_4_dual_lsx(uint8_t *src, int32_t pitch,
+                                 const uint8_t *b_limit0_ptr,
+                                 const uint8_t *limit0_ptr,
+                                 const uint8_t *thresh0_ptr,
+                                 const uint8_t *b_limit1_ptr,
+                                 const uint8_t *limit1_ptr,
+                                 const uint8_t *thresh1_ptr) {
+  __m128i mask, hev, flat;
+  __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i row0, row1, row2, row3, row4, row5, row6, row7;
+  __m128i row8, row9, row10, row11, row12, row13, row14, row15;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+  int32_t pitch2 = pitch << 1;
+  int32_t pitch3 = pitch2 + pitch;
+  int32_t pitch4 = pitch2 << 1;
+  uint8_t *src_tmp = src - 4;
+
+  row0 = __lsx_vld(src_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row1, row2);
+  row3 = __lsx_vldx(src_tmp, pitch3);
+  src_tmp += pitch4;
+  row4 = __lsx_vld(src_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row5, row6);
+  row7 = __lsx_vldx(src_tmp, pitch3);
+  src_tmp += pitch4;
+  row8 = __lsx_vld(src_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row9, row10);
+  row11 = __lsx_vldx(src_tmp, pitch3);
+  src_tmp += pitch4;
+  row12 = __lsx_vld(src_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row13, row14);
+  row15 = __lsx_vldx(src_tmp, pitch3);
+
+  LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+                      row9, row10, row11, row12, row13, row14, row15, p3, p2,
+                      p1, p0, q0, q1, q2, q3);
+
+  thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0);
+  thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0);
+  thresh0 = __lsx_vilvl_d(thresh1, thresh0);
+
+  b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0);
+  b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0);
+  b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0);
+
+  limit0 = __lsx_vldrepl_b(limit0_ptr, 0);
+  limit1 = __lsx_vldrepl_b(limit1_ptr, 0);
+  limit0 = __lsx_vilvl_d(limit1, limit0);
+
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
+               mask, flat);
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
+  DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, tmp0, tmp1);
+  tmp2 = __lsx_vilvl_h(tmp1, tmp0);
+  tmp3 = __lsx_vilvh_h(tmp1, tmp0);
+  DUP2_ARG2(__lsx_vilvh_b, p0, p1, q1, q0, tmp0, tmp1);
+  tmp4 = __lsx_vilvl_h(tmp1, tmp0);
+  tmp5 = __lsx_vilvh_h(tmp1, tmp0);
+
+  src -= 2;
+  __lsx_vstelm_w(tmp2, src, 0, 0);
+  __lsx_vstelm_w(tmp2, src + pitch, 0, 1);
+  __lsx_vstelm_w(tmp2, src + pitch2, 0, 2);
+  __lsx_vstelm_w(tmp2, src + pitch3, 0, 3);
+  src += pitch4;
+  __lsx_vstelm_w(tmp3, src, 0, 0);
+  __lsx_vstelm_w(tmp3, src + pitch, 0, 1);
+  __lsx_vstelm_w(tmp3, src + pitch2, 0, 2);
+  __lsx_vstelm_w(tmp3, src + pitch3, 0, 3);
+  src += pitch4;
+  __lsx_vstelm_w(tmp4, src, 0, 0);
+  __lsx_vstelm_w(tmp4, src + pitch, 0, 1);
+  __lsx_vstelm_w(tmp4, src + pitch2, 0, 2);
+  __lsx_vstelm_w(tmp4, src + pitch3, 0, 3);
+  src += pitch4;
+  __lsx_vstelm_w(tmp5, src, 0, 0);
+  __lsx_vstelm_w(tmp5, src + pitch, 0, 1);
+  __lsx_vstelm_w(tmp5, src + pitch2, 0, 2);
+  __lsx_vstelm_w(tmp5, src + pitch3, 0, 3);
+}
diff --git a/vpx_dsp/loongarch/loopfilter_8_lsx.c b/vpx_dsp/loongarch/loopfilter_8_lsx.c
new file mode 100644
index 0000000000..00219ba71d
--- /dev/null
+++ b/vpx_dsp/loongarch/loopfilter_8_lsx.c
@@ -0,0 +1,458 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/loopfilter_lsx.h"
+
+void vpx_lpf_horizontal_8_lsx(uint8_t *dst, int32_t stride,
+                              const uint8_t *b_limit_ptr,
+                              const uint8_t *limit_ptr,
+                              const uint8_t *thresh_ptr) {
+  __m128i mask, hev, flat, thresh, b_limit, limit;
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i p2_out, p1_out, p0_out, q0_out, q1_out;
+  __m128i p2_filter8, p1_filter8, p0_filter8;
+  __m128i q0_filter8, q1_filter8, q2_filter8;
+  __m128i p3_l, p2_l, p1_l, p0_l, q3_l, q2_l, q1_l, q0_l;
+
+  int32_t stride2 = stride << 1;
+  int32_t stride3 = stride2 + stride;
+  int32_t stride4 = stride2 << 1;
+
+  /* load vector elements */
+  DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst,
+            -stride, p3, p2, p1, p0);
+  q0 = __lsx_vld(dst, 0);
+  DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
+  q3 = __lsx_vldx(dst, stride3);
+
+  thresh = __lsx_vldrepl_b(thresh_ptr, 0);
+  b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
+  limit = __lsx_vldrepl_b(limit_ptr, 0);
+
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+  flat = __lsx_vilvl_d(flat, flat);
+
+  if (__lsx_bz_v(flat)) {
+    __lsx_vstelm_d(p1_out, dst - stride2, 0, 0);
+    __lsx_vstelm_d(p0_out, dst - stride, 0, 0);
+    __lsx_vstelm_d(q0_out, dst, 0, 0);
+    __lsx_vstelm_d(q1_out, dst + stride, 0, 0);
+  } else {
+    DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l,
+              p0_l);
+    DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l,
+              q3_l);
+    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filter8,
+                p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
+
+    DUP2_ARG2(__lsx_vpickev_b, p1_filter8, p2_filter8, q0_filter8, p0_filter8,
+              p1_filter8, q0_filter8);
+    q2_filter8 = __lsx_vpickev_b(q2_filter8, q1_filter8);
+
+    p2 = __lsx_vilvl_d(p1_out, p2);
+    p0_out = __lsx_vilvl_d(q0_out, p0_out);
+    q1_out = __lsx_vilvl_d(q2, q1_out);
+
+    DUP2_ARG3(__lsx_vbitsel_v, p2, p1_filter8, flat, p0_out, q0_filter8, flat,
+              p2_out, p1_out);
+    p0_out = __lsx_vbitsel_v(q1_out, q2_filter8, flat);
+    dst -= stride3;
+
+    __lsx_vstelm_d(p2_out, dst, 0, 0);
+    __lsx_vstelm_d(p2_out, dst + stride, 0, 1);
+    __lsx_vstelm_d(p1_out, dst + stride2, 0, 0);
+    __lsx_vstelm_d(p1_out, dst + stride3, 0, 1);
+
+    dst += stride4;
+    __lsx_vstelm_d(p0_out, dst, 0, 0);
+    dst += stride;
+    __lsx_vstelm_d(p0_out, dst, 0, 1);
+  }
+}
+
+void vpx_lpf_horizontal_8_dual_lsx(
+    uint8_t *dst, int32_t stride, const uint8_t *b_limit0,
+    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *b_limit1,
+    const uint8_t *limit1, const uint8_t *thresh1) {
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+  __m128i flat, mask, hev, thresh, b_limit, limit;
+  __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+  __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
+  __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
+  __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
+  __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
+  __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
+
+  int32_t stride2 = stride << 1;
+  int32_t stride3 = stride2 + stride;
+  int32_t stride4 = stride2 << 1;
+
+  DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst,
+            -stride, p3, p2, p1, p0);
+  q0 = __lsx_vld(dst, 0);
+  DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
+  q3 = __lsx_vldx(dst, stride3);
+
+  thresh = __lsx_vldrepl_b(thresh0, 0);
+  p2_out = __lsx_vldrepl_b(thresh1, 0);
+  thresh = __lsx_vilvl_d(p2_out, thresh);
+
+  b_limit = __lsx_vldrepl_b(b_limit0, 0);
+  p2_out = __lsx_vldrepl_b(b_limit1, 0);
+  b_limit = __lsx_vilvl_d(p2_out, b_limit);
+
+  limit = __lsx_vldrepl_b(limit0, 0);
+  p2_out = __lsx_vldrepl_b(limit1, 0);
+  limit = __lsx_vilvl_d(p2_out, limit);
+
+  /* mask and hev */
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+  if (__lsx_bz_v(flat)) {
+    __lsx_vst(p1_out, dst - stride2, 0);
+    __lsx_vst(p0_out, dst - stride, 0);
+    __lsx_vst(q0_out, dst, 0);
+    __lsx_vst(q1_out, dst + stride, 0);
+  } else {
+    DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l,
+              p0_l);
+    DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l,
+              q3_l);
+    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+    DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h);
+    DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h);
+    VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
+                p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
+
+    /* convert 16 bit output data into 8 bit */
+    DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l,
+              p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l,
+              p1_filt8_l, p0_filt8_l, q0_filt8_l);
+    DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l,
+              q1_filt8_l, q2_filt8_l);
+
+    /* store pixel values */
+    p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
+    p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
+    p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
+    q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
+    q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
+    q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
+
+    __lsx_vst(p2_out, dst - stride3, 0);
+    __lsx_vst(p1_out, dst - stride2, 0);
+    __lsx_vst(p0_out, dst - stride, 0);
+    __lsx_vst(q0_out, dst, 0);
+    __lsx_vst(q1_out, dst + stride, 0);
+    __lsx_vst(q2_out, dst + stride2, 0);
+  }
+}
+
+void vpx_lpf_vertical_8_lsx(uint8_t *dst, int32_t stride,
+                            const uint8_t *b_limit_ptr,
+                            const uint8_t *limit_ptr,
+                            const uint8_t *thresh_ptr) {
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i p1_out, p0_out, q0_out, q1_out;
+  __m128i flat, mask, hev, thresh, b_limit, limit;
+  __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+  __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
+  __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
+  __m128i zero = __lsx_vldi(0);
+
+  int32_t stride2 = stride << 1;
+  int32_t stride3 = stride2 + stride;
+  int32_t stride4 = stride2 << 1;
+  uint8_t *dst_tmp = dst - 4;
+
+  /* load vector elements */
+  p3 = __lsx_vld(dst_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p2, p1);
+  p0 = __lsx_vldx(dst_tmp, stride3);
+  dst_tmp += stride4;
+  q0 = __lsx_vld(dst_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q1, q2);
+  q3 = __lsx_vldx(dst_tmp, stride3);
+
+  LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2,
+                     q3);
+
+  thresh = __lsx_vldrepl_b(thresh_ptr, 0);
+  b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
+  limit = __lsx_vldrepl_b(limit_ptr, 0);
+
+  /* mask and hev */
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  /* flat4 */
+  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  /* filter4 */
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+  flat = __lsx_vilvl_d(zero, flat);
+
+  /* if flat is zero for all pixels, then no need to calculate other filter */
+  if (__lsx_bz_v(flat)) {
+    /* Store 4 pixels p1-_q1 */
+    DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, p0, p1);
+    p2 = __lsx_vilvl_h(p1, p0);
+    p3 = __lsx_vilvh_h(p1, p0);
+
+    dst -= 2;
+    __lsx_vstelm_w(p2, dst, 0, 0);
+    __lsx_vstelm_w(p2, dst + stride, 0, 1);
+    __lsx_vstelm_w(p2, dst + stride2, 0, 2);
+    __lsx_vstelm_w(p2, dst + stride3, 0, 3);
+    dst += stride4;
+    __lsx_vstelm_w(p3, dst, 0, 0);
+    __lsx_vstelm_w(p3, dst + stride, 0, 1);
+    __lsx_vstelm_w(p3, dst + stride2, 0, 2);
+    __lsx_vstelm_w(p3, dst + stride3, 0, 3);
+  } else {
+    DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l,
+              p1_l, p0_l);
+    DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l,
+              q2_l, q3_l);
+    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+    /* convert 16 bit output data into 8 bit */
+    DUP4_ARG2(__lsx_vpickev_b, p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
+              p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l, p2_filt8_l,
+              p1_filt8_l, p0_filt8_l, q0_filt8_l);
+    DUP2_ARG2(__lsx_vpickev_b, q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
+              q1_filt8_l, q2_filt8_l);
+    /* store pixel values */
+    p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
+    p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
+    p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
+    q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
+    q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
+    q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
+
+    /* Store 6 pixels p2-_q2 */
+    DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, p3, q3);
+    p1 = __lsx_vilvl_h(q3, p3);
+    p2 = __lsx_vilvh_h(q3, p3);
+    p3 = __lsx_vilvl_b(q2, q1);
+    dst -= 3;
+    __lsx_vstelm_w(p1, dst, 0, 0);
+    __lsx_vstelm_h(p3, dst, 4, 0);
+    dst += stride;
+    __lsx_vstelm_w(p1, dst, 0, 1);
+    __lsx_vstelm_h(p3, dst, 4, 1);
+    dst += stride;
+    __lsx_vstelm_w(p1, dst, 0, 2);
+    __lsx_vstelm_h(p3, dst, 4, 2);
+    dst += stride;
+    __lsx_vstelm_w(p1, dst, 0, 3);
+    __lsx_vstelm_h(p3, dst, 4, 3);
+    dst += stride;
+    __lsx_vstelm_w(p2, dst, 0, 0);
+    __lsx_vstelm_h(p3, dst, 4, 4);
+    dst += stride;
+    __lsx_vstelm_w(p2, dst, 0, 1);
+    __lsx_vstelm_h(p3, dst, 4, 5);
+    dst += stride;
+    __lsx_vstelm_w(p2, dst, 0, 2);
+    __lsx_vstelm_h(p3, dst, 4, 6);
+    dst += stride;
+    __lsx_vstelm_w(p2, dst, 0, 3);
+    __lsx_vstelm_h(p3, dst, 4, 7);
+  }
+}
+
+void vpx_lpf_vertical_8_dual_lsx(uint8_t *dst, int32_t stride,
+                                 const uint8_t *b_limit0, const uint8_t *limit0,
+                                 const uint8_t *thresh0,
+                                 const uint8_t *b_limit1, const uint8_t *limit1,
+                                 const uint8_t *thresh1) {
+  uint8_t *dst_tmp = dst - 4;
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i p1_out, p0_out, q0_out, q1_out;
+  __m128i flat, mask, hev, thresh, b_limit, limit;
+  __m128i row4, row5, row6, row7, row12, row13, row14, row15;
+  __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+  __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
+  __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
+  __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
+  __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
+  __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
+  int32_t stride2 = stride << 1;
+  int32_t stride3 = stride2 + stride;
+  int32_t stride4 = stride2 << 1;
+
+  p0 = __lsx_vld(dst_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p1, p2);
+  p3 = __lsx_vldx(dst_tmp, stride3);
+  dst_tmp += stride4;
+  row4 = __lsx_vld(dst_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row5, row6);
+  row7 = __lsx_vldx(dst_tmp, stride3);
+  dst_tmp += stride4;
+
+  q3 = __lsx_vld(dst_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q2, q1);
+  q0 = __lsx_vldx(dst_tmp, stride3);
+  dst_tmp += stride4;
+  row12 = __lsx_vld(dst_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row13, row14);
+  row15 = __lsx_vldx(dst_tmp, stride3);
+
+  /* transpose 16x8 matrix into 8x16 */
+  LSX_TRANSPOSE16x8_B(p0, p1, p2, p3, row4, row5, row6, row7, q3, q2, q1, q0,
+                      row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2,
+                      q3);
+
+  thresh = __lsx_vldrepl_b(thresh0, 0);
+  p1_out = __lsx_vldrepl_b(thresh1, 0);
+  thresh = __lsx_vilvl_d(p1_out, thresh);
+
+  b_limit = __lsx_vldrepl_b(b_limit0, 0);
+  p1_out = __lsx_vldrepl_b(b_limit1, 0);
+  b_limit = __lsx_vilvl_d(p1_out, b_limit);
+
+  limit = __lsx_vldrepl_b(limit0, 0);
+  p1_out = __lsx_vldrepl_b(limit1, 0);
+  limit = __lsx_vilvl_d(p1_out, limit);
+
+  /* mask and hev */
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  /* flat4 */
+  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  /* filter4 */
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+  /* if flat is zero for all pixels, then no need to calculate other filter */
+  if (__lsx_bz_v(flat)) {
+    DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, p0, p1);
+    p2 = __lsx_vilvl_h(p1, p0);
+    p3 = __lsx_vilvh_h(p1, p0);
+    DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, p0, p1);
+    q2 = __lsx_vilvl_h(p1, p0);
+    q3 = __lsx_vilvh_h(p1, p0);
+    dst -= 2;
+    __lsx_vstelm_w(p2, dst, 0, 0);
+    __lsx_vstelm_w(p2, dst + stride, 0, 1);
+    __lsx_vstelm_w(p2, dst + stride2, 0, 2);
+    __lsx_vstelm_w(p2, dst + stride3, 0, 3);
+    dst += stride4;
+    __lsx_vstelm_w(p3, dst, 0, 0);
+    __lsx_vstelm_w(p3, dst + stride, 0, 1);
+    __lsx_vstelm_w(p3, dst + stride2, 0, 2);
+    __lsx_vstelm_w(p3, dst + stride3, 0, 3);
+    dst += stride4;
+    __lsx_vstelm_w(q2, dst, 0, 0);
+    __lsx_vstelm_w(q2, dst + stride, 0, 1);
+    __lsx_vstelm_w(q2, dst + stride2, 0, 2);
+    __lsx_vstelm_w(q2, dst + stride3, 0, 3);
+    dst += stride4;
+    __lsx_vstelm_w(q3, dst, 0, 0);
+    __lsx_vstelm_w(q3, dst + stride, 0, 1);
+    __lsx_vstelm_w(q3, dst + stride2, 0, 2);
+    __lsx_vstelm_w(q3, dst + stride3, 0, 3);
+  } else {
+    DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l,
+              p0_l);
+    DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l,
+              q3_l);
+    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+    DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h);
+    DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h);
+
+    /* filter8 */
+    VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
+                p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
+
+    /* convert 16 bit output data into 8 bit */
+    DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l,
+              p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l,
+              p1_filt8_l, p0_filt8_l, q0_filt8_l);
+    DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l,
+              q1_filt8_l, q2_filt8_l);
+
+    /* store pixel values */
+    p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
+    p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
+    p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
+    q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
+    q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
+    q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
+
+    DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, p3, q3);
+    p2_filt8_l = __lsx_vilvl_h(q3, p3);
+    p2_filt8_h = __lsx_vilvh_h(q3, p3);
+    DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, p3, q3);
+    p0_filt8_l = __lsx_vilvl_h(q3, p3);
+    p0_filt8_h = __lsx_vilvh_h(q3, p3);
+    q1_filt8_l = __lsx_vilvl_b(q2, q1);
+    q1_filt8_h = __lsx_vilvh_b(q2, q1);
+
+    dst -= 3;
+    __lsx_vstelm_w(p2_filt8_l, dst, 0, 0);
+    __lsx_vstelm_h(q1_filt8_l, dst, 4, 0);
+    dst += stride;
+    __lsx_vstelm_w(p2_filt8_l, dst, 0, 1);
+    __lsx_vstelm_h(q1_filt8_l, dst, 4, 1);
+    dst += stride;
+    __lsx_vstelm_w(p2_filt8_l, dst, 0, 2);
+    __lsx_vstelm_h(q1_filt8_l, dst, 4, 2);
+    dst += stride;
+    __lsx_vstelm_w(p2_filt8_l, dst, 0, 3);
+    __lsx_vstelm_h(q1_filt8_l, dst, 4, 3);
+    dst += stride;
+    __lsx_vstelm_w(p2_filt8_h, dst, 0, 0);
+    __lsx_vstelm_h(q1_filt8_l, dst, 4, 4);
+    dst += stride;
+    __lsx_vstelm_w(p2_filt8_h, dst, 0, 1);
+    __lsx_vstelm_h(q1_filt8_l, dst, 4, 5);
+    dst += stride;
+    __lsx_vstelm_w(p2_filt8_h, dst, 0, 2);
+    __lsx_vstelm_h(q1_filt8_l, dst, 4, 6);
+    dst += stride;
+    __lsx_vstelm_w(p2_filt8_h, dst, 0, 3);
+    __lsx_vstelm_h(q1_filt8_l, dst, 4, 7);
+    dst += stride;
+    __lsx_vstelm_w(p0_filt8_l, dst, 0, 0);
+    __lsx_vstelm_h(q1_filt8_h, dst, 4, 0);
+    dst += stride;
+    __lsx_vstelm_w(p0_filt8_l, dst, 0, 1);
+    __lsx_vstelm_h(q1_filt8_h, dst, 4, 1);
+    dst += stride;
+    __lsx_vstelm_w(p0_filt8_l, dst, 0, 2);
+    __lsx_vstelm_h(q1_filt8_h, dst, 4, 2);
+    dst += stride;
+    __lsx_vstelm_w(p0_filt8_l, dst, 0, 3);
+    __lsx_vstelm_h(q1_filt8_h, dst, 4, 3);
+    dst += stride;
+    __lsx_vstelm_w(p0_filt8_h, dst, 0, 0);
+    __lsx_vstelm_h(q1_filt8_h, dst, 4, 4);
+    dst += stride;
+    __lsx_vstelm_w(p0_filt8_h, dst, 0, 1);
+    __lsx_vstelm_h(q1_filt8_h, dst, 4, 5);
+    dst += stride;
+    __lsx_vstelm_w(p0_filt8_h, dst, 0, 2);
+    __lsx_vstelm_h(q1_filt8_h, dst, 4, 6);
+    dst += stride;
+    __lsx_vstelm_w(p0_filt8_h, dst, 0, 3);
+    __lsx_vstelm_h(q1_filt8_h, dst, 4, 7);
+  }
+}
diff --git a/vpx_dsp/loongarch/loopfilter_lsx.h b/vpx_dsp/loongarch/loopfilter_lsx.h
new file mode 100644
index 0000000000..1c43836503
--- /dev/null
+++ b/vpx_dsp/loongarch/loopfilter_lsx.h
@@ -0,0 +1,167 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_LOONGARCH_LOOPFILTER_LSX_H_
+#define VPX_VPX_DSP_LOONGARCH_LOOPFILTER_LSX_H_
+
+#include "vpx_util/loongson_intrinsics.h"
+
+#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
+                     limit_in, b_limit_in, thresh_in, hev_out, mask_out,     \
+                     flat_out)                                               \
+  do {                                                                       \
+    __m128i p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;          \
+    __m128i p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;          \
+                                                                             \
+    /* absolute subtraction of pixel values */                               \
+    p3_asub_p2_m = __lsx_vabsd_bu(p3_in, p2_in);                             \
+    p2_asub_p1_m = __lsx_vabsd_bu(p2_in, p1_in);                             \
+    p1_asub_p0_m = __lsx_vabsd_bu(p1_in, p0_in);                             \
+    q1_asub_q0_m = __lsx_vabsd_bu(q1_in, q0_in);                             \
+    q2_asub_q1_m = __lsx_vabsd_bu(q2_in, q1_in);                             \
+    q3_asub_q2_m = __lsx_vabsd_bu(q3_in, q2_in);                             \
+    p0_asub_q0_m = __lsx_vabsd_bu(p0_in, q0_in);                             \
+    p1_asub_q1_m = __lsx_vabsd_bu(p1_in, q1_in);                             \
+                                                                             \
+    /* calculation of hev */                                                 \
+    flat_out = __lsx_vmax_bu(p1_asub_p0_m, q1_asub_q0_m);                    \
+    hev_out = __lsx_vslt_bu(thresh_in, flat_out);                            \
+                                                                             \
+    /* calculation of mask */                                                \
+    p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p0_asub_q0_m);               \
+    p1_asub_q1_m = __lsx_vsrli_b(p1_asub_q1_m, 1);                           \
+    p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p1_asub_q1_m);               \
+    mask_out = __lsx_vslt_bu(b_limit_in, p0_asub_q0_m);                      \
+    mask_out = __lsx_vmax_bu(flat_out, mask_out);                            \
+    p3_asub_p2_m = __lsx_vmax_bu(p3_asub_p2_m, p2_asub_p1_m);                \
+    mask_out = __lsx_vmax_bu(p3_asub_p2_m, mask_out);                        \
+    q2_asub_q1_m = __lsx_vmax_bu(q2_asub_q1_m, q3_asub_q2_m);                \
+    mask_out = __lsx_vmax_bu(q2_asub_q1_m, mask_out);                        \
+                                                                             \
+    mask_out = __lsx_vslt_bu(limit_in, mask_out);                            \
+    mask_out = __lsx_vxori_b(mask_out, 0xff);                                \
+  } while (0)
+
+#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out)          \
+  do {                                                                         \
+    __m128i p2_asub_p0, q2_asub_q0, p3_asub_p0, q3_asub_q0;                    \
+    __m128i flat4_tmp = __lsx_vldi(1);                                         \
+                                                                               \
+    DUP4_ARG2(__lsx_vabsd_bu, p2_in, p0_in, q2_in, q0_in, p3_in, p0_in, q3_in, \
+              q0_in, p2_asub_p0, q2_asub_q0, p3_asub_p0, q3_asub_q0);          \
+    p2_asub_p0 = __lsx_vmax_bu(p2_asub_p0, q2_asub_q0);                        \
+    flat_out = __lsx_vmax_bu(p2_asub_p0, flat_out);                            \
+    p3_asub_p0 = __lsx_vmax_bu(p3_asub_p0, q3_asub_q0);                        \
+    flat_out = __lsx_vmax_bu(p3_asub_p0, flat_out);                            \
+                                                                               \
+    flat_out = __lsx_vslt_bu(flat4_tmp, flat_out);                             \
+    flat_out = __lsx_vxori_b(flat_out, 0xff);                                  \
+    flat_out = flat_out & (mask);                                              \
+  } while (0)
+
+#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in,      \
+                  q6_in, q7_in, flat_in, flat2_out)                            \
+  do {                                                                         \
+    __m128i flat5_tmp = __lsx_vldi(1);                                         \
+    __m128i p4_asub_p0, q4_asub_q0, p5_asub_p0, q5_asub_q0;                    \
+    __m128i p6_asub_p0, q6_asub_q0, p7_asub_p0, q7_asub_q0;                    \
+    DUP4_ARG2(__lsx_vabsd_bu, p4_in, p0_in, q4_in, q0_in, p5_in, p0_in, q5_in, \
+              q0_in, p4_asub_p0, q4_asub_q0, p5_asub_p0, q5_asub_q0);          \
+    DUP4_ARG2(__lsx_vabsd_bu, p6_in, p0_in, q6_in, q0_in, p7_in, p0_in, q7_in, \
+              q0_in, p6_asub_p0, q6_asub_q0, p7_asub_p0, q7_asub_q0);          \
+                                                                               \
+    DUP2_ARG2(__lsx_vmax_bu, p4_asub_p0, q4_asub_q0, p5_asub_p0, q5_asub_q0,   \
+              p4_asub_p0, flat2_out);                                          \
+    flat2_out = __lsx_vmax_bu(p4_asub_p0, flat2_out);                          \
+    p6_asub_p0 = __lsx_vmax_bu(p6_asub_p0, q6_asub_q0);                        \
+    flat2_out = __lsx_vmax_bu(p6_asub_p0, flat2_out);                          \
+    p7_asub_p0 = __lsx_vmax_bu(p7_asub_p0, q7_asub_q0);                        \
+    flat2_out = __lsx_vmax_bu(p7_asub_p0, flat2_out);                          \
+    flat2_out = __lsx_vslt_bu(flat5_tmp, flat2_out);                           \
+    flat2_out = __lsx_vxori_b(flat2_out, 0xff);                                \
+    flat2_out = flat2_out & flat_in;                                           \
+  } while (0)
+
+#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask, hev, p1_out,  \
+                           p0_out, q0_out, q1_out)                         \
+  do {                                                                     \
+    __m128i p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2;               \
+    const __m128i cnst4b = __lsx_vldi(4);                                  \
+    const __m128i cnst3b = __lsx_vldi(3);                                  \
+    DUP4_ARG2(__lsx_vxori_b, p1_in, 0x80, p0_in, 0x80, q0_in, 0x80, q1_in, \
+              0x80, p1_m, p0_m, q0_m, q1_m);                               \
+    filt = __lsx_vssub_b(p1_m, q1_m);                                      \
+    filt &= hev;                                                           \
+                                                                           \
+    q0_sub_p0 = __lsx_vssub_b(q0_m, p0_m);                                 \
+    filt = __lsx_vsadd_b(filt, q0_sub_p0);                                 \
+    filt = __lsx_vsadd_b(filt, q0_sub_p0);                                 \
+    filt = __lsx_vsadd_b(filt, q0_sub_p0);                                 \
+    filt &= mask;                                                          \
+    DUP2_ARG2(__lsx_vsadd_b, filt, cnst4b, filt, cnst3b, t1, t2);          \
+    DUP2_ARG2(__lsx_vsrai_b, t1, 3, t2, 3, t1, t2);                        \
+                                                                           \
+    q0_m = __lsx_vssub_b(q0_m, t1);                                        \
+    p0_m = __lsx_vsadd_b(p0_m, t2);                                        \
+    DUP2_ARG2(__lsx_vxori_b, q0_m, 0x80, p0_m, 0x80, q0_out, p0_out);      \
+                                                                           \
+    filt = __lsx_vsrari_b(t1, 1);                                          \
+    hev = __lsx_vxori_b(hev, 0xff);                                        \
+    filt &= hev;                                                           \
+    q1_m = __lsx_vssub_b(q1_m, filt);                                      \
+    p1_m = __lsx_vsadd_b(p1_m, filt);                                      \
+    DUP2_ARG2(__lsx_vxori_b, q1_m, 0x80, p1_m, 0x80, q1_out, p1_out);      \
+  } while (0)
+
+#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
+                    p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \
+                    q1_filt8_out, q2_filt8_out)                             \
+  do {                                                                      \
+    __m128i tmp_filt8_0, tmp_filt8_1, tmp_filt8_2;                          \
+                                                                            \
+    tmp_filt8_2 = __lsx_vadd_h(p2_in, p1_in);                               \
+    tmp_filt8_2 = __lsx_vadd_h(tmp_filt8_2, p0_in);                         \
+    tmp_filt8_0 = __lsx_vslli_h(p3_in, 1);                                  \
+                                                                            \
+    tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, tmp_filt8_2);                   \
+    tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, q0_in);                         \
+    tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, p3_in);                         \
+    tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, p2_in);                         \
+    p2_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3);                          \
+                                                                            \
+    tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, p1_in);                         \
+    tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, q1_in);                         \
+    p1_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3);                          \
+                                                                            \
+    tmp_filt8_1 = __lsx_vadd_h(q2_in, q1_in);                               \
+    tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, q0_in);                         \
+    tmp_filt8_2 = __lsx_vadd_h(tmp_filt8_2, tmp_filt8_1);                   \
+    tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_2, p0_in);                         \
+    tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, p3_in);                         \
+    p0_filt8_out = __lsx_vsrari_h(tmp_filt8_0, 3);                          \
+                                                                            \
+    tmp_filt8_0 = __lsx_vadd_h(q2_in, q3_in);                               \
+    tmp_filt8_0 = __lsx_vadd_h(p0_in, tmp_filt8_0);                         \
+    tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, tmp_filt8_1);                   \
+    tmp_filt8_1 = __lsx_vadd_h(q3_in, q3_in);                               \
+    tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, tmp_filt8_0);                   \
+    q2_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3);                          \
+                                                                            \
+    tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_2, q3_in);                         \
+    tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, q0_in);                         \
+    q0_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3);                          \
+                                                                            \
+    tmp_filt8_1 = __lsx_vsub_h(tmp_filt8_0, p2_in);                         \
+    tmp_filt8_0 = __lsx_vadd_h(q1_in, q3_in);                               \
+    tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, tmp_filt8_1);                   \
+    q1_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3);                          \
+  } while (0)
+
+#endif  // VPX_VPX_DSP_LOONGARCH_LOOPFILTER_LSX_H_
diff --git a/vpx_dsp/loongarch/quantize_lsx.c b/vpx_dsp/loongarch/quantize_lsx.c
new file mode 100644
index 0000000000..9bb1691e2e
--- /dev/null
+++ b/vpx_dsp/loongarch/quantize_lsx.c
@@ -0,0 +1,244 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
+
+static INLINE __m128i calculate_qcoeff(__m128i coeff, __m128i coeff_abs,
+                                       __m128i round, __m128i quant,
+                                       __m128i shift, __m128i cmp_mask) {
+  __m128i rounded, qcoeff;
+
+  rounded = __lsx_vsadd_h(coeff_abs, round);
+  qcoeff = __lsx_vmuh_h(rounded, quant);
+  qcoeff = __lsx_vadd_h(rounded, qcoeff);
+  qcoeff = __lsx_vmuh_h(qcoeff, shift);
+  qcoeff = __lsx_vsigncov_h(coeff, qcoeff);
+  qcoeff = __lsx_vand_v(qcoeff, cmp_mask);
+
+  return qcoeff;
+}
+
+static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant,
+                                               int16_t *dqcoeff) {
+  __m128i dqcoeff16 = __lsx_vmul_h(qcoeff, dequant);
+  __lsx_vst(dqcoeff16, dqcoeff, 0);
+}
+
+static INLINE void calculate_dqcoeff_and_store_32x32(__m128i qcoeff,
+                                                     __m128i dequant,
+                                                     int16_t *dqcoeff) {
+  // Un-sign to bias rounding like C.
+  __m128i low, high, dqcoeff32_0, dqcoeff32_1, res;
+  __m128i zero = __lsx_vldi(0);
+  __m128i coeff = __lsx_vabsd_h(qcoeff, zero);
+
+  const __m128i sign_0 = __lsx_vilvl_h(qcoeff, zero);
+  const __m128i sign_1 = __lsx_vilvh_h(qcoeff, zero);
+
+  low = __lsx_vmul_h(coeff, dequant);
+  high = __lsx_vmuh_h(coeff, dequant);
+  dqcoeff32_0 = __lsx_vilvl_h(high, low);
+  dqcoeff32_1 = __lsx_vilvh_h(high, low);
+
+  // "Divide" by 2.
+  dqcoeff32_0 = __lsx_vsrai_w(dqcoeff32_0, 1);
+  dqcoeff32_1 = __lsx_vsrai_w(dqcoeff32_1, 1);
+  dqcoeff32_0 = __lsx_vsigncov_w(sign_0, dqcoeff32_0);
+  dqcoeff32_1 = __lsx_vsigncov_w(sign_1, dqcoeff32_1);
+  res = __lsx_vpickev_h(dqcoeff32_1, dqcoeff32_0);
+  __lsx_vst(res, dqcoeff, 0);
+}
+
+static INLINE __m128i scan_for_eob(__m128i coeff0, __m128i coeff1,
+                                   const int16_t *scan, int index,
+                                   __m128i zero) {
+  const __m128i zero_coeff0 = __lsx_vseq_h(coeff0, zero);
+  const __m128i zero_coeff1 = __lsx_vseq_h(coeff1, zero);
+  __m128i scan0 = __lsx_vld(scan + index, 0);
+  __m128i scan1 = __lsx_vld(scan + index + 8, 0);
+  __m128i eob0, eob1;
+
+  eob0 = __lsx_vandn_v(zero_coeff0, scan0);
+  eob1 = __lsx_vandn_v(zero_coeff1, scan1);
+  return __lsx_vmax_h(eob0, eob1);
+}
+
+static INLINE int16_t accumulate_eob(__m128i eob) {
+  __m128i eob_shuffled;
+  int16_t res_m;
+
+  eob_shuffled = __lsx_vshuf4i_w(eob, 0xe);
+  eob = __lsx_vmax_h(eob, eob_shuffled);
+  eob_shuffled = __lsx_vshuf4i_h(eob, 0xe);
+  eob = __lsx_vmax_h(eob, eob_shuffled);
+  eob_shuffled = __lsx_vshuf4i_h(eob, 0x1);
+  eob = __lsx_vmax_h(eob, eob_shuffled);
+  res_m = __lsx_vpickve2gr_h(eob, 1);
+
+  return res_m;
+}
+
+#if !CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs,
+                        const struct macroblock_plane *const mb_plane,
+                        tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                        const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                        const struct ScanOrder *const scan_order) {
+  __m128i zero = __lsx_vldi(0);
+  int index = 16;
+  const int16_t *iscan = scan_order->iscan;
+
+  __m128i zbin, round, quant, dequant, quant_shift;
+  __m128i coeff0, coeff1;
+  __m128i qcoeff0, qcoeff1;
+  __m128i cmp_mask0, cmp_mask1;
+  __m128i eob, eob0;
+
+  zbin = __lsx_vld(mb_plane->zbin, 0);
+  round = __lsx_vld(mb_plane->round, 0);
+  quant = __lsx_vld(mb_plane->quant, 0);
+  dequant = __lsx_vld(dequant_ptr, 0);
+  quant_shift = __lsx_vld(mb_plane->quant_shift, 0);
+  // Handle one DC and first 15 AC.
+  DUP2_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, coeff0, coeff1);
+  qcoeff0 = __lsx_vabsd_h(coeff0, zero);
+  qcoeff1 = __lsx_vabsd_h(coeff1, zero);
+
+  cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0);
+  zbin = __lsx_vilvh_d(zbin, zbin);
+  cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1);
+
+  qcoeff0 =
+      calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
+  round = __lsx_vilvh_d(round, round);
+  quant = __lsx_vilvh_d(quant, quant);
+  quant_shift = __lsx_vilvh_d(quant_shift, quant_shift);
+  qcoeff1 =
+      calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);
+
+  __lsx_vst(qcoeff0, qcoeff_ptr, 0);
+  __lsx_vst(qcoeff1, qcoeff_ptr, 16);
+
+  calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr);
+  dequant = __lsx_vilvh_d(dequant, dequant);
+  calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);
+
+  eob = scan_for_eob(qcoeff0, qcoeff1, iscan, 0, zero);
+  // AC only loop.
+  while (index < n_coeffs) {
+    coeff0 = __lsx_vld(coeff_ptr + index, 0);
+    coeff1 = __lsx_vld(coeff_ptr + index + 8, 0);
+
+    qcoeff0 = __lsx_vabsd_h(coeff0, zero);
+    qcoeff1 = __lsx_vabsd_h(coeff1, zero);
+
+    cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0);
+    cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1);
+
+    qcoeff0 =
+        calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
+    qcoeff1 =
+        calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);
+
+    __lsx_vst(qcoeff0, qcoeff_ptr + index, 0);
+    __lsx_vst(qcoeff1, qcoeff_ptr + index + 8, 0);
+
+    calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
+    calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);
+
+    eob0 = scan_for_eob(qcoeff0, qcoeff1, iscan, index, zero);
+    eob = __lsx_vmax_h(eob, eob0);
+
+    index += 16;
+  }
+
+  *eob_ptr = accumulate_eob(eob);
+}
+
+void vpx_quantize_b_32x32_lsx(const tran_low_t *coeff_ptr,
+                              const struct macroblock_plane *const mb_plane,
+                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                              const struct ScanOrder *const scan_order) {
+  __m128i zero = __lsx_vldi(0);
+  int index;
+  const int16_t *iscan = scan_order->iscan;
+
+  __m128i zbin, round, quant, dequant, quant_shift;
+  __m128i coeff0, coeff1, qcoeff0, qcoeff1, cmp_mask0, cmp_mask1;
+  __m128i eob = zero, eob0;
+
+  zbin = __lsx_vld(mb_plane->zbin, 0);
+  zbin = __lsx_vsrari_h(zbin, 1);
+  round = __lsx_vld(mb_plane->round, 0);
+  round = __lsx_vsrari_h(round, 1);
+
+  quant = __lsx_vld(mb_plane->quant, 0);
+  dequant = __lsx_vld(dequant_ptr, 0);
+  quant_shift = __lsx_vld(mb_plane->quant_shift, 0);
+  quant_shift = __lsx_vslli_h(quant_shift, 1);
+  // Handle one DC and first 15 AC.
+  DUP2_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, coeff0, coeff1);
+  qcoeff0 = __lsx_vabsd_h(coeff0, zero);
+  qcoeff1 = __lsx_vabsd_h(coeff1, zero);
+
+  cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0);
+  // remove DC from zbin
+  zbin = __lsx_vilvh_d(zbin, zbin);
+  cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1);
+
+  qcoeff0 =
+      calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
+  // remove DC in quant_shift, quant, quant_shift
+  round = __lsx_vilvh_d(round, round);
+  quant = __lsx_vilvh_d(quant, quant);
+  quant_shift = __lsx_vilvh_d(quant_shift, quant_shift);
+  qcoeff1 =
+      calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);
+  __lsx_vst(qcoeff0, qcoeff_ptr, 0);
+  __lsx_vst(qcoeff1, qcoeff_ptr, 16);
+
+  calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr);
+  dequant = __lsx_vilvh_d(dequant, dequant);
+  calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, dqcoeff_ptr + 8);
+  eob = scan_for_eob(qcoeff0, qcoeff1, iscan, 0, zero);
+  // AC only loop.
+  for (index = 16; index < 32 * 32; index += 16) {
+    coeff0 = __lsx_vld(coeff_ptr + index, 0);
+    coeff1 = __lsx_vld(coeff_ptr + index + 8, 0);
+
+    qcoeff0 = __lsx_vabsd_h(coeff0, zero);
+    qcoeff1 = __lsx_vabsd_h(coeff1, zero);
+
+    cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0);
+    cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1);
+
+    qcoeff0 =
+        calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
+    qcoeff1 =
+        calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);
+    __lsx_vst(qcoeff0, qcoeff_ptr + index, 0);
+    __lsx_vst(qcoeff1, qcoeff_ptr + index + 8, 0);
+
+    calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr + index);
+    calculate_dqcoeff_and_store_32x32(qcoeff1, dequant,
+                                      dqcoeff_ptr + 8 + index);
+    eob0 = scan_for_eob(qcoeff0, qcoeff1, iscan, index, zero);
+    eob = __lsx_vmax_h(eob, eob0);
+  }
+
+  *eob_ptr = accumulate_eob(eob);
+}
+#endif
diff --git a/vpx_dsp/loongarch/sad_lsx.c b/vpx_dsp/loongarch/sad_lsx.c
new file mode 100644
index 0000000000..b6fbedb0d0
--- /dev/null
+++ b/vpx_dsp/loongarch/sad_lsx.c
@@ -0,0 +1,717 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+static INLINE __m128i sad_ub2_uh(__m128i in0, __m128i in1, __m128i ref0,
+                                 __m128i ref1) {
+  __m128i diff0_m, diff1_m, sad_m0;
+  __m128i sad_m = __lsx_vldi(0);
+
+  diff0_m = __lsx_vabsd_bu(in0, ref0);
+  diff1_m = __lsx_vabsd_bu(in1, ref1);
+
+  sad_m0 = __lsx_vhaddw_hu_bu(diff0_m, diff0_m);
+  sad_m = __lsx_vadd_h(sad_m, sad_m0);
+  sad_m0 = __lsx_vhaddw_hu_bu(diff1_m, diff1_m);
+  sad_m = __lsx_vadd_h(sad_m, sad_m0);
+
+  return sad_m;
+}
+
+static INLINE uint32_t hadd_uw_u32(__m128i in) {
+  __m128i res0_m;
+  uint32_t sum_m;
+
+  res0_m = __lsx_vhaddw_du_wu(in, in);
+  res0_m = __lsx_vhaddw_qu_du(res0_m, res0_m);
+  sum_m = __lsx_vpickve2gr_w(res0_m, 0);
+
+  return sum_m;
+}
+
+static INLINE uint32_t hadd_uh_u32(__m128i in) {
+  __m128i res_m;
+  uint32_t sum_m;
+
+  res_m = __lsx_vhaddw_wu_hu(in, in);
+  sum_m = hadd_uw_u32(res_m);
+
+  return sum_m;
+}
+
+static INLINE int32_t hadd_sw_s32(__m128i in) {
+  __m128i res0_m;
+  int32_t sum_m;
+
+  res0_m = __lsx_vhaddw_d_w(in, in);
+  res0_m = __lsx_vhaddw_q_d(res0_m, res0_m);
+  sum_m = __lsx_vpickve2gr_w(res0_m, 0);
+
+  return sum_m;
+}
+
+static uint32_t sad_8width_lsx(const uint8_t *src, int32_t src_stride,
+                               const uint8_t *ref, int32_t ref_stride,
+                               int32_t height) {
+  int32_t ht_cnt;
+  uint32_t res;
+  __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3, sad_tmp;
+  __m128i sad = __lsx_vldi(0);
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0);
+    src += src_stride;
+    ref += ref_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src1, ref1);
+    src += src_stride;
+    ref += ref_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src2, ref2);
+    src += src_stride;
+    ref += ref_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src3, ref3);
+    src += src_stride;
+    ref += ref_stride;
+    DUP4_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, ref1, ref0, ref3, ref2,
+              src0, src1, ref0, ref1);
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad = __lsx_vadd_h(sad, sad_tmp);
+  }
+  res = hadd_uh_u32(sad);
+  return res;
+}
+
+static uint32_t sad_16width_lsx(const uint8_t *src, int32_t src_stride,
+                                const uint8_t *ref, int32_t ref_stride,
+                                int32_t height) {
+  int32_t ht_cnt = (height >> 2);
+  uint32_t res;
+  __m128i src0, src1, ref0, ref1, sad_tmp;
+  __m128i sad = __lsx_vldi(0);
+  int32_t src_stride2 = src_stride << 1;
+  int32_t ref_stride2 = ref_stride << 1;
+
+  for (; ht_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, ref, ref_stride, src1, ref1);
+    src += src_stride2;
+    ref += ref_stride2;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad = __lsx_vadd_h(sad, sad_tmp);
+
+    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, ref, ref_stride, src1, ref1);
+    src += src_stride2;
+    ref += ref_stride2;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad = __lsx_vadd_h(sad, sad_tmp);
+  }
+
+  res = hadd_uh_u32(sad);
+  return res;
+}
+
+static uint32_t sad_32width_lsx(const uint8_t *src, int32_t src_stride,
+                                const uint8_t *ref, int32_t ref_stride,
+                                int32_t height) {
+  int32_t ht_cnt = (height >> 2);
+  uint32_t res;
+  __m128i src0, src1, ref0, ref1;
+  __m128i sad_tmp;
+  __m128i sad = __lsx_vldi(0);
+
+  for (; ht_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1);
+    ref += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad = __lsx_vadd_h(sad, sad_tmp);
+
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1);
+    ref += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad = __lsx_vadd_h(sad, sad_tmp);
+
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1);
+    ref += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad = __lsx_vadd_h(sad, sad_tmp);
+
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1);
+    ref += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad = __lsx_vadd_h(sad, sad_tmp);
+  }
+  res = hadd_uh_u32(sad);
+  return res;
+}
+
+static uint32_t sad_64width_lsx(const uint8_t *src, int32_t src_stride,
+                                const uint8_t *ref, int32_t ref_stride,
+                                int32_t height) {
+  int32_t ht_cnt = (height >> 1);
+  uint32_t sad = 0;
+  __m128i src0, src1, src2, src3;
+  __m128i ref0, ref1, ref2, ref3;
+  __m128i sad_tmp;
+  __m128i sad0 = __lsx_vldi(0);
+  __m128i sad1 = sad0;
+
+  for (; ht_cnt--;) {
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+              src3);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
+              ref3);
+    ref += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad0 = __lsx_vadd_h(sad0, sad_tmp);
+    sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
+    sad1 = __lsx_vadd_h(sad1, sad_tmp);
+
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+              src3);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
+              ref3);
+    ref += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad0 = __lsx_vadd_h(sad0, sad_tmp);
+    sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
+    sad1 = __lsx_vadd_h(sad1, sad_tmp);
+  }
+
+  sad = hadd_uh_u32(sad0);
+  sad += hadd_uh_u32(sad1);
+
+  return sad;
+}
+
+static void sad_8width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride,
+                               const uint8_t *const aref_ptr[],
+                               int32_t ref_stride, int32_t height,
+                               uint32_t *sad_array) {
+  int32_t ht_cnt = (height >> 2);
+  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+  __m128i src0, src1, src2, src3, sad_tmp;
+  __m128i ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+  __m128i ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
+  __m128i sad0 = __lsx_vldi(0);
+  __m128i sad1 = sad0;
+  __m128i sad2 = sad0;
+  __m128i sad3 = sad0;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t ref_stride2 = ref_stride << 1;
+  int32_t ref_stride3 = ref_stride2 + ref_stride;
+  int32_t ref_stride4 = ref_stride2 << 1;
+
+  ref0_ptr = aref_ptr[0];
+  ref1_ptr = aref_ptr[1];
+  ref2_ptr = aref_ptr[2];
+  ref3_ptr = aref_ptr[3];
+
+  for (; ht_cnt--;) {
+    src0 = __lsx_vld(src_ptr, 0);
+    DUP2_ARG2(__lsx_vldx, src_ptr, src_stride, src_ptr, src_stride2, src1,
+              src2);
+    src3 = __lsx_vldx(src_ptr, src_stride3);
+    src_ptr += src_stride4;
+    ref0 = __lsx_vld(ref0_ptr, 0);
+    DUP2_ARG2(__lsx_vldx, ref0_ptr, ref_stride, ref0_ptr, ref_stride2, ref1,
+              ref2);
+    ref3 = __lsx_vldx(ref0_ptr, ref_stride3);
+    ref0_ptr += ref_stride4;
+    ref4 = __lsx_vld(ref1_ptr, 0);
+    DUP2_ARG2(__lsx_vldx, ref1_ptr, ref_stride, ref1_ptr, ref_stride2, ref5,
+              ref6);
+    ref7 = __lsx_vldx(ref1_ptr, ref_stride3);
+    ref1_ptr += ref_stride4;
+    ref8 = __lsx_vld(ref2_ptr, 0);
+    DUP2_ARG2(__lsx_vldx, ref2_ptr, ref_stride, ref2_ptr, ref_stride2, ref9,
+              ref10);
+    ref11 = __lsx_vldx(ref2_ptr, ref_stride3);
+    ref2_ptr += ref_stride4;
+    ref12 = __lsx_vld(ref3_ptr, 0);
+    DUP2_ARG2(__lsx_vldx, ref3_ptr, ref_stride, ref3_ptr, ref_stride2, ref13,
+              ref14);
+    ref15 = __lsx_vldx(ref3_ptr, ref_stride3);
+    ref3_ptr += ref_stride4;
+
+    DUP2_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, src0, src1);
+    DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1);
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad0 = __lsx_vadd_h(sad0, sad_tmp);
+
+    DUP2_ARG2(__lsx_vpickev_d, ref5, ref4, ref7, ref6, ref0, ref1);
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad1 = __lsx_vadd_h(sad1, sad_tmp);
+
+    DUP2_ARG2(__lsx_vpickev_d, ref9, ref8, ref11, ref10, ref0, ref1);
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad2 = __lsx_vadd_h(sad2, sad_tmp);
+
+    DUP2_ARG2(__lsx_vpickev_d, ref13, ref12, ref15, ref14, ref0, ref1);
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad3 = __lsx_vadd_h(sad3, sad_tmp);
+  }
+  sad_array[0] = hadd_uh_u32(sad0);
+  sad_array[1] = hadd_uh_u32(sad1);
+  sad_array[2] = hadd_uh_u32(sad2);
+  sad_array[3] = hadd_uh_u32(sad3);
+}
+
+static void sad_16width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride,
+                                const uint8_t *const aref_ptr[],
+                                int32_t ref_stride, int32_t height,
+                                uint32_t *sad_array) {
+  int32_t ht_cnt = (height >> 1);
+  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+  __m128i src, ref0, ref1, ref2, ref3, diff, sad_tmp;
+  __m128i sad0 = __lsx_vldi(0);
+  __m128i sad1 = sad0;
+  __m128i sad2 = sad0;
+  __m128i sad3 = sad0;
+
+  ref0_ptr = aref_ptr[0];
+  ref1_ptr = aref_ptr[1];
+  ref2_ptr = aref_ptr[2];
+  ref3_ptr = aref_ptr[3];
+
+  for (; ht_cnt--;) {
+    src = __lsx_vld(src_ptr, 0);
+    src_ptr += src_stride;
+    ref0 = __lsx_vld(ref0_ptr, 0);
+    ref0_ptr += ref_stride;
+    ref1 = __lsx_vld(ref1_ptr, 0);
+    ref1_ptr += ref_stride;
+    ref2 = __lsx_vld(ref2_ptr, 0);
+    ref2_ptr += ref_stride;
+    ref3 = __lsx_vld(ref3_ptr, 0);
+    ref3_ptr += ref_stride;
+
+    diff = __lsx_vabsd_bu(src, ref0);
+    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+    sad0 = __lsx_vadd_h(sad0, sad_tmp);
+    diff = __lsx_vabsd_bu(src, ref1);
+    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+    sad1 = __lsx_vadd_h(sad1, sad_tmp);
+    diff = __lsx_vabsd_bu(src, ref2);
+    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+    sad2 = __lsx_vadd_h(sad2, sad_tmp);
+    diff = __lsx_vabsd_bu(src, ref3);
+    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+    sad3 = __lsx_vadd_h(sad3, sad_tmp);
+
+    src = __lsx_vld(src_ptr, 0);
+    src_ptr += src_stride;
+    ref0 = __lsx_vld(ref0_ptr, 0);
+    ref0_ptr += ref_stride;
+    ref1 = __lsx_vld(ref1_ptr, 0);
+    ref1_ptr += ref_stride;
+    ref2 = __lsx_vld(ref2_ptr, 0);
+    ref2_ptr += ref_stride;
+    ref3 = __lsx_vld(ref3_ptr, 0);
+    ref3_ptr += ref_stride;
+
+    diff = __lsx_vabsd_bu(src, ref0);
+    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+    sad0 = __lsx_vadd_h(sad0, sad_tmp);
+    diff = __lsx_vabsd_bu(src, ref1);
+    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+    sad1 = __lsx_vadd_h(sad1, sad_tmp);
+    diff = __lsx_vabsd_bu(src, ref2);
+    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+    sad2 = __lsx_vadd_h(sad2, sad_tmp);
+    diff = __lsx_vabsd_bu(src, ref3);
+    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+    sad3 = __lsx_vadd_h(sad3, sad_tmp);
+  }
+  sad_array[0] = hadd_uh_u32(sad0);
+  sad_array[1] = hadd_uh_u32(sad1);
+  sad_array[2] = hadd_uh_u32(sad2);
+  sad_array[3] = hadd_uh_u32(sad3);
+}
+
+static void sad_32width_x4d_lsx(const uint8_t *src, int32_t src_stride,
+                                const uint8_t *const aref_ptr[],
+                                int32_t ref_stride, int32_t height,
+                                uint32_t *sad_array) {
+  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+  int32_t ht_cnt = height;
+  __m128i src0, src1, ref0, ref1, sad_tmp;
+  __m128i sad0 = __lsx_vldi(0);
+  __m128i sad1 = sad0;
+  __m128i sad2 = sad0;
+  __m128i sad3 = sad0;
+
+  ref0_ptr = aref_ptr[0];
+  ref1_ptr = aref_ptr[1];
+  ref2_ptr = aref_ptr[2];
+  ref3_ptr = aref_ptr[3];
+
+  for (; ht_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+    src += src_stride;
+
+    DUP2_ARG2(__lsx_vld, ref0_ptr, 0, ref0_ptr, 16, ref0, ref1);
+    ref0_ptr += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad0 = __lsx_vadd_h(sad0, sad_tmp);
+
+    DUP2_ARG2(__lsx_vld, ref1_ptr, 0, ref1_ptr, 16, ref0, ref1);
+    ref1_ptr += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad1 = __lsx_vadd_h(sad1, sad_tmp);
+
+    DUP2_ARG2(__lsx_vld, ref2_ptr, 0, ref2_ptr, 16, ref0, ref1);
+    ref2_ptr += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad2 = __lsx_vadd_h(sad2, sad_tmp);
+
+    DUP2_ARG2(__lsx_vld, ref3_ptr, 0, ref3_ptr, 16, ref0, ref1);
+    ref3_ptr += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad3 = __lsx_vadd_h(sad3, sad_tmp);
+  }
+  sad_array[0] = hadd_uh_u32(sad0);
+  sad_array[1] = hadd_uh_u32(sad1);
+  sad_array[2] = hadd_uh_u32(sad2);
+  sad_array[3] = hadd_uh_u32(sad3);
+}
+
+static void sad_64width_x4d_lsx(const uint8_t *src, int32_t src_stride,
+                                const uint8_t *const aref_ptr[],
+                                int32_t ref_stride, int32_t height,
+                                uint32_t *sad_array) {
+  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+  int32_t ht_cnt = height;
+  __m128i src0, src1, src2, src3;
+  __m128i ref0, ref1, ref2, ref3;
+  __m128i sad, sad_tmp;
+
+  __m128i sad0_0 = __lsx_vldi(0);
+  __m128i sad0_1 = sad0_0;
+  __m128i sad1_0 = sad0_0;
+  __m128i sad1_1 = sad0_0;
+  __m128i sad2_0 = sad0_0;
+  __m128i sad2_1 = sad0_0;
+  __m128i sad3_0 = sad0_0;
+  __m128i sad3_1 = sad0_0;
+
+  ref0_ptr = aref_ptr[0];
+  ref1_ptr = aref_ptr[1];
+  ref2_ptr = aref_ptr[2];
+  ref3_ptr = aref_ptr[3];
+
+  for (; ht_cnt--;) {
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+              src3);
+    src += src_stride;
+
+    DUP4_ARG2(__lsx_vld, ref0_ptr, 0, ref0_ptr, 16, ref0_ptr, 32, ref0_ptr, 48,
+              ref0, ref1, ref2, ref3);
+    ref0_ptr += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad0_0 = __lsx_vadd_h(sad0_0, sad_tmp);
+    sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
+    sad0_1 = __lsx_vadd_h(sad0_1, sad_tmp);
+
+    DUP4_ARG2(__lsx_vld, ref1_ptr, 0, ref1_ptr, 16, ref1_ptr, 32, ref1_ptr, 48,
+              ref0, ref1, ref2, ref3);
+    ref1_ptr += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad1_0 = __lsx_vadd_h(sad1_0, sad_tmp);
+    sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
+    sad1_1 = __lsx_vadd_h(sad1_1, sad_tmp);
+
+    DUP4_ARG2(__lsx_vld, ref2_ptr, 0, ref2_ptr, 16, ref2_ptr, 32, ref2_ptr, 48,
+              ref0, ref1, ref2, ref3);
+    ref2_ptr += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad2_0 = __lsx_vadd_h(sad2_0, sad_tmp);
+    sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
+    sad2_1 = __lsx_vadd_h(sad2_1, sad_tmp);
+
+    DUP4_ARG2(__lsx_vld, ref3_ptr, 0, ref3_ptr, 16, ref3_ptr, 32, ref3_ptr, 48,
+              ref0, ref1, ref2, ref3);
+    ref3_ptr += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad3_0 = __lsx_vadd_h(sad3_0, sad_tmp);
+    sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
+    sad3_1 = __lsx_vadd_h(sad3_1, sad_tmp);
+  }
+  sad = __lsx_vhaddw_wu_hu(sad0_0, sad0_0);
+  sad_tmp = __lsx_vhaddw_wu_hu(sad0_1, sad0_1);
+  sad = __lsx_vadd_w(sad, sad_tmp);
+  sad_array[0] = hadd_uw_u32(sad);
+
+  sad = __lsx_vhaddw_wu_hu(sad1_0, sad1_0);
+  sad_tmp = __lsx_vhaddw_wu_hu(sad1_1, sad1_1);
+  sad = __lsx_vadd_w(sad, sad_tmp);
+  sad_array[1] = hadd_uw_u32(sad);
+
+  sad = __lsx_vhaddw_wu_hu(sad2_0, sad2_0);
+  sad_tmp = __lsx_vhaddw_wu_hu(sad2_1, sad2_1);
+  sad = __lsx_vadd_w(sad, sad_tmp);
+  sad_array[2] = hadd_uw_u32(sad);
+
+  sad = __lsx_vhaddw_wu_hu(sad3_0, sad3_0);
+  sad_tmp = __lsx_vhaddw_wu_hu(sad3_1, sad3_1);
+  sad = __lsx_vadd_w(sad, sad_tmp);
+  sad_array[3] = hadd_uw_u32(sad);
+}
+
+static uint32_t avgsad_32width_lsx(const uint8_t *src, int32_t src_stride,
+                                   const uint8_t *ref, int32_t ref_stride,
+                                   int32_t height, const uint8_t *sec_pred) {
+  int32_t res, ht_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+  __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+  __m128i comp0, comp1, sad_tmp;
+  __m128i sad = __lsx_vldi(0);
+  uint8_t *src_tmp, *ref_tmp;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t ref_stride2 = ref_stride << 1;
+  int32_t ref_stride3 = ref_stride2 + ref_stride;
+  int32_t ref_stride4 = ref_stride2 << 1;
+
+  for (; ht_cnt--;) {
+    src_tmp = (uint8_t *)src + 16;
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4);
+    src6 = __lsx_vldx(src, src_stride3);
+    src1 = __lsx_vld(src_tmp, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src3,
+              src5);
+    src7 = __lsx_vldx(src_tmp, src_stride3);
+    src += src_stride4;
+
+    ref_tmp = (uint8_t *)ref + 16;
+    ref0 = __lsx_vld(ref, 0);
+    DUP2_ARG2(__lsx_vldx, ref, ref_stride, ref, ref_stride2, ref2, ref4);
+    ref6 = __lsx_vldx(ref, ref_stride3);
+    ref1 = __lsx_vld(ref_tmp, 0);
+    DUP2_ARG2(__lsx_vldx, ref_tmp, ref_stride, ref_tmp, ref_stride2, ref3,
+              ref5);
+    ref7 = __lsx_vldx(ref_tmp, ref_stride3);
+    ref += ref_stride4;
+
+    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 32, sec_pred, 64, sec_pred, 96,
+              pred0, pred2, pred4, pred6);
+    DUP4_ARG2(__lsx_vld, sec_pred, 16, sec_pred, 48, sec_pred, 80, sec_pred,
+              112, pred1, pred3, pred5, pred7);
+    sec_pred += 128;
+
+    DUP2_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, comp0, comp1);
+    sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
+    sad = __lsx_vadd_h(sad, sad_tmp);
+    DUP2_ARG2(__lsx_vavgr_bu, pred2, ref2, pred3, ref3, comp0, comp1);
+    sad_tmp = sad_ub2_uh(src2, src3, comp0, comp1);
+    sad = __lsx_vadd_h(sad, sad_tmp);
+    DUP2_ARG2(__lsx_vavgr_bu, pred4, ref4, pred5, ref5, comp0, comp1);
+    sad_tmp = sad_ub2_uh(src4, src5, comp0, comp1);
+    sad = __lsx_vadd_h(sad, sad_tmp);
+    DUP2_ARG2(__lsx_vavgr_bu, pred6, ref6, pred7, ref7, comp0, comp1);
+    sad_tmp = sad_ub2_uh(src6, src7, comp0, comp1);
+    sad = __lsx_vadd_h(sad, sad_tmp);
+  }
+  res = hadd_uh_u32(sad);
+  return res;
+}
+
+static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride,
+                                   const uint8_t *ref, int32_t ref_stride,
+                                   int32_t height, const uint8_t *sec_pred) {
+  int32_t res, ht_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+  __m128i comp0, comp1, comp2, comp3, pred0, pred1, pred2, pred3;
+  __m128i sad, sad_tmp;
+  __m128i sad0 = __lsx_vldi(0);
+  __m128i sad1 = sad0;
+
+  for (; ht_cnt--;) {
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+              src3);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
+              ref3);
+    ref += ref_stride;
+    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
+              pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
+              ref3, comp0, comp1, comp2, comp3);
+    sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
+    sad0 = __lsx_vadd_h(sad0, sad_tmp);
+    sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3);
+    sad1 = __lsx_vadd_h(sad1, sad_tmp);
+
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+              src3);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
+              ref3);
+    ref += ref_stride;
+    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
+              pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
+              ref3, comp0, comp1, comp2, comp3);
+    sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
+    sad0 = __lsx_vadd_h(sad0, sad_tmp);
+    sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3);
+    sad1 = __lsx_vadd_h(sad1, sad_tmp);
+
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+              src3);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
+              ref3);
+    ref += ref_stride;
+    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
+              pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
+              ref3, comp0, comp1, comp2, comp3);
+    sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
+    sad0 = __lsx_vadd_h(sad0, sad_tmp);
+    sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3);
+    sad1 = __lsx_vadd_h(sad1, sad_tmp);
+
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+              src3);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
+              ref3);
+    ref += ref_stride;
+    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
+              pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
+              ref3, comp0, comp1, comp2, comp3);
+    sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
+    sad0 = __lsx_vadd_h(sad0, sad_tmp);
+    sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3);
+    sad1 = __lsx_vadd_h(sad1, sad_tmp);
+  }
+  sad = __lsx_vhaddw_wu_hu(sad0, sad0);
+  sad_tmp = __lsx_vhaddw_wu_hu(sad1, sad1);
+  sad = __lsx_vadd_w(sad, sad_tmp);
+
+  res = hadd_sw_s32(sad);
+  return res;
+}
+
+#define VPX_SAD_8xHT_LSX(height)                                             \
+  uint32_t vpx_sad8x##height##_lsx(const uint8_t *src, int32_t src_stride,   \
+                                   const uint8_t *ref, int32_t ref_stride) { \
+    return sad_8width_lsx(src, src_stride, ref, ref_stride, height);         \
+  }
+
+#define VPX_SAD_16xHT_LSX(height)                                             \
+  uint32_t vpx_sad16x##height##_lsx(const uint8_t *src, int32_t src_stride,   \
+                                    const uint8_t *ref, int32_t ref_stride) { \
+    return sad_16width_lsx(src, src_stride, ref, ref_stride, height);         \
+  }
+
+#define VPX_SAD_32xHT_LSX(height)                                             \
+  uint32_t vpx_sad32x##height##_lsx(const uint8_t *src, int32_t src_stride,   \
+                                    const uint8_t *ref, int32_t ref_stride) { \
+    return sad_32width_lsx(src, src_stride, ref, ref_stride, height);         \
+  }
+
+#define VPX_SAD_64xHT_LSX(height)                                             \
+  uint32_t vpx_sad64x##height##_lsx(const uint8_t *src, int32_t src_stride,   \
+                                    const uint8_t *ref, int32_t ref_stride) { \
+    return sad_64width_lsx(src, src_stride, ref, ref_stride, height);         \
+  }
+
+#define VPX_SAD_8xHTx4D_LSX(height)                                       \
+  void vpx_sad8x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \
+                                  const uint8_t *const refs[4],           \
+                                  int32_t ref_stride, uint32_t sads[4]) { \
+    sad_8width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads);  \
+  }
+
+#define VPX_SAD_16xHTx4D_LSX(height)                                       \
+  void vpx_sad16x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \
+                                   const uint8_t *const refs[],            \
+                                   int32_t ref_stride, uint32_t *sads) {   \
+    sad_16width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads);  \
+  }
+
+#define VPX_SAD_32xHTx4D_LSX(height)                                       \
+  void vpx_sad32x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \
+                                   const uint8_t *const refs[],            \
+                                   int32_t ref_stride, uint32_t *sads) {   \
+    sad_32width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads);  \
+  }
+
+#define VPX_SAD_64xHTx4D_LSX(height)                                       \
+  void vpx_sad64x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \
+                                   const uint8_t *const refs[],            \
+                                   int32_t ref_stride, uint32_t *sads) {   \
+    sad_64width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads);  \
+  }
+
+#define VPX_AVGSAD_32xHT_LSX(height)                                    \
+  uint32_t vpx_sad32x##height##_avg_lsx(                                \
+      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
+      int32_t ref_stride, const uint8_t *second_pred) {                 \
+    return avgsad_32width_lsx(src, src_stride, ref, ref_stride, height, \
+                              second_pred);                             \
+  }
+
+#define VPX_AVGSAD_64xHT_LSX(height)                                    \
+  uint32_t vpx_sad64x##height##_avg_lsx(                                \
+      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
+      int32_t ref_stride, const uint8_t *second_pred) {                 \
+    return avgsad_64width_lsx(src, src_stride, ref, ref_stride, height, \
+                              second_pred);                             \
+  }
+
+#define SAD64                                                             \
+  VPX_SAD_64xHT_LSX(64) VPX_SAD_64xHTx4D_LSX(64) VPX_SAD_64xHTx4D_LSX(32) \
+      VPX_AVGSAD_64xHT_LSX(64)
+
+SAD64
+
+#define SAD32                                                             \
+  VPX_SAD_32xHT_LSX(32) VPX_SAD_32xHTx4D_LSX(32) VPX_SAD_32xHTx4D_LSX(64) \
+      VPX_AVGSAD_32xHT_LSX(32)
+
+SAD32
+
+#define SAD16 VPX_SAD_16xHT_LSX(16) VPX_SAD_16xHTx4D_LSX(16)
+
+SAD16
+
+#define SAD8 VPX_SAD_8xHT_LSX(8) VPX_SAD_8xHTx4D_LSX(8)
+
+SAD8
+
+#undef SAD64
+#undef SAD32
+#undef SAD16
+#undef SAD8
diff --git a/vpx_dsp/loongarch/sub_pixel_variance_lsx.c b/vpx_dsp/loongarch/sub_pixel_variance_lsx.c
new file mode 100644
index 0000000000..700793531c
--- /dev/null
+++ b/vpx_dsp/loongarch/sub_pixel_variance_lsx.c
@@ -0,0 +1,874 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/loongarch/variance_lsx.h"
+#include "vpx_dsp/variance.h"
+
+static const uint8_t bilinear_filters_lsx[8][2] = {
+  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+  { 64, 64 }, { 48, 80 },  { 32, 96 }, { 16, 112 },
+};
+
+#define VARIANCE_WxH(sse, diff, shift) \
+  (sse) - (((uint32_t)(diff) * (diff)) >> (shift))
+
+#define VARIANCE_LARGE_WxH(sse, diff, shift) \
+  (sse) - (((int64_t)(diff) * (diff)) >> (shift))
+
+static uint32_t avg_sse_diff_64x64_lsx(const uint8_t *src_ptr,
+                                       int32_t src_stride,
+                                       const uint8_t *ref_ptr,
+                                       int32_t ref_stride,
+                                       const uint8_t *sec_pred, int32_t *diff) {
+  int32_t res, ht_cnt = 32;
+  __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+  __m128i pred0, pred1, pred2, pred3, vec, vec_tmp;
+  __m128i avg0, avg1, avg2, avg3;
+  __m128i var = __lsx_vldi(0);
+
+  avg0 = var;
+  avg1 = var;
+  avg2 = var;
+  avg3 = var;
+
+  for (; ht_cnt--;) {
+    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
+              pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+              src0, src1, src2, src3);
+    src_ptr += src_stride;
+    DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48,
+              ref0, ref1, ref2, ref3);
+    ref_ptr += ref_stride;
+
+    DUP4_ARG2(__lsx_vavgr_bu, src0, pred0, src1, pred1, src2, pred2, src3,
+              pred3, src0, src1, src2, src3);
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+    CALC_MSE_AVG_B(src2, ref2, var, avg2);
+    CALC_MSE_AVG_B(src3, ref3, var, avg3);
+
+    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
+              pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+              src0, src1, src2, src3);
+    src_ptr += src_stride;
+    DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48,
+              ref0, ref1, ref2, ref3);
+    ref_ptr += ref_stride;
+
+    DUP4_ARG2(__lsx_vavgr_bu, src0, pred0, src1, pred1, src2, pred2, src3,
+              pred3, src0, src1, src2, src3);
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+    CALC_MSE_AVG_B(src2, ref2, var, avg2);
+    CALC_MSE_AVG_B(src3, ref3, var, avg3);
+  }
+  vec = __lsx_vhaddw_w_h(avg0, avg0);
+  vec_tmp = __lsx_vhaddw_w_h(avg1, avg1);
+  vec = __lsx_vadd_w(vec, vec_tmp);
+  vec_tmp = __lsx_vhaddw_w_h(avg2, avg2);
+  vec = __lsx_vadd_w(vec, vec_tmp);
+  vec_tmp = __lsx_vhaddw_w_h(avg3, avg3);
+  vec = __lsx_vadd_w(vec, vec_tmp);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+
+  return res;
+}
+
+static uint32_t sub_pixel_sse_diff_8width_h_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+  uint32_t loop_cnt = (height >> 2);
+  int32_t res;
+  __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+  __m128i vec0, vec1, vec2, vec3, filt0, out, vec;
+  __m128i mask = { 0x0403030202010100, 0x0807070606050504 };
+  __m128i avg = __lsx_vldi(0);
+  __m128i var = avg;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+  for (; loop_cnt--;) {
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+    src3 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+    ref0 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2);
+    ref3 = __lsx_vldx(dst, dst_stride3);
+    dst += dst_stride4;
+
+    DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1);
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, vec0, vec1, vec2, vec3);
+    DUP4_ARG3(__lsx_vssrarni_bu_h, vec0, vec0, FILTER_BITS, vec1, vec1,
+              FILTER_BITS, vec2, vec2, FILTER_BITS, vec3, vec3, FILTER_BITS,
+              src0, src1, src2, src3);
+    out = __lsx_vpackev_d(src1, src0);
+    CALC_MSE_AVG_B(out, ref0, var, avg);
+    out = __lsx_vpackev_d(src3, src2);
+    CALC_MSE_AVG_B(out, ref1, var, avg);
+  }
+  vec = __lsx_vhaddw_w_h(avg, avg);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+  return res;
+}
+
+static uint32_t sub_pixel_sse_diff_16width_h_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+  uint32_t loop_cnt = (height >> 2);
+  int32_t res;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i dst0, dst1, dst2, dst3, filt0;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i out0, out1, out2, out3, out4, out5, out6, out7;
+  __m128i vec, var = __lsx_vldi(0);
+  __m128i avg = var;
+  __m128i mask = { 0x0403030202010100, 0x0807070606050504 };
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  for (; loop_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7);
+    src += src_stride;
+
+    dst0 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2);
+    dst3 = __lsx_vldx(dst, dst_stride3);
+    dst += dst_stride4;
+
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5);
+    DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7);
+
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, out0, out1, out2, out3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+              filt0, out4, out5, out6, out7);
+    DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+              FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
+              src0, src1, src2, src3);
+    CALC_MSE_AVG_B(src0, dst0, var, avg);
+    CALC_MSE_AVG_B(src1, dst1, var, avg);
+    CALC_MSE_AVG_B(src2, dst2, var, avg);
+    CALC_MSE_AVG_B(src3, dst3, var, avg);
+  }
+  vec = __lsx_vhaddw_w_h(avg, avg);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+  return res;
+}
+
+static uint32_t sub_pixel_sse_diff_32width_h_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+  uint32_t sse = 0;
+  int32_t diff0[2];
+
+  sse += sub_pixel_sse_diff_16width_h_lsx(src, src_stride, dst, dst_stride,
+                                          filter, height, &diff0[0]);
+  src += 16;
+  dst += 16;
+
+  sse += sub_pixel_sse_diff_16width_h_lsx(src, src_stride, dst, dst_stride,
+                                          filter, height, &diff0[1]);
+
+  *diff = diff0[0] + diff0[1];
+
+  return sse;
+}
+
+static uint32_t sub_pixel_sse_diff_8width_v_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+  uint32_t loop_cnt = (height >> 2);
+  int32_t res;
+  __m128i ref0, ref1, ref2, ref3, src0, src1, src2, src3, src4;
+  __m128i vec, vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3, filt0;
+  __m128i avg = __lsx_vldi(0);
+  __m128i var = avg;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+  src0 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  for (; loop_cnt--;) {
+    src1 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+    src4 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+    ref0 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2);
+    ref3 = __lsx_vldx(dst, dst_stride3);
+    dst += dst_stride4;
+
+    DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1);
+    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
+              vec0, vec1, vec2, vec3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+              FILTER_BITS, src0, src1);
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+    src0 = src4;
+  }
+  vec = __lsx_vhaddw_w_h(avg, avg);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+  return res;
+}
+
+static uint32_t sub_pixel_sse_diff_16width_v_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+  uint32_t loop_cnt = (height >> 2);
+  int32_t res;
+  __m128i ref0, ref1, ref2, ref3, src0, src1, src2, src3, src4;
+  __m128i out0, out1, out2, out3, tmp0, tmp1, filt0, vec;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i var = __lsx_vldi(0);
+  __m128i avg = var;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  for (; loop_cnt--;) {
+    src1 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+    src4 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+    ref0 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2);
+    ref3 = __lsx_vldx(dst, dst_stride3);
+    dst += dst_stride4;
+
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    out0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    out1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    out2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    out3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    src0 = src4;
+
+    CALC_MSE_AVG_B(out0, ref0, var, avg);
+    CALC_MSE_AVG_B(out1, ref1, var, avg);
+    CALC_MSE_AVG_B(out2, ref2, var, avg);
+    CALC_MSE_AVG_B(out3, ref3, var, avg);
+  }
+  vec = __lsx_vhaddw_w_h(avg, avg);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+  return res;
+}
+
+static uint32_t sub_pixel_sse_diff_32width_v_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+  uint32_t sse = 0;
+  int32_t diff0[2];
+
+  sse += sub_pixel_sse_diff_16width_v_lsx(src, src_stride, dst, dst_stride,
+                                          filter, height, &diff0[0]);
+  src += 16;
+  dst += 16;
+
+  sse += sub_pixel_sse_diff_16width_v_lsx(src, src_stride, dst, dst_stride,
+                                          filter, height, &diff0[1]);
+
+  *diff = diff0[0] + diff0[1];
+
+  return sse;
+}
+
+static uint32_t sub_pixel_sse_diff_8width_hv_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
+    int32_t height, int32_t *diff) {
+  uint32_t loop_cnt = (height >> 2);
+  int32_t res;
+  __m128i ref0, ref1, ref2, ref3, src0, src1, src2, src3, src4, out0, out1;
+  __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3, vec, vec0, filt_hz, filt_vt;
+  __m128i mask = { 0x0403030202010100, 0x0807070606050504 };
+  __m128i avg = __lsx_vldi(0);
+  __m128i var = avg;
+
+  filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+  filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+  src0 = __lsx_vld(src, 0);
+  src += src_stride;
+  HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out0);
+
+  for (; loop_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src1, ref0);
+    src += src_stride;
+    dst += dst_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src2, ref1);
+    src += src_stride;
+    dst += dst_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src3, ref2);
+    src += src_stride;
+    dst += dst_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src4, ref3);
+    src += src_stride;
+    dst += dst_stride;
+
+    DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1);
+    HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out1);
+    vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+    tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt);
+    HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS, hz_out0);
+    vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+    tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+    HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS, hz_out1);
+    vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+    tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt);
+    HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS, hz_out0);
+    vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+    tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt);
+    DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+              FILTER_BITS, out0, out1);
+    CALC_MSE_AVG_B(out0, ref0, var, avg);
+    CALC_MSE_AVG_B(out1, ref1, var, avg);
+  }
+  vec = __lsx_vhaddw_w_h(avg, avg);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+  return res;
+}
+
+static uint32_t sub_pixel_sse_diff_16width_hv_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
+    int32_t height, int32_t *diff) {
+  uint32_t loop_cnt = (height >> 2);
+  int32_t res;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i ref0, ref1, ref2, ref3, filt_hz, filt_vt, vec0, vec1;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1, vec;
+  __m128i var = __lsx_vldi(0);
+  __m128i avg = var;
+  __m128i mask = { 0x0403030202010100, 0x0807070606050504 };
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+
+  filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+  filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+  DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+  src += src_stride;
+
+  HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out0);
+  HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out2);
+
+  for (; loop_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7);
+    src += src_stride;
+
+    ref0 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2);
+    ref3 = __lsx_vldx(dst, dst_stride3);
+    dst += dst_stride4;
+
+    HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out1);
+    HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out3);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+    src0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS, hz_out0);
+    HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS, hz_out2);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+    src1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS, hz_out1);
+    HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS, hz_out3);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+    src2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS, hz_out0);
+    HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS, hz_out2);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+    src3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+    CALC_MSE_AVG_B(src2, ref2, var, avg);
+    CALC_MSE_AVG_B(src3, ref3, var, avg);
+  }
+  vec = __lsx_vhaddw_w_h(avg, avg);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+
+  return res;
+}
+
+static uint32_t sub_pixel_sse_diff_32width_hv_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
+    int32_t height, int32_t *diff) {
+  uint32_t sse = 0;
+  int32_t diff0[2];
+
+  sse += sub_pixel_sse_diff_16width_hv_lsx(src, src_stride, dst, dst_stride,
+                                           filter_horiz, filter_vert, height,
+                                           &diff0[0]);
+  src += 16;
+  dst += 16;
+
+  sse += sub_pixel_sse_diff_16width_hv_lsx(src, src_stride, dst, dst_stride,
+                                           filter_horiz, filter_vert, height,
+                                           &diff0[1]);
+
+  *diff = diff0[0] + diff0[1];
+
+  return sse;
+}
+
+static uint32_t subpel_avg_ssediff_16w_h_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+    int32_t height, int32_t *diff, int32_t width) {
+  uint32_t loop_cnt = (height >> 2);
+  int32_t res;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
+  __m128i pred0, pred1, pred2, pred3, filt0, vec;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i out0, out1, out2, out3, out4, out5, out6, out7;
+  __m128i mask = { 0x403030202010100, 0x807070606050504 };
+  __m128i avg = __lsx_vldi(0);
+  __m128i var = avg;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  for (; loop_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7);
+    src += src_stride;
+
+    dst0 = __lsx_vld(dst, 0);
+    dst += dst_stride;
+    dst1 = __lsx_vld(dst, 0);
+    dst += dst_stride;
+    dst2 = __lsx_vld(dst, 0);
+    dst += dst_stride;
+    dst3 = __lsx_vld(dst, 0);
+    dst += dst_stride;
+
+    pred0 = __lsx_vld(sec_pred, 0);
+    sec_pred += width;
+    pred1 = __lsx_vld(sec_pred, 0);
+    sec_pred += width;
+    pred2 = __lsx_vld(sec_pred, 0);
+    sec_pred += width;
+    pred3 = __lsx_vld(sec_pred, 0);
+    sec_pred += width;
+
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5);
+    DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7);
+
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, out0, out1, out2, out3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+              filt0, out4, out5, out6, out7);
+    DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+              FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
+              tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG2(__lsx_vavgr_bu, tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3,
+              pred3, tmp0, tmp1, tmp2, tmp3);
+
+    CALC_MSE_AVG_B(tmp0, dst0, var, avg);
+    CALC_MSE_AVG_B(tmp1, dst1, var, avg);
+    CALC_MSE_AVG_B(tmp2, dst2, var, avg);
+    CALC_MSE_AVG_B(tmp3, dst3, var, avg);
+  }
+  vec = __lsx_vhaddw_w_h(avg, avg);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+
+  return res;
+}
+
+static uint32_t subpel_avg_ssediff_16w_v_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+    int32_t height, int32_t *diff, int32_t width) {
+  uint32_t loop_cnt = (height >> 2);
+  int32_t res;
+  __m128i ref0, ref1, ref2, ref3, pred0, pred1, pred2, pred3;
+  __m128i src0, src1, src2, src3, src4, out0, out1, out2, out3;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i tmp0, tmp1, vec, filt0;
+  __m128i avg = __lsx_vldi(0);
+  __m128i var = avg;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  for (; loop_cnt--;) {
+    src1 = __lsx_vld(src, 0);
+    src += src_stride;
+    src2 = __lsx_vld(src, 0);
+    src += src_stride;
+    src3 = __lsx_vld(src, 0);
+    src += src_stride;
+    src4 = __lsx_vld(src, 0);
+    src += src_stride;
+
+    pred0 = __lsx_vld(sec_pred, 0);
+    sec_pred += width;
+    pred1 = __lsx_vld(sec_pred, 0);
+    sec_pred += width;
+    pred2 = __lsx_vld(sec_pred, 0);
+    sec_pred += width;
+    pred3 = __lsx_vld(sec_pred, 0);
+    sec_pred += width;
+
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    out0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    out1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    out2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    out3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    src0 = src4;
+    ref0 = __lsx_vld(dst, 0);
+    dst += dst_stride;
+    ref1 = __lsx_vld(dst, 0);
+    dst += dst_stride;
+    ref2 = __lsx_vld(dst, 0);
+    dst += dst_stride;
+    ref3 = __lsx_vld(dst, 0);
+    dst += dst_stride;
+
+    DUP4_ARG2(__lsx_vavgr_bu, out0, pred0, out1, pred1, out2, pred2, out3,
+              pred3, out0, out1, out2, out3);
+
+    CALC_MSE_AVG_B(out0, ref0, var, avg);
+    CALC_MSE_AVG_B(out1, ref1, var, avg);
+    CALC_MSE_AVG_B(out2, ref2, var, avg);
+    CALC_MSE_AVG_B(out3, ref3, var, avg);
+  }
+  vec = __lsx_vhaddw_w_h(avg, avg);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+  return res;
+}
+
+static uint32_t subpel_avg_ssediff_16w_hv_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
+    const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) {
+  uint32_t loop_cnt = (height >> 2);
+  int32_t res;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i ref0, ref1, ref2, ref3, pred0, pred1, pred2, pred3;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
+  __m128i out0, out1, out2, out3, filt_hz, filt_vt, vec, vec0, vec1;
+  __m128i mask = { 0x403030202010100, 0x807070606050504 };
+  __m128i avg = __lsx_vldi(0);
+  __m128i var = avg;
+
+  filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+  filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+  DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+  src += src_stride;
+
+  HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out0);
+  HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out2);
+
+  for (; loop_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7);
+    src += src_stride;
+
+    pred0 = __lsx_vld(sec_pred, 0);
+    sec_pred += width;
+    pred1 = __lsx_vld(sec_pred, 0);
+    sec_pred += width;
+    pred2 = __lsx_vld(sec_pred, 0);
+    sec_pred += width;
+    pred3 = __lsx_vld(sec_pred, 0);
+    sec_pred += width;
+
+    HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out1);
+    HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out3);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+    out0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS, hz_out0);
+    HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS, hz_out2);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+    out1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS, hz_out1);
+    HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS, hz_out3);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+    out2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS, hz_out0);
+    HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS, hz_out2);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+    out3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    ref0 = __lsx_vld(dst, 0);
+    dst += dst_stride;
+    ref1 = __lsx_vld(dst, 0);
+    dst += dst_stride;
+    ref2 = __lsx_vld(dst, 0);
+    dst += dst_stride;
+    ref3 = __lsx_vld(dst, 0);
+    dst += dst_stride;
+
+    DUP4_ARG2(__lsx_vavgr_bu, out0, pred0, out1, pred1, out2, pred2, out3,
+              pred3, out0, out1, out2, out3);
+
+    CALC_MSE_AVG_B(out0, ref0, var, avg);
+    CALC_MSE_AVG_B(out1, ref1, var, avg);
+    CALC_MSE_AVG_B(out2, ref2, var, avg);
+    CALC_MSE_AVG_B(out3, ref3, var, avg);
+  }
+  vec = __lsx_vhaddw_w_h(avg, avg);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+  return res;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_64width_h_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+    int32_t height, int32_t *diff) {
+  uint32_t loop_cnt, sse = 0;
+  int32_t diff0[4];
+
+  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+    sse +=
+        subpel_avg_ssediff_16w_h_lsx(src, src_stride, dst, dst_stride, sec_pred,
+                                     filter, height, &diff0[loop_cnt], 64);
+    src += 16;
+    dst += 16;
+    sec_pred += 16;
+  }
+
+  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+  return sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_64width_v_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+    int32_t height, int32_t *diff) {
+  uint32_t loop_cnt, sse = 0;
+  int32_t diff0[4];
+
+  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+    sse +=
+        subpel_avg_ssediff_16w_v_lsx(src, src_stride, dst, dst_stride, sec_pred,
+                                     filter, height, &diff0[loop_cnt], 64);
+    src += 16;
+    dst += 16;
+    sec_pred += 16;
+  }
+
+  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+  return sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_64width_hv_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
+    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
+  uint32_t loop_cnt, sse = 0;
+  int32_t diff0[4];
+
+  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+    sse += subpel_avg_ssediff_16w_hv_lsx(src, src_stride, dst, dst_stride,
+                                         sec_pred, filter_horiz, filter_vert,
+                                         height, &diff0[loop_cnt], 64);
+    src += 16;
+    dst += 16;
+    sec_pred += 16;
+  }
+
+  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+  return sse;
+}
+
+#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6)
+#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8)
+#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10)
+#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12)
+
+#define VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(wd, ht)                              \
+  uint32_t vpx_sub_pixel_variance##wd##x##ht##_lsx(                           \
+      const uint8_t *src, int32_t src_stride, int32_t x_offset,               \
+      int32_t y_offset, const uint8_t *ref, int32_t ref_stride,               \
+      uint32_t *sse) {                                                        \
+    int32_t diff;                                                             \
+    uint32_t var;                                                             \
+    const uint8_t *h_filter = bilinear_filters_lsx[x_offset];                 \
+    const uint8_t *v_filter = bilinear_filters_lsx[y_offset];                 \
+                                                                              \
+    if (y_offset) {                                                           \
+      if (x_offset) {                                                         \
+        *sse = sub_pixel_sse_diff_##wd##width_hv_lsx(                         \
+            src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \
+      } else {                                                                \
+        *sse = sub_pixel_sse_diff_##wd##width_v_lsx(                          \
+            src, src_stride, ref, ref_stride, v_filter, ht, &diff);           \
+      }                                                                       \
+                                                                              \
+      var = VARIANCE_##wd##Wx##ht##H(*sse, diff);                             \
+    } else {                                                                  \
+      if (x_offset) {                                                         \
+        *sse = sub_pixel_sse_diff_##wd##width_h_lsx(                          \
+            src, src_stride, ref, ref_stride, h_filter, ht, &diff);           \
+                                                                              \
+        var = VARIANCE_##wd##Wx##ht##H(*sse, diff);                           \
+      } else {                                                                \
+        var = vpx_variance##wd##x##ht##_lsx(src, src_stride, ref, ref_stride, \
+                                            sse);                             \
+      }                                                                       \
+    }                                                                         \
+                                                                              \
+    return var;                                                               \
+  }
+
+VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(8, 8)
+VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(16, 16)
+VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(32, 32)
+
+#define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_LSX(ht)                           \
+  uint32_t vpx_sub_pixel_avg_variance64x##ht##_lsx(                           \
+      const uint8_t *src_ptr, int32_t src_stride, int32_t x_offset,           \
+      int32_t y_offset, const uint8_t *ref_ptr, int32_t ref_stride,           \
+      uint32_t *sse, const uint8_t *sec_pred) {                               \
+    int32_t diff;                                                             \
+    const uint8_t *h_filter = bilinear_filters_lsx[x_offset];                 \
+    const uint8_t *v_filter = bilinear_filters_lsx[y_offset];                 \
+                                                                              \
+    if (y_offset) {                                                           \
+      if (x_offset) {                                                         \
+        *sse = sub_pixel_avg_sse_diff_64width_hv_lsx(                         \
+            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,     \
+            v_filter, ht, &diff);                                             \
+      } else {                                                                \
+        *sse = sub_pixel_avg_sse_diff_64width_v_lsx(                          \
+            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
+            &diff);                                                           \
+      }                                                                       \
+    } else {                                                                  \
+      if (x_offset) {                                                         \
+        *sse = sub_pixel_avg_sse_diff_64width_h_lsx(                          \
+            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
+            &diff);                                                           \
+      } else {                                                                \
+        *sse = avg_sse_diff_64x##ht##_lsx(src_ptr, src_stride, ref_ptr,       \
+                                          ref_stride, sec_pred, &diff);       \
+      }                                                                       \
+    }                                                                         \
+                                                                              \
+    return VARIANCE_64Wx##ht##H(*sse, diff);                                  \
+  }
+
+VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_LSX(64)
diff --git a/vpx_dsp/loongarch/subtract_lsx.c b/vpx_dsp/loongarch/subtract_lsx.c
new file mode 100644
index 0000000000..943a5c5a9b
--- /dev/null
+++ b/vpx_dsp/loongarch/subtract_lsx.c
@@ -0,0 +1,371 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+static void sub_blk_4x4_lsx(const uint8_t *src_ptr, int32_t src_stride,
+                            const uint8_t *pred_ptr, int32_t pred_stride,
+                            int16_t *diff_ptr, int32_t diff_stride) {
+  __m128i src0, src1, src2, src3;
+  __m128i pred0, pred1, pred2, pred3;
+  __m128i diff0, diff1;
+  __m128i reg0, reg1;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t pred_stride2 = pred_stride << 1;
+  int32_t diff_stride2 = diff_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t pred_stride3 = pred_stride2 + pred_stride;
+  int32_t diff_stride3 = diff_stride2 + diff_stride;
+
+  DUP4_ARG2(__lsx_vldrepl_w, src_ptr, 0, src_ptr + src_stride, 0,
+            src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src0, src1,
+            src2, src3);
+  DUP4_ARG2(__lsx_vldrepl_w, pred_ptr, 0, pred_ptr + pred_stride, 0,
+            pred_ptr + pred_stride2, 0, pred_ptr + pred_stride3, 0, pred0,
+            pred1, pred2, pred3);
+  DUP4_ARG2(__lsx_vilvl_w, src1, src0, src3, src2, pred1, pred0, pred3, pred2,
+            src0, src2, pred0, pred2);
+  DUP2_ARG2(__lsx_vilvl_d, src2, src0, pred2, pred0, src0, pred0);
+  reg0 = __lsx_vilvl_b(src0, pred0);
+  reg1 = __lsx_vilvh_b(src0, pred0);
+  DUP2_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, diff0, diff1);
+  __lsx_vstelm_d(diff0, diff_ptr, 0, 0);
+  __lsx_vstelm_d(diff0, diff_ptr + diff_stride, 0, 1);
+  __lsx_vstelm_d(diff1, diff_ptr + diff_stride2, 0, 0);
+  __lsx_vstelm_d(diff1, diff_ptr + diff_stride3, 0, 1);
+}
+
+static void sub_blk_8x8_lsx(const uint8_t *src_ptr, int32_t src_stride,
+                            const uint8_t *pred_ptr, int32_t pred_stride,
+                            int16_t *diff_ptr, int32_t diff_stride) {
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t pred_stride2 = pred_stride << 1;
+  int32_t dst_stride = diff_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t pred_stride3 = pred_stride2 + pred_stride;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t pred_stride4 = pred_stride2 << 1;
+  int32_t dst_stride3 = dst_stride + dst_stride2;
+
+  DUP4_ARG2(__lsx_vldrepl_d, src_ptr, 0, src_ptr + src_stride, 0,
+            src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src0, src1,
+            src2, src3);
+  DUP4_ARG2(__lsx_vldrepl_d, pred_ptr, 0, pred_ptr + pred_stride, 0,
+            pred_ptr + pred_stride2, 0, pred_ptr + pred_stride3, 0, pred0,
+            pred1, pred2, pred3);
+  src_ptr += src_stride4;
+  pred_ptr += pred_stride4;
+
+  DUP4_ARG2(__lsx_vldrepl_d, src_ptr, 0, src_ptr + src_stride, 0,
+            src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src4, src5,
+            src6, src7);
+  DUP4_ARG2(__lsx_vldrepl_d, pred_ptr, 0, pred_ptr + pred_stride, 0,
+            pred_ptr + pred_stride2, 0, pred_ptr + pred_stride3, 0, pred4,
+            pred5, pred6, pred7);
+
+  DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+            reg0, reg1, reg2, reg3);
+  DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+            reg4, reg5, reg6, reg7);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, reg3,
+            src0, src1, src2, src3);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, reg7,
+            src4, src5, src6, src7);
+  __lsx_vst(src0, diff_ptr, 0);
+  __lsx_vstx(src1, diff_ptr, dst_stride);
+  __lsx_vstx(src2, diff_ptr, dst_stride2);
+  __lsx_vstx(src3, diff_ptr, dst_stride3);
+  diff_ptr += dst_stride2;
+  __lsx_vst(src4, diff_ptr, 0);
+  __lsx_vstx(src5, diff_ptr, dst_stride);
+  __lsx_vstx(src6, diff_ptr, dst_stride2);
+  __lsx_vstx(src7, diff_ptr, dst_stride3);
+}
+
+static void sub_blk_16x16_lsx(const uint8_t *src, int32_t src_stride,
+                              const uint8_t *pred, int32_t pred_stride,
+                              int16_t *diff, int32_t diff_stride) {
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t pred_stride2 = pred_stride << 1;
+  int32_t dst_stride = diff_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t pred_stride3 = pred_stride2 + pred_stride;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t pred_stride4 = pred_stride2 << 1;
+  int32_t dst_stride3 = dst_stride + dst_stride2;
+  int16_t *diff_tmp = diff + 8;
+
+  DUP2_ARG2(__lsx_vld, src, 0, pred, 0, src0, pred0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+  DUP4_ARG2(__lsx_vldx, pred, pred_stride, pred, pred_stride2, pred,
+            pred_stride3, pred, pred_stride4, pred1, pred2, pred3, pred4);
+  src += src_stride4;
+  pred += pred_stride4;
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            pred, pred_stride, src5, src6, src7, pred5);
+  DUP2_ARG2(__lsx_vldx, pred, pred_stride2, pred, pred_stride3, pred6, pred7);
+  src += src_stride4;
+  pred += pred_stride4;
+  DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+            reg0, reg2, reg4, reg6);
+  DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+            reg1, reg3, reg5, reg7);
+  DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+            tmp0, tmp2, tmp4, tmp6);
+  DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+            tmp1, tmp3, tmp5, tmp7);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, reg3,
+            src0, src1, src2, src3);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, reg7,
+            src4, src5, src6, src7);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3,
+            pred0, pred1, pred2, pred3);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7, tmp7,
+            pred4, pred5, pred6, pred7);
+  __lsx_vst(src0, diff, 0);
+  __lsx_vstx(src2, diff, dst_stride);
+  __lsx_vstx(src4, diff, dst_stride2);
+  __lsx_vstx(src6, diff, dst_stride3);
+  __lsx_vst(src1, diff_tmp, 0);
+  __lsx_vstx(src3, diff_tmp, dst_stride);
+  __lsx_vstx(src5, diff_tmp, dst_stride2);
+  __lsx_vstx(src7, diff_tmp, dst_stride3);
+  diff += dst_stride2;
+  diff_tmp += dst_stride2;
+  __lsx_vst(pred0, diff, 0);
+  __lsx_vstx(pred2, diff, dst_stride);
+  __lsx_vstx(pred4, diff, dst_stride2);
+  __lsx_vstx(pred6, diff, dst_stride3);
+  __lsx_vst(pred1, diff_tmp, 0);
+  __lsx_vstx(pred3, diff_tmp, dst_stride);
+  __lsx_vstx(pred5, diff_tmp, dst_stride2);
+  __lsx_vstx(pred7, diff_tmp, dst_stride3);
+  diff += dst_stride2;
+  diff_tmp += dst_stride2;
+  DUP2_ARG2(__lsx_vld, src, 0, pred, 0, src0, pred0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+  DUP4_ARG2(__lsx_vldx, pred, pred_stride, pred, pred_stride2, pred,
+            pred_stride3, pred, pred_stride4, pred1, pred2, pred3, pred4);
+  src += src_stride4;
+  pred += pred_stride4;
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            pred, pred_stride, src5, src6, src7, pred5);
+  DUP2_ARG2(__lsx_vldx, pred, pred_stride2, pred, pred_stride3, pred6, pred7);
+  DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+            reg0, reg2, reg4, reg6);
+  DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+            reg1, reg3, reg5, reg7);
+  DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+            tmp0, tmp2, tmp4, tmp6);
+  DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+            tmp1, tmp3, tmp5, tmp7);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, reg3,
+            src0, src1, src2, src3);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, reg7,
+            src4, src5, src6, src7);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3,
+            pred0, pred1, pred2, pred3);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7, tmp7,
+            pred4, pred5, pred6, pred7);
+  __lsx_vst(src0, diff, 0);
+  __lsx_vstx(src2, diff, dst_stride);
+  __lsx_vstx(src4, diff, dst_stride2);
+  __lsx_vstx(src6, diff, dst_stride3);
+  __lsx_vst(src1, diff_tmp, 0);
+  __lsx_vstx(src3, diff_tmp, dst_stride);
+  __lsx_vstx(src5, diff_tmp, dst_stride2);
+  __lsx_vstx(src7, diff_tmp, dst_stride3);
+  diff += dst_stride2;
+  diff_tmp += dst_stride2;
+  __lsx_vst(pred0, diff, 0);
+  __lsx_vstx(pred2, diff, dst_stride);
+  __lsx_vstx(pred4, diff, dst_stride2);
+  __lsx_vstx(pred6, diff, dst_stride3);
+  __lsx_vst(pred1, diff_tmp, 0);
+  __lsx_vstx(pred3, diff_tmp, dst_stride);
+  __lsx_vstx(pred5, diff_tmp, dst_stride2);
+  __lsx_vstx(pred7, diff_tmp, dst_stride3);
+}
+
+static void sub_blk_32x32_lsx(const uint8_t *src, int32_t src_stride,
+                              const uint8_t *pred, int32_t pred_stride,
+                              int16_t *diff, int32_t diff_stride) {
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  uint32_t loop_cnt;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t pred_stride2 = pred_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t pred_stride3 = pred_stride2 + pred_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t pred_stride4 = pred_stride2 << 1;
+
+  for (loop_cnt = 8; loop_cnt--;) {
+    const uint8_t *src_tmp = src + 16;
+    const uint8_t *pred_tmp = pred + 16;
+    DUP4_ARG2(__lsx_vld, src, 0, src_tmp, 0, pred, 0, pred_tmp, 0, src0, src1,
+              pred0, pred1);
+    DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src,
+              src_stride2, src_tmp, src_stride2, src2, src3, src4, src5);
+    DUP4_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, pred,
+              pred_stride, pred_tmp, pred_stride, src6, src7, pred2, pred3);
+    DUP4_ARG2(__lsx_vldx, pred, pred_stride2, pred_tmp, pred_stride2, pred,
+              pred_stride3, pred_tmp, pred_stride3, pred4, pred5, pred6, pred7);
+    DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+              reg0, reg2, reg4, reg6);
+    DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+              reg1, reg3, reg5, reg7);
+    DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+              tmp0, tmp2, tmp4, tmp6);
+    DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+              tmp1, tmp3, tmp5, tmp7);
+    DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3,
+              reg3, src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7,
+              reg7, src4, src5, src6, src7);
+    DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3,
+              tmp3, pred0, pred1, pred2, pred3);
+    DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7,
+              tmp7, pred4, pred5, pred6, pred7);
+    src += src_stride4;
+    pred += pred_stride4;
+    __lsx_vst(src0, diff, 0);
+    __lsx_vst(src1, diff, 16);
+    __lsx_vst(src2, diff, 32);
+    __lsx_vst(src3, diff, 48);
+    diff += diff_stride;
+    __lsx_vst(src4, diff, 0);
+    __lsx_vst(src5, diff, 16);
+    __lsx_vst(src6, diff, 32);
+    __lsx_vst(src7, diff, 48);
+    diff += diff_stride;
+    __lsx_vst(pred0, diff, 0);
+    __lsx_vst(pred1, diff, 16);
+    __lsx_vst(pred2, diff, 32);
+    __lsx_vst(pred3, diff, 48);
+    diff += diff_stride;
+    __lsx_vst(pred4, diff, 0);
+    __lsx_vst(pred5, diff, 16);
+    __lsx_vst(pred6, diff, 32);
+    __lsx_vst(pred7, diff, 48);
+    diff += diff_stride;
+  }
+}
+
+static void sub_blk_64x64_lsx(const uint8_t *src, int32_t src_stride,
+                              const uint8_t *pred, int32_t pred_stride,
+                              int16_t *diff, int32_t diff_stride) {
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  uint32_t loop_cnt;
+
+  for (loop_cnt = 32; loop_cnt--;) {
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+              src3);
+    DUP4_ARG2(__lsx_vld, pred, 0, pred, 16, pred, 32, pred, 48, pred0, pred1,
+              pred2, pred3);
+    src += src_stride;
+    pred += pred_stride;
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src4, src5, src6,
+              src7);
+    DUP4_ARG2(__lsx_vld, pred, 0, pred, 16, pred, 32, pred, 48, pred4, pred5,
+              pred6, pred7);
+    src += src_stride;
+    pred += pred_stride;
+
+    DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+              reg0, reg2, reg4, reg6);
+    DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+              reg1, reg3, reg5, reg7);
+    DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+              tmp0, tmp2, tmp4, tmp6);
+    DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+              tmp1, tmp3, tmp5, tmp7);
+    DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3,
+              reg3, src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7,
+              reg7, src4, src5, src6, src7);
+    DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3,
+              tmp3, pred0, pred1, pred2, pred3);
+    DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7,
+              tmp7, pred4, pred5, pred6, pred7);
+    __lsx_vst(src0, diff, 0);
+    __lsx_vst(src1, diff, 16);
+    __lsx_vst(src2, diff, 32);
+    __lsx_vst(src3, diff, 48);
+    __lsx_vst(src4, diff, 64);
+    __lsx_vst(src5, diff, 80);
+    __lsx_vst(src6, diff, 96);
+    __lsx_vst(src7, diff, 112);
+    diff += diff_stride;
+    __lsx_vst(pred0, diff, 0);
+    __lsx_vst(pred1, diff, 16);
+    __lsx_vst(pred2, diff, 32);
+    __lsx_vst(pred3, diff, 48);
+    __lsx_vst(pred4, diff, 64);
+    __lsx_vst(pred5, diff, 80);
+    __lsx_vst(pred6, diff, 96);
+    __lsx_vst(pred7, diff, 112);
+    diff += diff_stride;
+  }
+}
+
+void vpx_subtract_block_lsx(int32_t rows, int32_t cols, int16_t *diff_ptr,
+                            ptrdiff_t diff_stride, const uint8_t *src_ptr,
+                            ptrdiff_t src_stride, const uint8_t *pred_ptr,
+                            ptrdiff_t pred_stride) {
+  if (rows == cols) {
+    switch (rows) {
+      case 4:
+        sub_blk_4x4_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+                        diff_stride);
+        break;
+      case 8:
+        sub_blk_8x8_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+                        diff_stride);
+        break;
+      case 16:
+        sub_blk_16x16_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+                          diff_stride);
+        break;
+      case 32:
+        sub_blk_32x32_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+                          diff_stride);
+        break;
+      case 64:
+        sub_blk_64x64_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+                          diff_stride);
+        break;
+      default:
+        vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,
+                             src_stride, pred_ptr, pred_stride);
+        break;
+    }
+  } else {
+    vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride,
+                         pred_ptr, pred_stride);
+  }
+}
diff --git a/vpx_dsp/loongarch/txfm_macros_lsx.h b/vpx_dsp/loongarch/txfm_macros_lsx.h
new file mode 100644
index 0000000000..bd514831bf
--- /dev/null
+++ b/vpx_dsp/loongarch/txfm_macros_lsx.h
@@ -0,0 +1,48 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_LOONGARCH_TXFM_MACROS_LSX_H_
+#define VPX_VPX_DSP_LOONGARCH_TXFM_MACROS_LSX_H_
+
+#include "vpx_util/loongson_intrinsics.h"
+
+#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1)         \
+  do {                                                                \
+    __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m;                       \
+    __m128i k0_m, k1_m, k2_m, k3_m;                                   \
+                                                                      \
+    k0_m = __lsx_vreplgr2vr_h(cnst0);                                 \
+    k1_m = __lsx_vreplgr2vr_h(cnst1);                                 \
+    k2_m = __lsx_vpackev_h(k1_m, k0_m);                               \
+                                                                      \
+    DUP2_ARG2(__lsx_vilvl_h, reg1, reg0, reg0, reg1, s5_m, s3_m);     \
+    DUP2_ARG2(__lsx_vilvh_h, reg1, reg0, reg0, reg1, s4_m, s2_m);     \
+                                                                      \
+    DUP2_ARG2(__lsx_vmulwev_w_h, s5_m, k0_m, s4_m, k0_m, s1_m, s0_m); \
+    k3_m = __lsx_vmulwod_w_h(s5_m, k1_m);                             \
+    s1_m = __lsx_vsub_w(s1_m, k3_m);                                  \
+    k3_m = __lsx_vmulwod_w_h(s4_m, k1_m);                             \
+    s0_m = __lsx_vsub_w(s0_m, k3_m);                                  \
+                                                                      \
+    out0 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS);            \
+                                                                      \
+    DUP2_ARG2(__lsx_vdp2_w_h, s3_m, k2_m, s2_m, k2_m, s1_m, s0_m);    \
+    out1 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS);            \
+  } while (0)
+
+#define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2, in3)                \
+  do {                                                           \
+    __m128i tp0_m, tp1_m;                                        \
+                                                                 \
+    DUP2_ARG2(__lsx_vdp2_w_h, in0, in2, in1, in2, tp1_m, tp0_m); \
+    in3 = __lsx_vssrarni_h_w(tp1_m, tp0_m, DCT_CONST_BITS);      \
+  } while (0)
+
+#endif  // VPX_VPX_DSP_LOONGARCH_TXFM_MACROS_LSX_H_
diff --git a/vpx_dsp/loongarch/variance_lsx.c b/vpx_dsp/loongarch/variance_lsx.c
new file mode 100644
index 0000000000..8fad342c71
--- /dev/null
+++ b/vpx_dsp/loongarch/variance_lsx.c
@@ -0,0 +1,263 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/variance_lsx.h"
+
+#define VARIANCE_WxH(sse, diff, shift) \
+  (sse) - (((uint32_t)(diff) * (diff)) >> (shift))
+
+#define VARIANCE_LARGE_WxH(sse, diff, shift) \
+  (sse) - (((int64_t)(diff) * (diff)) >> (shift))
+
+static uint32_t sse_diff_8width_lsx(const uint8_t *src_ptr, int32_t src_stride,
+                                    const uint8_t *ref_ptr, int32_t ref_stride,
+                                    int32_t height, int32_t *diff) {
+  int32_t res, ht_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3, vec;
+  __m128i avg = __lsx_vldi(0);
+  __m128i var = avg;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t ref_stride2 = ref_stride << 1;
+  int32_t ref_stride3 = ref_stride2 + ref_stride;
+  int32_t ref_stride4 = ref_stride2 << 1;
+
+  for (; ht_cnt--;) {
+    DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr + src_stride, 0,
+              src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src0, src1,
+              src2, src3);
+    src_ptr += src_stride4;
+    DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr + ref_stride, 0,
+              ref_ptr + ref_stride2, 0, ref_ptr + ref_stride3, 0, ref0, ref1,
+              ref2, ref3);
+    ref_ptr += ref_stride4;
+
+    DUP4_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, ref1, ref0, ref3, ref2,
+              src0, src1, ref0, ref1);
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+  }
+
+  vec = __lsx_vhaddw_w_h(avg, avg);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+  return res;
+}
+
+static uint32_t sse_diff_16width_lsx(const uint8_t *src_ptr, int32_t src_stride,
+                                     const uint8_t *ref_ptr, int32_t ref_stride,
+                                     int32_t height, int32_t *diff) {
+  int32_t res, ht_cnt = (height >> 2);
+  __m128i src, ref, vec;
+  __m128i avg = __lsx_vldi(0);
+  __m128i var = avg;
+
+  for (; ht_cnt--;) {
+    src = __lsx_vld(src_ptr, 0);
+    src_ptr += src_stride;
+    ref = __lsx_vld(ref_ptr, 0);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src, ref, var, avg);
+
+    src = __lsx_vld(src_ptr, 0);
+    src_ptr += src_stride;
+    ref = __lsx_vld(ref_ptr, 0);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src, ref, var, avg);
+    src = __lsx_vld(src_ptr, 0);
+    src_ptr += src_stride;
+    ref = __lsx_vld(ref_ptr, 0);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src, ref, var, avg);
+
+    src = __lsx_vld(src_ptr, 0);
+    src_ptr += src_stride;
+    ref = __lsx_vld(ref_ptr, 0);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src, ref, var, avg);
+  }
+  vec = __lsx_vhaddw_w_h(avg, avg);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+  return res;
+}
+
+static uint32_t sse_diff_32width_lsx(const uint8_t *src_ptr, int32_t src_stride,
+                                     const uint8_t *ref_ptr, int32_t ref_stride,
+                                     int32_t height, int32_t *diff) {
+  int32_t res, ht_cnt = (height >> 2);
+  __m128i avg = __lsx_vldi(0);
+  __m128i src0, src1, ref0, ref1;
+  __m128i vec;
+  __m128i var = avg;
+
+  for (; ht_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+    DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+    DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+    DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+  }
+
+  vec = __lsx_vhaddw_w_h(avg, avg);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+  return res;
+}
+
+static uint32_t sse_diff_64x64_lsx(const uint8_t *src_ptr, int32_t src_stride,
+                                   const uint8_t *ref_ptr, int32_t ref_stride,
+                                   int32_t *diff) {
+  int32_t res, ht_cnt = 32;
+  __m128i avg0 = __lsx_vldi(0);
+  __m128i src0, src1, src2, src3;
+  __m128i ref0, ref1, ref2, ref3;
+  __m128i vec0, vec1;
+  __m128i avg1 = avg0;
+  __m128i avg2 = avg0;
+  __m128i avg3 = avg0;
+  __m128i var = avg0;
+
+  for (; ht_cnt--;) {
+    DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+              src0, src1, src2, src3);
+    src_ptr += src_stride;
+    DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48,
+              ref0, ref1, ref2, ref3);
+    ref_ptr += ref_stride;
+
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+    CALC_MSE_AVG_B(src2, ref2, var, avg2);
+    CALC_MSE_AVG_B(src3, ref3, var, avg3);
+    DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+              src0, src1, src2, src3);
+    src_ptr += src_stride;
+    DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48,
+              ref0, ref1, ref2, ref3);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+    CALC_MSE_AVG_B(src2, ref2, var, avg2);
+    CALC_MSE_AVG_B(src3, ref3, var, avg3);
+  }
+  vec0 = __lsx_vhaddw_w_h(avg0, avg0);
+  vec1 = __lsx_vhaddw_w_h(avg1, avg1);
+  vec0 = __lsx_vadd_w(vec0, vec1);
+  vec1 = __lsx_vhaddw_w_h(avg2, avg2);
+  vec0 = __lsx_vadd_w(vec0, vec1);
+  vec1 = __lsx_vhaddw_w_h(avg3, avg3);
+  vec0 = __lsx_vadd_w(vec0, vec1);
+  HADD_SW_S32(vec0, *diff);
+  HADD_SW_S32(var, res);
+  return res;
+}
+
+#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6)
+#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8)
+
+#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10)
+#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12)
+
+#define VPX_VARIANCE_WDXHT_LSX(wd, ht)                                         \
+  uint32_t vpx_variance##wd##x##ht##_lsx(                                      \
+      const uint8_t *src, int32_t src_stride, const uint8_t *ref,              \
+      int32_t ref_stride, uint32_t *sse) {                                     \
+    int32_t diff;                                                              \
+                                                                               \
+    *sse =                                                                     \
+        sse_diff_##wd##width_lsx(src, src_stride, ref, ref_stride, ht, &diff); \
+                                                                               \
+    return VARIANCE_##wd##Wx##ht##H(*sse, diff);                               \
+  }
+
+static uint32_t sse_16width_lsx(const uint8_t *src_ptr, int32_t src_stride,
+                                const uint8_t *ref_ptr, int32_t ref_stride,
+                                int32_t height) {
+  int32_t res, ht_cnt = (height >> 2);
+  __m128i src, ref;
+  __m128i var = __lsx_vldi(0);
+
+  for (; ht_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    CALC_MSE_B(src, ref, var);
+
+    DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    CALC_MSE_B(src, ref, var);
+
+    DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    CALC_MSE_B(src, ref, var);
+
+    DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    CALC_MSE_B(src, ref, var);
+  }
+  HADD_SW_S32(var, res);
+  return res;
+}
+
+VPX_VARIANCE_WDXHT_LSX(8, 8)
+VPX_VARIANCE_WDXHT_LSX(16, 16)
+VPX_VARIANCE_WDXHT_LSX(32, 32)
+
+uint32_t vpx_variance64x64_lsx(const uint8_t *src, int32_t src_stride,
+                               const uint8_t *ref, int32_t ref_stride,
+                               uint32_t *sse) {
+  int32_t diff;
+
+  *sse = sse_diff_64x64_lsx(src, src_stride, ref, ref_stride, &diff);
+
+  return VARIANCE_64Wx64H(*sse, diff);
+}
+
+uint32_t vpx_mse16x16_lsx(const uint8_t *src, int32_t src_stride,
+                          const uint8_t *ref, int32_t ref_stride,
+                          uint32_t *sse) {
+  *sse = sse_16width_lsx(src, src_stride, ref, ref_stride, 16);
+
+  return *sse;
+}
+
+void vpx_get16x16var_lsx(const uint8_t *src, int32_t src_stride,
+                         const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
+                         int32_t *sum) {
+  *sse = sse_diff_16width_lsx(src, src_stride, ref, ref_stride, 16, sum);
+}
diff --git a/vpx_dsp/loongarch/variance_lsx.h b/vpx_dsp/loongarch/variance_lsx.h
new file mode 100644
index 0000000000..cf9e9890ff
--- /dev/null
+++ b/vpx_dsp/loongarch/variance_lsx.h
@@ -0,0 +1,62 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_LOONGARCH_VARIANCE_LSX_H_
+#define VPX_VPX_DSP_LOONGARCH_VARIANCE_LSX_H_
+
+#include "vpx_util/loongson_intrinsics.h"
+
+#define HADD_SW_S32(in0, in1)                  \
+  do {                                         \
+    __m128i res0_m;                            \
+                                               \
+    res0_m = __lsx_vhaddw_d_w(in0, in0);       \
+    res0_m = __lsx_vhaddw_q_d(res0_m, res0_m); \
+    in1 = __lsx_vpickve2gr_w(res0_m, 0);       \
+  } while (0)
+
+#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift, in2) \
+  do {                                                        \
+    __m128i tmp0_m, tmp1_m;                                   \
+                                                              \
+    tmp0_m = __lsx_vshuf_b(in1, in0, mask);                   \
+    tmp1_m = __lsx_vdp2_h_bu(tmp0_m, coeff);                  \
+    in2 = __lsx_vsrari_h(tmp1_m, shift);                      \
+  } while (0)
+
+#define CALC_MSE_B(src, ref, var)                                         \
+  do {                                                                    \
+    __m128i src_l0_m, src_l1_m;                                           \
+    __m128i res_l0_m, res_l1_m;                                           \
+                                                                          \
+    src_l0_m = __lsx_vilvl_b(src, ref);                                   \
+    src_l1_m = __lsx_vilvh_b(src, ref);                                   \
+    DUP2_ARG2(__lsx_vhsubw_hu_bu, src_l0_m, src_l0_m, src_l1_m, src_l1_m, \
+              res_l0_m, res_l1_m);                                        \
+    var = __lsx_vdp2add_w_h(var, res_l0_m, res_l0_m);                     \
+    var = __lsx_vdp2add_w_h(var, res_l1_m, res_l1_m);                     \
+  } while (0)
+
+#define CALC_MSE_AVG_B(src, ref, var, sub)                                \
+  do {                                                                    \
+    __m128i src_l0_m, src_l1_m;                                           \
+    __m128i res_l0_m, res_l1_m;                                           \
+                                                                          \
+    src_l0_m = __lsx_vilvl_b(src, ref);                                   \
+    src_l1_m = __lsx_vilvh_b(src, ref);                                   \
+    DUP2_ARG2(__lsx_vhsubw_hu_bu, src_l0_m, src_l0_m, src_l1_m, src_l1_m, \
+              res_l0_m, res_l1_m);                                        \
+    var = __lsx_vdp2add_w_h(var, res_l0_m, res_l0_m);                     \
+    var = __lsx_vdp2add_w_h(var, res_l1_m, res_l1_m);                     \
+    sub = __lsx_vadd_h(sub, res_l0_m);                                    \
+    sub = __lsx_vadd_h(sub, res_l1_m);                                    \
+  } while (0)
+
+#endif  // VPX_VPX_DSP_LOONGARCH_VARIANCE_LSX_H_
diff --git a/vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c
new file mode 100644
index 0000000000..1c59228813
--- /dev/null
+++ b/vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c
@@ -0,0 +1,972 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"
+
+static const uint8_t mc_filt_mask_arr[16 * 3] = {
+  /* 8 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+  /* 4 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+  /* 4 width cases */
+  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
+static void common_hz_8t_and_aver_dst_4x4_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  uint8_t *dst_tmp = dst;
+  __m128i src0, src1, src2, src3;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i tmp0, tmp1;
+  __m128i dst0, dst1, dst2, dst3;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 16);
+  src -= 3;
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+  LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filter0, filter1, filter2, filter3, tmp0, tmp1);
+  dst0 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst0 = __lsx_vilvl_w(dst1, dst0);
+  dst1 = __lsx_vilvl_w(dst3, dst2);
+  dst0 = __lsx_vilvl_d(dst1, dst0);
+  tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7);
+  tmp0 = __lsx_vxori_b(tmp0, 128);
+  dst0 = __lsx_vavgr_bu(tmp0, dst0);
+  __lsx_vstelm_w(dst0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(dst0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(dst0, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(dst0, dst, 0, 3);
+}
+
+static void common_hz_8t_and_aver_dst_4x8_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  uint8_t *dst_tmp = dst;
+  __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3, tmp0, tmp1, tmp2, tmp3;
+  __m128i dst0, dst1;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 16);
+  src -= 3;
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+  src += src_stride;
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  tmp0 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  tmp1 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  tmp2 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  tmp3 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  tmp0 = __lsx_vilvl_w(tmp1, tmp0);
+  tmp1 = __lsx_vilvl_w(tmp3, tmp2);
+  dst0 = __lsx_vilvl_d(tmp1, tmp0);
+
+  tmp0 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  tmp1 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  tmp2 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  tmp3 = __lsx_vldrepl_w(dst_tmp, 0);
+  tmp0 = __lsx_vilvl_w(tmp1, tmp0);
+  tmp1 = __lsx_vilvl_w(tmp3, tmp2);
+  dst1 = __lsx_vilvl_d(tmp1, tmp0);
+
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filter0, filter1, filter2, filter3, tmp0, tmp1);
+  LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filter0, filter1, filter2, filter3, tmp2, tmp3);
+  DUP4_ARG3(__lsx_vssrarni_b_h, tmp0, tmp0, 7, tmp1, tmp1, 7, tmp2, tmp2, 7,
+            tmp3, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
+  DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
+  DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+  DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp1, dst1, dst0, dst1);
+  __lsx_vstelm_w(dst0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(dst0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(dst0, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(dst0, dst, 0, 3);
+  dst += dst_stride;
+  __lsx_vstelm_w(dst1, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(dst1, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(dst1, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(dst1, dst, 0, 3);
+}
+
+static void common_hz_8t_and_aver_dst_4w_lsx(const uint8_t *src,
+                                             int32_t src_stride, uint8_t *dst,
+                                             int32_t dst_stride, int8_t *filter,
+                                             int32_t height) {
+  if (height == 4) {
+    common_hz_8t_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else if (height == 8) {
+    common_hz_8t_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride, filter);
+  }
+}
+
+static void common_hz_8t_and_aver_dst_8w_lsx(const uint8_t *src,
+                                             int32_t src_stride, uint8_t *dst,
+                                             int32_t dst_stride, int8_t *filter,
+                                             int32_t height) {
+  int32_t loop_cnt = height >> 2;
+  uint8_t *dst_tmp = dst;
+  __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  __m128i dst0, dst1, dst2, dst3;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  uint8_t *_src = (uint8_t *)src - 3;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+  for (; loop_cnt--;) {
+    src0 = __lsx_vld(_src, 0);
+    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+    src3 = __lsx_vldx(_src, src_stride3);
+    _src += src_stride4;
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filter0, filter1, filter2, filter3, tmp0,
+                               tmp1, tmp2, tmp3);
+    dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+    DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp1, dst1, dst0, dst1);
+    __lsx_vstelm_d(dst0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(dst0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(dst1, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(dst1, dst, 0, 1);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_8t_and_aver_dst_16w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter, int32_t height) {
+  int32_t loop_cnt = height >> 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  uint8_t *dst_tmp = dst;
+  __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m128i tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+  src -= 3;
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  for (; loop_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3);
+    src += src_stride;
+    dst0 = __lsx_vld(dst_tmp, 0);
+    dst1 = __lsx_vldx(dst_tmp, dst_stride);
+    dst_tmp += dst_stride2;
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, src2, src2,
+              mask0, src3, src3, mask0, tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, src2, src2,
+              mask1, src3, src3, mask1, tmp4, tmp5, tmp6, tmp7);
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, src2, src2,
+              mask2, src3, src3, mask2, tmp8, tmp9, tmp10, tmp11);
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3, src2, src2,
+              mask3, src3, src3, mask3, tmp12, tmp13, tmp14, tmp15);
+    DUP4_ARG2(__lsx_vdp2_h_b, tmp0, filter0, tmp1, filter0, tmp2, filter0, tmp3,
+              filter0, tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG2(__lsx_vdp2_h_b, tmp8, filter2, tmp9, filter2, tmp10, filter2,
+              tmp11, filter2, tmp8, tmp9, tmp10, tmp11);
+    DUP4_ARG3(__lsx_vdp2add_h_b, tmp0, tmp4, filter1, tmp1, tmp5, filter1, tmp2,
+              tmp6, filter1, tmp3, tmp7, filter1, tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG3(__lsx_vdp2add_h_b, tmp8, tmp12, filter3, tmp9, tmp13, filter3,
+              tmp10, tmp14, filter3, tmp11, tmp15, filter3, tmp4, tmp5, tmp6,
+              tmp7);
+    DUP4_ARG2(__lsx_vsadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7,
+              tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, dst2, dst3);
+    DUP2_ARG2(__lsx_vxori_b, dst2, 128, dst3, 128, dst2, dst3);
+    DUP2_ARG2(__lsx_vavgr_bu, dst0, dst2, dst1, dst3, dst0, dst1);
+    __lsx_vst(dst0, dst, 0);
+    __lsx_vstx(dst1, dst, dst_stride);
+    dst += dst_stride2;
+  }
+}
+
+static void common_hz_8t_and_aver_dst_32w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = height;
+  uint8_t *dst_tmp = dst;
+  __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3, dst0, dst1;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m128i tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+  src -= 3;
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+  for (; loop_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
+    src3 = __lsx_vld(src, 24);
+    src1 = __lsx_vshuf_b(src2, src0, shuff);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, dst_tmp, 0, dst, 16, dst0, dst1);
+    dst_tmp += dst_stride;
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, src2, src2,
+              mask0, src3, src3, mask0, tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, src2, src2,
+              mask1, src3, src3, mask1, tmp4, tmp5, tmp6, tmp7);
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, src2, src2,
+              mask2, src3, src3, mask2, tmp8, tmp9, tmp10, tmp11);
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3, src2, src2,
+              mask3, src3, src3, mask3, tmp12, tmp13, tmp14, tmp15);
+    DUP4_ARG2(__lsx_vdp2_h_b, tmp0, filter0, tmp1, filter0, tmp2, filter0, tmp3,
+              filter0, tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG2(__lsx_vdp2_h_b, tmp8, filter2, tmp9, filter2, tmp10, filter2,
+              tmp11, filter2, tmp8, tmp9, tmp10, tmp11);
+    DUP4_ARG3(__lsx_vdp2add_h_b, tmp0, tmp4, filter1, tmp1, tmp5, filter1, tmp2,
+              tmp6, filter1, tmp3, tmp7, filter1, tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG3(__lsx_vdp2add_h_b, tmp8, tmp12, filter3, tmp9, tmp13, filter3,
+              tmp10, tmp14, filter3, tmp11, tmp15, filter3, tmp4, tmp5, tmp6,
+              tmp7);
+    DUP4_ARG2(__lsx_vsadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7,
+              tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vavgr_bu, dst0, tmp0, dst1, tmp1, dst0, dst1);
+    __lsx_vst(dst0, dst, 0);
+    __lsx_vst(dst1, dst, 16);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_8t_and_aver_dst_64w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter, int32_t height) {
+  int32_t loop_cnt = height;
+  __m128i src0, src1, src2, src3;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i out0, out1, out2, out3, dst0, dst1;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+  src -= 3;
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+  for (; loop_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
+    src3 = __lsx_vld(src, 24);
+    src1 = __lsx_vshuf_b(src2, src0, shuff);
+    DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1);
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filter0, filter1, filter2, filter3, out0,
+                               out1, out2, out3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    DUP2_ARG2(__lsx_vavgr_bu, out0, dst0, out1, dst1, out0, out1);
+    __lsx_vst(out0, dst, 0);
+    __lsx_vst(out1, dst, 16);
+
+    DUP2_ARG2(__lsx_vld, src, 32, src, 48, src0, src2);
+    src3 = __lsx_vld(src, 56);
+    src1 = __lsx_vshuf_b(src2, src0, shuff);
+    DUP2_ARG2(__lsx_vld, dst, 32, dst, 48, dst0, dst1);
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filter0, filter1, filter2, filter3, out0,
+                               out1, out2, out3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    DUP2_ARG2(__lsx_vavgr_bu, out0, dst0, out1, dst1, out0, out1);
+    __lsx_vst(out0, dst, 32);
+    __lsx_vst(out1, dst, 48);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_2t_and_aver_dst_4x4_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  __m128i src0, src1, src2, src3, mask;
+  __m128i dst0, dst1, dst2, dst3, vec0, vec1, filt0;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  uint8_t *dst_tmp = dst;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 16);
+
+  /* rearranging filter */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride3);
+  dst0 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  DUP2_ARG2(__lsx_vilvl_w, dst1, dst0, dst3, dst2, dst0, dst1);
+  dst0 = __lsx_vilvl_d(dst1, dst0);
+  DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, vec0, vec1);
+  DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec0, vec1);
+  vec0 = __lsx_vssrarni_bu_h(vec1, vec0, FILTER_BITS);
+  vec0 = __lsx_vavgr_bu(vec0, dst0);
+  __lsx_vstelm_w(vec0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(vec0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(vec0, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(vec0, dst, 0, 3);
+}
+
+static void common_hz_2t_and_aver_dst_4x8_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  __m128i filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
+  __m128i dst0, dst1, dst2, dst3, dst4;
+  __m128i vec4, vec5, vec6, vec7;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  uint8_t *src_tmp1 = (uint8_t *)src + src_stride4;
+  uint8_t *dst_tmp = dst;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 16);
+
+  /* rearranging filter */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride3);
+
+  src4 = __lsx_vld(src_tmp1, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src5,
+            src6);
+  src7 = __lsx_vldx(src_tmp1, src_stride3);
+
+  dst0 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  DUP2_ARG2(__lsx_vilvl_w, dst1, dst0, dst3, dst2, dst0, dst1);
+  dst0 = __lsx_vilvl_d(dst1, dst0);
+
+  dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst4 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  DUP2_ARG2(__lsx_vilvl_w, dst2, dst1, dst4, dst3, dst1, dst2);
+  dst1 = __lsx_vilvl_d(dst2, dst1);
+
+  DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, vec0, vec1);
+  DUP2_ARG3(__lsx_vshuf_b, src5, src4, mask, src7, src6, mask, vec2, vec3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+            vec4, vec5, vec6, vec7);
+  DUP4_ARG3(__lsx_vssrarni_bu_h, vec4, vec4, FILTER_BITS, vec5, vec5,
+            FILTER_BITS, vec6, vec6, FILTER_BITS, vec7, vec7, FILTER_BITS, res0,
+            res1, res2, res3);
+  DUP2_ARG2(__lsx_vilvl_d, res1, res0, res3, res2, res0, res2);
+  DUP2_ARG2(__lsx_vavgr_bu, res0, dst0, res2, dst1, res0, res2);
+
+  __lsx_vstelm_w(res0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(res0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(res0, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(res0, dst, 0, 3);
+  dst += dst_stride;
+
+  __lsx_vstelm_w(res2, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(res2, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(res2, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(res2, dst, 0, 3);
+  dst += dst_stride;
+}
+
+static void common_hz_2t_and_aver_dst_4w_lsx(const uint8_t *src,
+                                             int32_t src_stride, uint8_t *dst,
+                                             int32_t dst_stride, int8_t *filter,
+                                             int32_t height) {
+  if (height == 4) {
+    common_hz_2t_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else if (height == 8) {
+    common_hz_2t_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride, filter);
+  }
+}
+
+static void common_hz_2t_and_aver_dst_8x4_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  __m128i src0, src1, src2, src3, mask;
+  __m128i filt0, dst0, dst1, dst2, dst3;
+  __m128i vec0, vec1, vec2, vec3;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  uint8_t *dst_tmp = dst;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* rearranging filter */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride3);
+  DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+  DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+            vec0, vec1, vec2, vec3);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+            FILTER_BITS, vec0, vec1);
+  dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+
+  DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+  DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec1, dst1, vec0, vec1);
+  __lsx_vstelm_d(vec0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(vec0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_d(vec1, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(vec1, dst, 0, 1);
+}
+
+static void common_hz_2t_and_aver_dst_8x8mult_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter, int32_t height) {
+  __m128i src0, src1, src2, src3, mask;
+  __m128i filt0, dst0, dst1, dst2, dst3;
+  __m128i vec0, vec1, vec2, vec3;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  uint8_t *dst_tmp = dst;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* rearranging filter */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+  src += src_stride;
+
+  DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+  DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+            vec0, vec1, vec2, vec3);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+            FILTER_BITS, vec0, vec2);
+  dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+
+  DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec2, dst1, vec0, vec2);
+  __lsx_vstelm_d(vec0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(vec0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_d(vec2, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(vec2, dst, 0, 1);
+  dst += dst_stride;
+
+  LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+  src += src_stride;
+  DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+  DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+            vec0, vec1, vec2, vec3);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+            FILTER_BITS, vec0, vec2);
+  dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+  DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec2, dst1, vec0, vec2);
+  __lsx_vstelm_d(vec0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(vec0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_d(vec2, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(vec2, dst, 0, 1);
+  dst += dst_stride;
+
+  if (height == 16) {
+    LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+    src += src_stride;
+
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, vec0, vec1, vec2, vec3);
+    DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+              FILTER_BITS, vec0, vec2);
+    dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+    DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec2, dst1, vec0, vec2);
+    __lsx_vstelm_d(vec0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(vec0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(vec2, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(vec2, dst, 0, 1);
+    dst += dst_stride;
+
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+    src3 = __lsx_vldx(src, src_stride3);
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, vec0, vec1, vec2, vec3);
+    DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+              FILTER_BITS, vec0, vec2);
+    dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+    DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec2, dst1, vec0, vec2);
+    __lsx_vstelm_d(vec0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(vec0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(vec2, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(vec2, dst, 0, 1);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_2t_and_aver_dst_8w_lsx(const uint8_t *src,
+                                             int32_t src_stride, uint8_t *dst,
+                                             int32_t dst_stride, int8_t *filter,
+                                             int32_t height) {
+  if (height == 4) {
+    common_hz_2t_and_aver_dst_8x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else {
+    common_hz_2t_and_aver_dst_8x8mult_lsx(src, src_stride, dst, dst_stride,
+                                          filter, height);
+  }
+}
+
+static void common_hz_2t_and_aver_dst_16w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 2) - 1;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  __m128i filt0, dst0;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  uint8_t *src_tmp1 = (uint8_t *)src + 8;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* rearranging filter */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4);
+  src6 = __lsx_vldx(src, src_stride3);
+  src += src_stride4;
+
+  src1 = __lsx_vld(src_tmp1, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3,
+            src5);
+  src7 = __lsx_vldx(src_tmp1, src_stride3);
+  src_tmp1 += src_stride4;
+
+  DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+  DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+  DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5);
+  DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7);
+
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+            res0, res1, res2, res3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, filt0,
+            res4, res5, res6, res7);
+  DUP4_ARG3(__lsx_vssrarni_bu_h, res1, res0, FILTER_BITS, res3, res2,
+            FILTER_BITS, res5, res4, FILTER_BITS, res7, res6, FILTER_BITS, res0,
+            res2, res4, res6);
+  dst0 = __lsx_vld(dst, 0);
+  res0 = __lsx_vavgr_bu(res0, dst0);
+  __lsx_vst(res0, dst, 0);
+  dst += dst_stride;
+
+  dst0 = __lsx_vld(dst, 0);
+  res2 = __lsx_vavgr_bu(res2, dst0);
+  __lsx_vst(res2, dst, 0);
+  dst += dst_stride;
+
+  dst0 = __lsx_vld(dst, 0);
+  res4 = __lsx_vavgr_bu(res4, dst0);
+  __lsx_vst(res4, dst, 0);
+  dst += dst_stride;
+
+  dst0 = __lsx_vld(dst, 0);
+  res6 = __lsx_vavgr_bu(res6, dst0);
+  __lsx_vst(res6, dst, 0);
+  dst += dst_stride;
+
+  for (; loop_cnt--;) {
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4);
+    src6 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    src1 = __lsx_vld(src_tmp1, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3,
+              src5);
+    src7 = __lsx_vldx(src_tmp1, src_stride3);
+    src_tmp1 += src_stride4;
+
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5);
+    DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, res0, res1, res2, res3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+              filt0, res4, res5, res6, res7);
+
+    DUP4_ARG3(__lsx_vssrarni_bu_h, res1, res0, FILTER_BITS, res3, res2,
+              FILTER_BITS, res5, res4, FILTER_BITS, res7, res6, FILTER_BITS,
+              res0, res2, res4, res6);
+    dst0 = __lsx_vld(dst, 0);
+    res0 = __lsx_vavgr_bu(res0, dst0);
+    __lsx_vst(res0, dst, 0);
+    dst += dst_stride;
+
+    dst0 = __lsx_vld(dst, 0);
+    res2 = __lsx_vavgr_bu(res2, dst0);
+    __lsx_vst(res2, dst, 0);
+    dst += dst_stride;
+
+    dst0 = __lsx_vld(dst, 0);
+    res4 = __lsx_vavgr_bu(res4, dst0);
+    __lsx_vst(res4, dst, 0);
+    dst += dst_stride;
+
+    dst0 = __lsx_vld(dst, 0);
+    res6 = __lsx_vavgr_bu(res6, dst0);
+    __lsx_vst(res6, dst, 0);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_2t_and_aver_dst_32w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 1);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  __m128i filt0, dst0, dst1;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* rearranging filter */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  for (; loop_cnt--;) {
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vld, src, 16, src, 24, src2, src3);
+    src1 = __lsx_vshuf_b(src2, src0, shuff);
+    src += src_stride;
+    src4 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vld, src, 16, src, 24, src6, src7);
+    src5 = __lsx_vshuf_b(src6, src4, shuff);
+    src += src_stride;
+
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5);
+    DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7);
+
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, res0, res1, res2, res3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+              filt0, res4, res5, res6, res7);
+    DUP4_ARG3(__lsx_vssrarni_bu_h, res1, res0, FILTER_BITS, res3, res2,
+              FILTER_BITS, res5, res4, FILTER_BITS, res7, res6, FILTER_BITS,
+              res0, res2, res4, res6);
+
+    DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1);
+    res0 = __lsx_vavgr_bu(res0, dst0);
+    __lsx_vst(res0, dst, 0);
+    res2 = __lsx_vavgr_bu(res2, dst1);
+    __lsx_vst(res2, dst, 16);
+    dst += dst_stride;
+
+    DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1);
+    res4 = __lsx_vavgr_bu(res4, dst0);
+    __lsx_vst(res4, dst, 0);
+    res6 = __lsx_vavgr_bu(res6, dst1);
+    __lsx_vst(res6, dst, 16);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_2t_and_aver_dst_64w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = height;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  __m128i filt0, dst0, dst1, dst2, dst3;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i out0, out1, out2, out3, out4, out5, out6, out7;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* rearranging filter */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  for (; loop_cnt--;) {
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src2, src4,
+              src6);
+    src7 = __lsx_vld(src, 56);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src0, shuff, src4, src2, shuff, src1, src3);
+    src5 = __lsx_vshuf_b(src6, src4, shuff);
+    src += src_stride;
+
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5);
+    DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, out0, out1, out2, out3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+              filt0, out4, out5, out6, out7);
+
+    DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+              FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
+              out0, out2, out4, out6);
+
+    DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, dst0, dst1, dst2,
+              dst3);
+    out0 = __lsx_vavgr_bu(out0, dst0);
+    __lsx_vst(out0, dst, 0);
+    out2 = __lsx_vavgr_bu(out2, dst1);
+    __lsx_vst(out2, dst, 16);
+    out4 = __lsx_vavgr_bu(out4, dst2);
+    __lsx_vst(out4, dst, 32);
+    out6 = __lsx_vavgr_bu(out6, dst3);
+    __lsx_vst(out6, dst, 48);
+    dst += dst_stride;
+  }
+}
+
+void vpx_convolve8_avg_horiz_lsx(const uint8_t *src, ptrdiff_t src_stride,
+                                 uint8_t *dst, ptrdiff_t dst_stride,
+                                 const InterpKernel *filter, int x0_q4,
+                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
+                                 int h) {
+  const int16_t *const filter_x = filter[x0_q4];
+  int8_t cnt, filt_hor[8];
+
+  assert(x_step_q4 == 16);
+  assert(((const int32_t *)filter_x)[1] != 0x800000);
+
+  for (cnt = 0; cnt < 8; ++cnt) {
+    filt_hor[cnt] = filter_x[cnt];
+  }
+
+  if (vpx_get_filter_taps(filter_x) == 2) {
+    switch (w) {
+      case 4:
+        common_hz_2t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst,
+                                         (int32_t)dst_stride, &filt_hor[3], h);
+        break;
+      case 8:
+        common_hz_2t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst,
+                                         (int32_t)dst_stride, &filt_hor[3], h);
+        break;
+      case 16:
+        common_hz_2t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, &filt_hor[3], h);
+        break;
+
+      case 32:
+        common_hz_2t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, &filt_hor[3], h);
+        break;
+      case 64:
+        common_hz_2t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, &filt_hor[3], h);
+        break;
+      default:
+        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
+                                  x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  } else {
+    switch (w) {
+      case 4:
+        common_hz_8t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst,
+                                         (int32_t)dst_stride, filt_hor, h);
+        break;
+      case 8:
+        common_hz_8t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst,
+                                         (int32_t)dst_stride, filt_hor, h);
+        break;
+      case 16:
+        common_hz_8t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, filt_hor, h);
+        break;
+      case 32:
+        common_hz_8t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, filt_hor, h);
+        break;
+      case 64:
+        common_hz_8t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, filt_hor, h);
+        break;
+      default:
+        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
+                                  x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  }
+}
diff --git a/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c
new file mode 100644
index 0000000000..d1abf622ad
--- /dev/null
+++ b/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c
@@ -0,0 +1,737 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"
+
+static const uint8_t mc_filt_mask_arr[16 * 3] = {
+  /* 8 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+  /* 4 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+  /* 4 width cases */
+  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
+static void common_hv_8ht_8vt_and_aver_dst_4w_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) {
+  uint32_t loop_cnt = height >> 2;
+  uint8_t *dst_tmp = dst;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+  __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+  __m128i out0, out1;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+  uint8_t *_src = (uint8_t *)src - 3 - src_stride3;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 16);
+  DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4,
+            filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+
+  src0 = __lsx_vld(_src, 0);
+  DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(_src, src_stride3);
+  _src += src_stride4;
+  src4 = __lsx_vld(_src, 0);
+  DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
+  _src += src_stride3;
+
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+  src6 = __lsx_vxori_b(src6, 128);
+
+  tmp0 = horiz_8tap_filt(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  tmp2 = horiz_8tap_filt(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  tmp4 = horiz_8tap_filt(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  tmp5 = horiz_8tap_filt(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3);
+  DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
+            filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+  DUP2_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
+  tmp2 = __lsx_vpackev_b(tmp5, tmp4);
+  for (; loop_cnt--;) {
+    src7 = __lsx_vld(_src, 0);
+    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
+    src10 = __lsx_vldx(_src, src_stride3);
+    _src += src_stride4;
+    src2 = __lsx_vldrepl_w(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src3 = __lsx_vldrepl_w(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src4 = __lsx_vldrepl_w(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src5 = __lsx_vldrepl_w(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    DUP2_ARG2(__lsx_vilvl_w, src3, src2, src5, src4, src2, src3);
+    src2 = __lsx_vilvl_d(src3, src2);
+    DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+              src8, src9, src10);
+    tmp3 = horiz_8tap_filt(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
+                           filt_hz1, filt_hz2, filt_hz3);
+    tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff);
+    tmp4 = __lsx_vpackev_b(tmp3, tmp4);
+    out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    src1 = horiz_8tap_filt(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
+                           filt_hz1, filt_hz2, filt_hz3);
+    src0 = __lsx_vshuf_b(src1, tmp3, shuff);
+    src0 = __lsx_vpackev_b(src1, src0);
+    out1 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    out0 = __lsx_vssrarni_b_h(out1, out0, FILTER_BITS);
+    out0 = __lsx_vxori_b(out0, 128);
+    out0 = __lsx_vavgr_bu(out0, src2);
+    __lsx_vstelm_w(out0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 2);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 3);
+    dst += dst_stride;
+
+    tmp5 = src1;
+    tmp0 = tmp2;
+    tmp1 = tmp4;
+    tmp2 = src0;
+  }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_8w_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) {
+  uint32_t loop_cnt = height >> 2;
+  uint8_t *dst_tmp = dst;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+  __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
+  __m128i out0, out1;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+  uint8_t *_src = (uint8_t *)src - 3 - src_stride3;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+  DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4,
+            filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+
+  src0 = __lsx_vld(_src, 0);
+  DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(_src, src_stride3);
+  _src += src_stride4;
+  src4 = __lsx_vld(_src, 0);
+  DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
+  _src += src_stride3;
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+  src6 = __lsx_vxori_b(src6, 128);
+
+  src0 = horiz_8tap_filt(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  src1 = horiz_8tap_filt(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  src2 = horiz_8tap_filt(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  src3 = horiz_8tap_filt(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  src4 = horiz_8tap_filt(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  src5 = horiz_8tap_filt(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  src6 = horiz_8tap_filt(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+
+  DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
+            filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+  DUP4_ARG2(__lsx_vpackev_b, src1, src0, src3, src2, src5, src4, src2, src1,
+            tmp0, tmp1, tmp2, tmp4);
+  DUP2_ARG2(__lsx_vpackev_b, src4, src3, src6, src5, tmp5, tmp6);
+
+  for (; loop_cnt--;) {
+    src7 = __lsx_vld(_src, 0);
+    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
+    src10 = __lsx_vldx(_src, src_stride3);
+    _src += src_stride4;
+
+    DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+              src8, src9, src10);
+    src7 = horiz_8tap_filt(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
+                           filt_hz1, filt_hz2, filt_hz3);
+    tmp3 = __lsx_vpackev_b(src7, src6);
+    out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    src8 = horiz_8tap_filt(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
+                           filt_hz1, filt_hz2, filt_hz3);
+    src0 = __lsx_vpackev_b(src8, src7);
+    out1 = filt_8tap_dpadd_s_h(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    src9 = horiz_8tap_filt(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
+                           filt_hz1, filt_hz2, filt_hz3);
+    src1 = __lsx_vpackev_b(src9, src8);
+    src3 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    src10 = horiz_8tap_filt(src10, src10, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+    src2 = __lsx_vpackev_b(src10, src9);
+    src4 = filt_8tap_dpadd_s_h(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, FILTER_BITS, src4, src3,
+              FILTER_BITS, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    src5 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src7 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src8 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src9 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    DUP2_ARG2(__lsx_vilvl_d, src7, src5, src9, src8, src5, src7);
+    DUP2_ARG2(__lsx_vavgr_bu, out0, src5, out1, src7, out0, out1);
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(out0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(out1, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(out1, dst, 0, 1);
+    dst += dst_stride;
+
+    src6 = src10;
+    tmp0 = tmp2;
+    tmp1 = tmp3;
+    tmp2 = src1;
+    tmp4 = tmp6;
+    tmp5 = src0;
+    tmp6 = src2;
+  }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_16w_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) {
+  common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride,
+                                        filter_horiz, filter_vert, height);
+  src += 8;
+  dst += 8;
+
+  common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride,
+                                        filter_horiz, filter_vert, height);
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_32w_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) {
+  int32_t multiple8_cnt;
+
+  for (multiple8_cnt = 4; multiple8_cnt--;) {
+    common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride,
+                                          filter_horiz, filter_vert, height);
+    src += 8;
+    dst += 8;
+  }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_64w_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) {
+  int32_t multiple8_cnt;
+
+  for (multiple8_cnt = 8; multiple8_cnt--;) {
+    common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride,
+                                          filter_horiz, filter_vert, height);
+
+    src += 8;
+    dst += 8;
+  }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_4x4_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert) {
+  __m128i src0, src1, src2, src3, src4, mask;
+  __m128i filt_hz, filt_vt, vec0, vec1;
+  __m128i dst0, dst1, dst2, dst3;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 16);
+  /* rearranging filter */
+  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt);
+
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+
+  hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz);
+  hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz);
+  hz_out4 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+  hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff);
+  hz_out3 = __lsx_vpickod_d(hz_out4, hz_out2);
+  DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+
+  dst0 = __lsx_vldrepl_w(dst, 0);
+  dst1 = __lsx_vldrepl_w(dst + dst_stride, 0);
+  dst2 = __lsx_vldrepl_w(dst + dst_stride2, 0);
+  dst3 = __lsx_vldrepl_w(dst + dst_stride3, 0);
+  dst0 = __lsx_vilvl_w(dst1, dst0);
+  dst1 = __lsx_vilvl_w(dst3, dst2);
+  dst0 = __lsx_vilvl_d(dst1, dst0);
+  DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+  tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+  tmp0 = __lsx_vavgr_bu(tmp0, dst0);
+  __lsx_vstelm_w(tmp0, dst, 0, 0);
+  __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1);
+  __lsx_vstelm_w(tmp0, dst + dst_stride2, 0, 2);
+  __lsx_vstelm_w(tmp0, dst + dst_stride3, 0, 3);
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_4x8_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert) {
+  uint8_t *dst_tmp = dst;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
+  __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  __m128i hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
+  __m128i dst0, dst1, dst2, dst3, dst4;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 16);
+
+  /* rearranging filter */
+  filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+  filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+  src += src_stride4;
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src5, src6, src7, src8);
+  src += src_stride4;
+
+  hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz);
+  hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz);
+  hz_out4 = horiz_2tap_filt_uh(src4, src5, mask, filt_hz);
+  hz_out6 = horiz_2tap_filt_uh(src6, src7, mask, filt_hz);
+  hz_out8 = horiz_2tap_filt_uh(src8, src8, mask, filt_hz);
+  DUP2_ARG3(__lsx_vshuf_b, hz_out2, hz_out0, shuff, hz_out4, hz_out2, shuff,
+            hz_out1, hz_out3);
+  hz_out5 = __lsx_vshuf_b(hz_out6, hz_out4, shuff);
+  hz_out7 = __lsx_vpickod_d(hz_out8, hz_out6);
+
+  dst0 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst0 = __lsx_vilvl_w(dst1, dst0);
+  dst1 = __lsx_vilvl_w(dst3, dst2);
+  dst0 = __lsx_vilvl_d(dst1, dst0);
+
+  dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst4 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst1 = __lsx_vilvl_w(dst2, dst1);
+  dst2 = __lsx_vilvl_w(dst4, dst3);
+  dst1 = __lsx_vilvl_d(dst2, dst1);
+
+  DUP4_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, hz_out5,
+            hz_out4, hz_out7, hz_out6, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, vec2, filt_vt, vec3,
+            filt_vt, tmp0, tmp1, tmp2, tmp3);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+            FILTER_BITS, res0, res1);
+  DUP2_ARG2(__lsx_vavgr_bu, res0, dst0, res1, dst1, res0, res1);
+
+  __lsx_vstelm_w(res0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(res0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(res0, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(res0, dst, 0, 3);
+  dst += dst_stride;
+
+  __lsx_vstelm_w(res1, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(res1, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(res1, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(res1, dst, 0, 3);
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_4w_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+  if (height == 4) {
+    common_hv_2ht_2vt_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride,
+                                           filter_horiz, filter_vert);
+  } else if (height == 8) {
+    common_hv_2ht_2vt_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride,
+                                           filter_horiz, filter_vert);
+  }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_8x4_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert) {
+  __m128i src0, src1, src2, src3, src4, mask;
+  __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+  __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+  __m128i dst0, dst1, dst2, dst3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  uint8_t *dst_tmp = dst;
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+  /* rearranging filter */
+  filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+  filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+
+  dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+  DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+  hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+  hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+  vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+  tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+  hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
+  vec1 = __lsx_vpackev_b(hz_out0, hz_out1);
+  tmp1 = __lsx_vdp2_h_bu(vec1, filt_vt);
+
+  hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
+  vec2 = __lsx_vpackev_b(hz_out1, hz_out0);
+  tmp2 = __lsx_vdp2_h_bu(vec2, filt_vt);
+
+  hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+  vec3 = __lsx_vpackev_b(hz_out0, hz_out1);
+  tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+            FILTER_BITS, tmp0, tmp1);
+  AVG_ST4_D(tmp0, tmp1, dst0, dst1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+  uint32_t loop_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, src4, mask;
+  __m128i filt_hz, filt_vt, vec0;
+  __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+  __m128i dst0, dst1, dst2, dst3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  uint8_t *dst_tmp = dst;
+
+  /* rearranging filter */
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+  filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+  src0 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+
+  for (; loop_cnt--;) {
+    src1 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+    src4 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+    vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+    tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+    hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
+    vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+    tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+    hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
+    vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+    tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+    hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+    vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+    tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt);
+    DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+              FILTER_BITS, tmp0, tmp1);
+
+    dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+    AVG_ST4_D(tmp0, tmp1, dst0, dst1, dst, dst_stride);
+    dst += dst_stride;
+  }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_8w_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+  if (height == 4) {
+    common_hv_2ht_2vt_and_aver_dst_8x4_lsx(src, src_stride, dst, dst_stride,
+                                           filter_horiz, filter_vert);
+  } else {
+    common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx(
+        src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height);
+  }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_16w_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+  uint8_t *src_tmp1;
+  uint32_t loop_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  __m128i filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1, tmp3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride << 2;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+  /* rearranging filter */
+  filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+  filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+  DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+  src += src_stride;
+
+  hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+  hz_out2 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+
+  for (; loop_cnt--;) {
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4);
+    src6 = __lsx_vldx(src, src_stride3);
+    src_tmp1 = (uint8_t *)(src + 8);
+    src1 = __lsx_vld(src_tmp1, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3,
+              src5);
+    src7 = __lsx_vldx(src_tmp1, src_stride3);
+    src += src_stride4;
+    dst0 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2);
+    dst3 = __lsx_vldx(dst, dst_stride3);
+
+    hz_out1 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+    hz_out3 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+    tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp3 = __lsx_vavgr_bu(tmp3, dst0);
+    __lsx_vst(tmp3, dst, 0);
+
+    hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
+    hz_out2 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+    tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp3 = __lsx_vavgr_bu(tmp3, dst1);
+    __lsx_vstx(tmp3, dst, dst_stride);
+
+    hz_out1 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+    hz_out3 = horiz_2tap_filt_uh(src5, src5, mask, filt_hz);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+    tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp3 = __lsx_vavgr_bu(tmp3, dst2);
+    __lsx_vstx(tmp3, dst, dst_stride2);
+
+    hz_out0 = horiz_2tap_filt_uh(src6, src6, mask, filt_hz);
+    hz_out2 = horiz_2tap_filt_uh(src7, src7, mask, filt_hz);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+    tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp3 = __lsx_vavgr_bu(tmp3, dst3);
+    __lsx_vstx(tmp3, dst, dst_stride3);
+    dst += dst_stride4;
+  }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_32w_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+  common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, src_stride, dst, dst_stride,
+                                         filter_horiz, filter_vert, height);
+  src += 16;
+  dst += 16;
+
+  common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, src_stride, dst, dst_stride,
+                                         filter_horiz, filter_vert, height);
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_64w_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+  int32_t multiple8_cnt;
+  for (multiple8_cnt = 4; multiple8_cnt--;) {
+    common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, src_stride, dst, dst_stride,
+                                           filter_horiz, filter_vert, height);
+    src += 16;
+    dst += 16;
+  }
+}
+
+void vpx_convolve8_avg_lsx(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const InterpKernel *filter, int x0_q4, int x_step_q4,
+                           int y0_q4, int y_step_q4, int w, int h) {
+  const int16_t *const filter_x = filter[x0_q4];
+  const int16_t *const filter_y = filter[y0_q4];
+  int8_t cnt, filt_hor[8], filt_ver[8];
+
+  assert(x_step_q4 == 16);
+  assert(y_step_q4 == 16);
+  assert(((const int32_t *)filter_x)[1] != 0x800000);
+  assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+  for (cnt = 0; cnt < 8; ++cnt) {
+    filt_hor[cnt] = filter_x[cnt];
+    filt_ver[cnt] = filter_y[cnt];
+  }
+  if (vpx_get_filter_taps(filter_x) == 2 &&
+      vpx_get_filter_taps(filter_y) == 2) {
+    switch (w) {
+      case 4:
+        common_hv_2ht_2vt_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst,
+                                              (int32_t)dst_stride, &filt_hor[3],
+                                              &filt_ver[3], h);
+        break;
+      case 8:
+        common_hv_2ht_2vt_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst,
+                                              (int32_t)dst_stride, &filt_hor[3],
+                                              &filt_ver[3], h);
+        break;
+      case 16:
+        common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst,
+                                               (int32_t)dst_stride,
+                                               &filt_hor[3], &filt_ver[3], h);
+        break;
+      case 32:
+        common_hv_2ht_2vt_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst,
+                                               (int32_t)dst_stride,
+                                               &filt_hor[3], &filt_ver[3], h);
+        break;
+      case 64:
+        common_hv_2ht_2vt_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst,
+                                               (int32_t)dst_stride,
+                                               &filt_hor[3], &filt_ver[3], h);
+        break;
+      default:
+        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                            x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  } else if (vpx_get_filter_taps(filter_x) == 2 ||
+             vpx_get_filter_taps(filter_y) == 2) {
+    vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                        x_step_q4, y0_q4, y_step_q4, w, h);
+  } else {
+    switch (w) {
+      case 4:
+        common_hv_8ht_8vt_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst,
+                                              (int32_t)dst_stride, filt_hor,
+                                              filt_ver, h);
+        break;
+      case 8:
+        common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst,
+                                              (int32_t)dst_stride, filt_hor,
+                                              filt_ver, h);
+        break;
+      case 16:
+        common_hv_8ht_8vt_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst,
+                                               (int32_t)dst_stride, filt_hor,
+                                               filt_ver, h);
+        break;
+      case 32:
+        common_hv_8ht_8vt_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst,
+                                               (int32_t)dst_stride, filt_hor,
+                                               filt_ver, h);
+        break;
+      case 64:
+        common_hv_8ht_8vt_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst,
+                                               (int32_t)dst_stride, filt_hor,
+                                               filt_ver, h);
+        break;
+      default:
+        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                            x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  }
+}
diff --git a/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c
new file mode 100644
index 0000000000..5c6413df44
--- /dev/null
+++ b/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c
@@ -0,0 +1,918 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"
+
+static void common_vt_8t_and_aver_dst_4w_lsx(const uint8_t *src,
+                                             int32_t src_stride, uint8_t *dst,
+                                             int32_t dst_stride,
+                                             const int8_t *filter,
+                                             int32_t height) {
+  uint32_t loop_cnt = (height >> 2);
+  uint8_t *dst_tmp = dst;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+  __m128i reg0, reg1, reg2, reg3, reg4;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i out0, out1;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+  uint8_t *src_tmp0 = (uint8_t *)src - src_stride3;
+
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+  src0 = __lsx_vld(src_tmp0, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src1,
+            src2);
+  src3 = __lsx_vldx(src_tmp0, src_stride3);
+  src_tmp0 += src_stride4;
+  src4 = __lsx_vld(src_tmp0, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src5,
+            src6);
+  src_tmp0 += src_stride3;
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, tmp0,
+            tmp1, tmp2, tmp3);
+  DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, tmp4, tmp5);
+  DUP2_ARG2(__lsx_vilvl_d, tmp3, tmp0, tmp4, tmp1, reg0, reg1);
+  reg2 = __lsx_vilvl_d(tmp5, tmp2);
+  DUP2_ARG2(__lsx_vxori_b, reg0, 128, reg1, 128, reg0, reg1);
+  reg2 = __lsx_vxori_b(reg2, 128);
+
+  for (; loop_cnt--;) {
+    src7 = __lsx_vld(src_tmp0, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src8,
+              src9);
+    src10 = __lsx_vldx(src_tmp0, src_stride3);
+    src_tmp0 += src_stride4;
+    src0 = __lsx_vldrepl_w(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src1 = __lsx_vldrepl_w(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src2 = __lsx_vldrepl_w(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src3 = __lsx_vldrepl_w(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    DUP2_ARG2(__lsx_vilvl_w, src1, src0, src3, src2, src0, src1);
+    src0 = __lsx_vilvl_d(src1, src0);
+    DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+              tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4);
+    DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4);
+    out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, reg3, filter0, filter1,
+                               filter2, filter3);
+    out1 = filt_8tap_dpadd_s_h(reg1, reg2, reg3, reg4, filter0, filter1,
+                               filter2, filter3);
+    out0 = __lsx_vssrarni_b_h(out1, out0, 7);
+    out0 = __lsx_vxori_b(out0, 128);
+    out0 = __lsx_vavgr_bu(out0, src0);
+    __lsx_vstelm_w(out0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 2);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 3);
+    dst += dst_stride;
+    reg0 = reg2;
+    reg1 = reg3;
+    reg2 = reg4;
+    src6 = src10;
+  }
+}
+
+static void common_vt_8t_and_aver_dst_8w_lsx(const uint8_t *src,
+                                             int32_t src_stride, uint8_t *dst,
+                                             int32_t dst_stride,
+                                             const int8_t *filter,
+                                             int32_t height) {
+  uint32_t loop_cnt = height >> 2;
+  uint8_t *dst_tmp = dst;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i out0, out1, out2, out3;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+  uint8_t *src_tmp0 = (uint8_t *)src - src_stride3;
+
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  src0 = __lsx_vld(src_tmp0, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src1,
+            src2);
+  src3 = __lsx_vldx(src_tmp0, src_stride3);
+  src_tmp0 += src_stride4;
+  src4 = __lsx_vld(src_tmp0, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src5,
+            src6);
+  src_tmp0 += src_stride3;
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+  src6 = __lsx_vxori_b(src6, 128);
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, reg0,
+            reg1, reg2, reg3);
+  DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
+
+  for (; loop_cnt--;) {
+    src7 = __lsx_vld(src_tmp0, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src8,
+              src9);
+    src10 = __lsx_vldx(src_tmp0, src_stride3);
+    src_tmp0 += src_stride4;
+    src0 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src1 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src2 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src3 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    DUP2_ARG2(__lsx_vilvl_d, src1, src0, src3, src2, src0, src1);
+    DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+              src8, src9, src10);
+    DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+              tmp0, tmp1, tmp2, tmp3);
+    out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, tmp0, filter0, filter1,
+                               filter2, filter3);
+    out1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, tmp1, filter0, filter1,
+                               filter2, filter3);
+    out2 = filt_8tap_dpadd_s_h(reg1, reg2, tmp0, tmp2, filter0, filter1,
+                               filter2, filter3);
+    out3 = filt_8tap_dpadd_s_h(reg4, reg5, tmp1, tmp3, filter0, filter1,
+                               filter2, filter3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    DUP2_ARG2(__lsx_vavgr_bu, out0, src0, out1, src1, out0, out1);
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(out0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(out1, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(out1, dst, 0, 1);
+    dst += dst_stride;
+    reg0 = reg2;
+    reg1 = tmp0;
+    reg2 = tmp2;
+    reg3 = reg5;
+    reg4 = tmp1;
+    reg5 = tmp3;
+    src6 = src10;
+  }
+}
+
+static void common_vt_8t_and_aver_dst_16w_mult_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    const int8_t *filter, int32_t height, int32_t width) {
+  uint8_t *src_tmp;
+  uint32_t cnt = width >> 4;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5;
+  __m128i reg6, reg7, reg8, reg9, reg10, reg11;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+  uint8_t *src_tmp0 = (uint8_t *)src - src_stride3;
+
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+  for (; cnt--;) {
+    uint32_t loop_cnt = height >> 2;
+    uint8_t *dst_reg = dst;
+
+    src_tmp = src_tmp0;
+    src0 = __lsx_vld(src_tmp, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src1,
+              src2);
+    src3 = __lsx_vldx(src_tmp, src_stride3);
+    src_tmp += src_stride4;
+    src4 = __lsx_vld(src_tmp, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5,
+              src6);
+    src_tmp += src_stride3;
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+    src6 = __lsx_vxori_b(src6, 128);
+    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
+              reg0, reg1, reg2, reg3);
+    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
+    DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
+              reg6, reg7, reg8, reg9);
+    DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11);
+    for (; loop_cnt--;) {
+      src7 = __lsx_vld(src_tmp, 0);
+      DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src8,
+                src9);
+      src10 = __lsx_vldx(src_tmp, src_stride3);
+      src_tmp += src_stride4;
+      DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
+                src7, src8, src9, src10);
+      DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+                src0, src1, src2, src3);
+      DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9,
+                src4, src5, src7, src8);
+      tmp0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, src0, filter0, filter1,
+                                 filter2, filter3);
+      tmp1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, src1, filter0, filter1,
+                                 filter2, filter3);
+      tmp2 = filt_8tap_dpadd_s_h(reg6, reg7, reg8, src4, filter0, filter1,
+                                 filter2, filter3);
+      tmp3 = filt_8tap_dpadd_s_h(reg9, reg10, reg11, src5, filter0, filter1,
+                                 filter2, filter3);
+      DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
+      DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+      tmp2 = __lsx_vld(dst_reg, 0);
+      tmp3 = __lsx_vldx(dst_reg, dst_stride);
+      DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp2, tmp1, tmp3, tmp0, tmp1);
+      __lsx_vst(tmp0, dst_reg, 0);
+      __lsx_vstx(tmp1, dst_reg, dst_stride);
+      tmp0 = filt_8tap_dpadd_s_h(reg1, reg2, src0, src2, filter0, filter1,
+                                 filter2, filter3);
+      tmp1 = filt_8tap_dpadd_s_h(reg4, reg5, src1, src3, filter0, filter1,
+                                 filter2, filter3);
+      tmp2 = filt_8tap_dpadd_s_h(reg7, reg8, src4, src7, filter0, filter1,
+                                 filter2, filter3);
+      tmp3 = filt_8tap_dpadd_s_h(reg10, reg11, src5, src8, filter0, filter1,
+                                 filter2, filter3);
+      DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
+      DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+      tmp2 = __lsx_vldx(dst_reg, dst_stride2);
+      tmp3 = __lsx_vldx(dst_reg, dst_stride3);
+      DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp2, tmp1, tmp3, tmp0, tmp1);
+      __lsx_vstx(tmp0, dst_reg, dst_stride2);
+      __lsx_vstx(tmp1, dst_reg, dst_stride3);
+      dst_reg += dst_stride4;
+
+      reg0 = reg2;
+      reg1 = src0;
+      reg2 = src2;
+      reg3 = reg5;
+      reg4 = src1;
+      reg5 = src3;
+      reg6 = reg8;
+      reg7 = src4;
+      reg8 = src7;
+      reg9 = reg11;
+      reg10 = src5;
+      reg11 = src8;
+      src6 = src10;
+    }
+    src_tmp0 += 16;
+    dst += 16;
+  }
+}
+
+static void common_vt_8t_and_aver_dst_16w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height) {
+  common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride,
+                                         filter, height, 16);
+}
+
+static void common_vt_8t_and_aver_dst_32w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height) {
+  common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride,
+                                         filter, height, 32);
+}
+
+static void common_vt_8t_and_aver_dst_64w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height) {
+  common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride,
+                                         filter, height, 64);
+}
+
+static void common_vt_2t_and_aver_dst_4x4_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  __m128i src0, src1, src2, src3, src4;
+  __m128i dst0, dst1, dst2, dst3, out, filt0, src2110, src4332;
+  __m128i src10_r, src32_r, src21_r, src43_r;
+  __m128i tmp0, tmp1;
+  uint8_t *dst_tmp = dst;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride3);
+  src += src_stride4;
+  src4 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  dst0 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst0 = __lsx_vilvl_w(dst1, dst0);
+  dst1 = __lsx_vilvl_w(dst3, dst2);
+  dst0 = __lsx_vilvl_d(dst1, dst0);
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
+            src10_r, src21_r, src32_r, src43_r);
+  DUP2_ARG2(__lsx_vilvl_d, src21_r, src10_r, src43_r, src32_r, src2110,
+            src4332);
+  DUP2_ARG2(__lsx_vdp2_h_bu, src2110, filt0, src4332, filt0, tmp0, tmp1);
+  tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+  out = __lsx_vavgr_bu(tmp0, dst0);
+  __lsx_vstelm_w(out, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(out, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(out, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(out, dst, 0, 3);
+  dst += dst_stride;
+}
+
+static void common_vt_2t_and_aver_dst_4x8_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  __m128i dst0, dst1, dst2, dst3, dst4;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r;
+  __m128i src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+  __m128i src2110, src4332, src6554, src8776, filt0;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  uint8_t *dst_tmp = dst;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride3);
+  src += src_stride4;
+  src4 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6);
+  src7 = __lsx_vldx(src, src_stride3);
+  src += src_stride4;
+  src8 = __lsx_vld(src, 0);
+
+  dst0 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst0 = __lsx_vilvl_w(dst1, dst0);
+  dst1 = __lsx_vilvl_w(dst3, dst2);
+  dst0 = __lsx_vilvl_d(dst1, dst0);
+
+  dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst4 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst1 = __lsx_vilvl_w(dst2, dst1);
+  dst2 = __lsx_vilvl_w(dst4, dst3);
+  dst1 = __lsx_vilvl_d(dst2, dst1);
+
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
+            src10_r, src21_r, src32_r, src43_r);
+  DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
+            src54_r, src65_r, src76_r, src87_r);
+  DUP4_ARG2(__lsx_vilvl_d, src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
+            src87_r, src76_r, src2110, src4332, src6554, src8776);
+  DUP4_ARG2(__lsx_vdp2_h_bu, src2110, filt0, src4332, filt0, src6554, filt0,
+            src8776, filt0, tmp0, tmp1, tmp2, tmp3);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+            FILTER_BITS, tmp0, tmp2);
+  DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp2, dst1, tmp0, tmp2);
+  __lsx_vstelm_w(tmp0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(tmp0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(tmp0, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(tmp0, dst, 0, 3);
+  dst += dst_stride;
+
+  __lsx_vstelm_w(tmp2, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(tmp2, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(tmp2, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(tmp2, dst, 0, 3);
+}
+
+static void common_vt_2t_and_aver_dst_4w_lsx(const uint8_t *src,
+                                             int32_t src_stride, uint8_t *dst,
+                                             int32_t dst_stride, int8_t *filter,
+                                             int32_t height) {
+  if (height == 4) {
+    common_vt_2t_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else if (height == 8) {
+    common_vt_2t_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride, filter);
+  }
+}
+
+static void common_vt_2t_and_aver_dst_8x4_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  __m128i src0, src1, src2, src3, src4;
+  __m128i dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  uint8_t *dst_tmp = dst;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+  dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+  DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec1);
+  DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec2, vec3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+            tmp0, tmp1, tmp2, tmp3);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+            FILTER_BITS, tmp0, tmp2);
+  DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp2, dst1, tmp0, tmp2);
+  __lsx_vstelm_d(tmp0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(tmp0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_d(tmp2, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(tmp2, dst, 0, 1);
+}
+
+static void common_vt_2t_and_aver_dst_8x8mult_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 3);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+  __m128i dst0, dst1, dst2, dst3, dst4, dst5;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  uint8_t *dst_tmp = dst;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+  src0 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  for (; loop_cnt--;) {
+    src1 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+    src4 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+    src5 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src6, src7);
+    src8 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+
+    dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst4 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst5 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    DUP2_ARG2(__lsx_vilvl_d, dst3, dst2, dst5, dst4, dst2, dst3);
+
+    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
+              vec0, vec1, vec2, vec3);
+    DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
+              vec4, vec5, vec6, vec7);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, tmp0, tmp1, tmp2, tmp3);
+
+    DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+              FILTER_BITS, tmp0, tmp2);
+    DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp2, dst1, tmp0, tmp2);
+    __lsx_vstelm_d(tmp0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp2, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp2, dst, 0, 1);
+    dst += dst_stride;
+
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+              filt0, tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+              FILTER_BITS, tmp0, tmp2);
+    DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst2, tmp2, dst3, tmp0, tmp2);
+    __lsx_vstelm_d(tmp0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp2, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp2, dst, 0, 1);
+    dst += dst_stride;
+
+    src0 = src8;
+  }
+}
+
+static void common_vt_2t_and_aver_dst_8w_lsx(const uint8_t *src,
+                                             int32_t src_stride, uint8_t *dst,
+                                             int32_t dst_stride, int8_t *filter,
+                                             int32_t height) {
+  if (height == 4) {
+    common_vt_2t_and_aver_dst_8x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else {
+    common_vt_2t_and_aver_dst_8x8mult_lsx(src, src_stride, dst, dst_stride,
+                                          filter, height);
+  }
+}
+
+static void common_vt_2t_and_aver_dst_16w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i tmp0, tmp1;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+  src0 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  for (; loop_cnt--;) {
+    src1 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+    src4 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    dst0 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2);
+    dst3 = __lsx_vldx(dst, dst_stride3);
+
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst0);
+    __lsx_vst(tmp0, dst, 0);
+    dst += dst_stride;
+
+    DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst1);
+    __lsx_vst(tmp0, dst, 0);
+    dst += dst_stride;
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst2);
+    __lsx_vst(tmp0, dst, 0);
+    dst += dst_stride;
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst3);
+    __lsx_vst(tmp0, dst, 0);
+    dst += dst_stride;
+
+    src0 = src4;
+  }
+}
+
+static void common_vt_2t_and_aver_dst_32w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 2);
+  uint8_t *src_tmp1;
+  uint8_t *dst_tmp1;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  __m128i tmp0, tmp1;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+  DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src5);
+  src += src_stride;
+
+  for (; loop_cnt--;) {
+    src1 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+    src4 = __lsx_vldx(src, src_stride3);
+
+    dst0 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2);
+    dst3 = __lsx_vldx(dst, dst_stride3);
+
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+
+    src_tmp1 = src + 16;
+    src6 = __lsx_vld(src_tmp1, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src7,
+              src8);
+    src9 = __lsx_vldx(src_tmp1, src_stride3);
+
+    dst_tmp1 = dst + 16;
+    dst4 = __lsx_vld(dst_tmp1, 0);
+    DUP2_ARG2(__lsx_vldx, dst_tmp1, dst_stride, dst_tmp1, dst_stride2, dst5,
+              dst6);
+    dst7 = __lsx_vldx(dst_tmp1, dst_stride3);
+    src += src_stride4;
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst0);
+    __lsx_vst(tmp0, dst, 0);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst1);
+    __lsx_vstx(tmp0, dst, dst_stride);
+
+    DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst2);
+    __lsx_vstx(tmp0, dst, dst_stride2);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst3);
+    __lsx_vstx(tmp0, dst, dst_stride3);
+
+    DUP2_ARG2(__lsx_vilvl_b, src6, src5, src7, src6, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src6, src5, src7, src6, vec1, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst4);
+    __lsx_vst(tmp0, dst, 16);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst5);
+    dst += dst_stride;
+    __lsx_vst(tmp0, dst, 16);
+
+    DUP2_ARG2(__lsx_vilvl_b, src8, src7, src9, src8, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src8, src7, src9, src8, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst6);
+    dst += dst_stride;
+    __lsx_vst(tmp0, dst, 16);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst7);
+    dst += dst_stride;
+    __lsx_vst(tmp0, dst, 16);
+    dst += dst_stride;
+
+    src0 = src4;
+    src5 = src9;
+  }
+}
+
+static void common_vt_2t_and_aver_dst_64w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 1);
+  int32_t src_stride2 = src_stride << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  uint8_t *src_tmp1;
+  uint8_t *dst_tmp1;
+  __m128i src0, src1, src2, src3, src4, src5;
+  __m128i src6, src7, src8, src9, src10, src11, filt0;
+  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i tmp0, tmp1;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+  DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src3, src6,
+            src9);
+  src += src_stride;
+
+  for (; loop_cnt--;) {
+    src2 = __lsx_vldx(src, src_stride);
+    dst1 = __lsx_vldx(dst, dst_stride);
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src1, src4, src7,
+              src10);
+    DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, dst0, dst2, dst4,
+              dst6);
+    src_tmp1 = (uint8_t *)src + 16;
+    src5 = __lsx_vldx(src_tmp1, src_stride);
+    src_tmp1 = src_tmp1 + 16;
+    src8 = __lsx_vldx(src_tmp1, src_stride);
+    src_tmp1 = src_tmp1 + 16;
+    src11 = __lsx_vldx(src_tmp1, src_stride);
+
+    dst_tmp1 = dst + 16;
+    dst3 = __lsx_vldx(dst_tmp1, dst_stride);
+    dst_tmp1 = dst + 32;
+    dst5 = __lsx_vldx(dst_tmp1, dst_stride);
+    dst_tmp1 = dst + 48;
+    dst7 = __lsx_vldx(dst_tmp1, dst_stride);
+    src += src_stride2;
+
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst0);
+    __lsx_vst(tmp0, dst, 0);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst1);
+    __lsx_vstx(tmp0, dst, dst_stride);
+
+    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src5, src4, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src4, src3, src5, src4, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst2);
+    __lsx_vst(tmp0, dst, 16);
+
+    dst_tmp1 = dst + 16;
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst3);
+    __lsx_vstx(tmp0, dst_tmp1, dst_stride);
+
+    DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, vec1, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst4);
+    __lsx_vst(tmp0, dst, 32);
+
+    dst_tmp1 = dst_tmp1 + 16;
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst5);
+    __lsx_vstx(tmp0, dst_tmp1, dst_stride);
+
+    DUP2_ARG2(__lsx_vilvl_b, src10, src9, src11, src10, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src10, src9, src11, src10, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst6);
+    __lsx_vst(tmp0, dst, 48);
+
+    dst_tmp1 = dst_tmp1 + 16;
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst7);
+    __lsx_vstx(tmp0, dst_tmp1, dst_stride);
+    dst += dst_stride2;
+
+    src0 = src2;
+    src3 = src5;
+    src6 = src8;
+    src9 = src11;
+  }
+}
+
+void vpx_convolve8_avg_vert_lsx(const uint8_t *src, ptrdiff_t src_stride,
+                                uint8_t *dst, ptrdiff_t dst_stride,
+                                const InterpKernel *filter, int x0_q4,
+                                int x_step_q4, int y0_q4, int y_step_q4, int w,
+                                int h) {
+  const int16_t *const filter_y = filter[y0_q4];
+  int8_t cnt, filt_ver[8];
+
+  assert(y_step_q4 == 16);
+  assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+  for (cnt = 0; cnt < 8; ++cnt) {
+    filt_ver[cnt] = filter_y[cnt];
+  }
+
+  if (vpx_get_filter_taps(filter_y) == 2) {
+    switch (w) {
+      case 4:
+        common_vt_2t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst,
+                                         (int32_t)dst_stride, &filt_ver[3], h);
+        break;
+      case 8:
+        common_vt_2t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst,
+                                         (int32_t)dst_stride, &filt_ver[3], h);
+        break;
+      case 16:
+        common_vt_2t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, &filt_ver[3], h);
+        break;
+      case 32:
+        common_vt_2t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, &filt_ver[3], h);
+        break;
+      case 64:
+        common_vt_2t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, &filt_ver[3], h);
+        break;
+      default:
+        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
+                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  } else {
+    switch (w) {
+      case 4:
+        common_vt_8t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst,
+                                         (int32_t)dst_stride, filt_ver, h);
+        break;
+      case 8:
+        common_vt_8t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst,
+                                         (int32_t)dst_stride, filt_ver, h);
+        break;
+      case 16:
+        common_vt_8t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, filt_ver, h);
+
+        break;
+      case 32:
+        common_vt_8t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, filt_ver, h);
+        break;
+      case 64:
+        common_vt_8t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, filt_ver, h);
+        break;
+      default:
+        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
+                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  }
+}
diff --git a/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c
new file mode 100644
index 0000000000..2c6459a978
--- /dev/null
+++ b/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c
@@ -0,0 +1,814 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"
+
+static const uint8_t mc_filt_mask_arr[16 * 3] = {
+  /* 8 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+  /* 4 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+  /* 4 width cases */
+  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
+static void common_hz_8t_4x4_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter) {
+  __m128i src0, src1, src2, src3;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i out, out0, out1;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 16);
+  src -= 3;
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+
+  LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filter0, filter1, filter2, filter3, out0, out1);
+  out = __lsx_vssrarni_b_h(out1, out0, 7);
+  out = __lsx_vxori_b(out, 128);
+  __lsx_vstelm_w(out, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(out, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(out, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(out, dst, 0, 3);
+}
+
+static void common_hz_8t_4x8_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter) {
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+  __m128i src0, src1, src2, src3;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i out0, out1, out2, out3;
+  uint8_t *_src = (uint8_t *)src - 3;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 16);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  src0 = __lsx_vld(_src, 0);
+  DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(_src, src_stride3);
+  _src += src_stride4;
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filter0, filter1, filter2, filter3, out0, out1);
+  src0 = __lsx_vld(_src, 0);
+  DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(_src, src_stride3);
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filter0, filter1, filter2, filter3, out2, out3);
+  DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+  DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+
+  __lsx_vstelm_w(out0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 3);
+  dst += dst_stride;
+  __lsx_vstelm_w(out1, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(out1, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(out1, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(out1, dst, 0, 3);
+}
+
+static void common_hz_8t_4w_lsx(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  if (height == 4) {
+    common_hz_8t_4x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else if (height == 8) {
+    common_hz_8t_4x8_lsx(src, src_stride, dst, dst_stride, filter);
+  }
+}
+
+static void common_hz_8t_8x4_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter) {
+  __m128i src0, src1, src2, src3;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i out0, out1, out2, out3;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+  src -= 3;
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filter0, filter1, filter2, filter3, out0, out1,
+                             out2, out3);
+  DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+  DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+  __lsx_vstelm_d(out0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(out0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_d(out1, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(out1, dst, 0, 1);
+}
+
+static void common_hz_8t_8x8mult_lsx(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = height >> 2;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+  __m128i src0, src1, src2, src3;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i out0, out1, out2, out3;
+  uint8_t *_src = (uint8_t *)src - 3;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  for (; loop_cnt--;) {
+    src0 = __lsx_vld(_src, 0);
+    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+    src3 = __lsx_vldx(_src, src_stride3);
+    _src += src_stride4;
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filter0, filter1, filter2, filter3, out0,
+                               out1, out2, out3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(out0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(out1, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(out1, dst, 0, 1);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_8t_8w_lsx(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  if (height == 4) {
+    common_hz_8t_8x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else {
+    common_hz_8t_8x8mult_lsx(src, src_stride, dst, dst_stride, filter, height);
+  }
+}
+
+static void common_hz_8t_16w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = height >> 1;
+  int32_t stride = src_stride << 1;
+  __m128i src0, src1, src2, src3;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i out0, out1, out2, out3;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+  src -= 3;
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  for (; loop_cnt--;) {
+    const uint8_t *_src = src + src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src0, src2);
+    DUP2_ARG2(__lsx_vld, src, 8, _src, 8, src1, src3);
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filter0, filter1, filter2, filter3, out0,
+                               out1, out2, out3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    __lsx_vst(out0, dst, 0);
+    dst += dst_stride;
+    __lsx_vst(out1, dst, 0);
+    dst += dst_stride;
+    src += stride;
+  }
+}
+
+static void common_hz_8t_32w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = height >> 1;
+  __m128i src0, src1, src2, src3;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i out0, out1, out2, out3;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+  src -= 3;
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  for (; loop_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
+    src3 = __lsx_vld(src, 24);
+    src1 = __lsx_vshuf_b(src2, src0, shuff);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filter0, filter1, filter2, filter3, out0,
+                               out1, out2, out3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    __lsx_vst(out0, dst, 0);
+    __lsx_vst(out1, dst, 16);
+
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
+    src3 = __lsx_vld(src, 24);
+    src1 = __lsx_vshuf_b(src2, src0, shuff);
+    src += src_stride;
+
+    dst += dst_stride;
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filter0, filter1, filter2, filter3, out0,
+                               out1, out2, out3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    __lsx_vst(out0, dst, 0);
+    __lsx_vst(out1, dst, 16);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_8t_64w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height) {
+  int32_t loop_cnt = height;
+  __m128i src0, src1, src2, src3;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i out0, out1, out2, out3;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+  src -= 3;
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  for (; loop_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
+    src3 = __lsx_vld(src, 24);
+    src1 = __lsx_vshuf_b(src2, src0, shuff);
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filter0, filter1, filter2, filter3, out0,
+                               out1, out2, out3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    __lsx_vst(out0, dst, 0);
+    __lsx_vst(out1, dst, 16);
+
+    DUP2_ARG2(__lsx_vld, src, 32, src, 48, src0, src2);
+    src3 = __lsx_vld(src, 56);
+    src1 = __lsx_vshuf_b(src2, src0, shuff);
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filter0, filter1, filter2, filter3, out0,
+                               out1, out2, out3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    __lsx_vst(out0, dst, 32);
+    __lsx_vst(out1, dst, 48);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_2t_4x4_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  __m128i src0, src1, src2, src3, mask;
+  __m128i filt0, vec0, vec1, vec2, vec3, res0, res1;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride + dst_stride2;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 16);
+  /* rearranging filter */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride3);
+  DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, vec0, vec1);
+  DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, vec3);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, vec2, vec2, FILTER_BITS, vec3, vec3,
+            FILTER_BITS, res0, res1);
+
+  __lsx_vstelm_w(res0, dst, 0, 0);
+  __lsx_vstelm_w(res0, dst + dst_stride, 0, 1);
+  __lsx_vstelm_w(res1, dst + dst_stride2, 0, 0);
+  __lsx_vstelm_w(res1, dst + dst_stride3, 0, 1);
+}
+
+static void common_hz_2t_4x8_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  __m128i res0, res1, res2, res3, filt0;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride + dst_stride2;
+
+  uint8_t *src_tmp1 = src + src_stride4;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 16);
+
+  /* rearranging filter */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+  DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src5,
+            src6);
+  src7 = __lsx_vldx(src_tmp1, src_stride3);
+
+  DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, src5, src4, mask,
+            src7, src6, mask, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+            vec4, vec5, vec6, vec7);
+  DUP4_ARG3(__lsx_vssrarni_bu_h, vec4, vec4, FILTER_BITS, vec5, vec5,
+            FILTER_BITS, vec6, vec6, FILTER_BITS, vec7, vec7, FILTER_BITS, res0,
+            res1, res2, res3);
+
+  __lsx_vstelm_w(res0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(res0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(res1, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(res1, dst, 0, 1);
+  dst += dst_stride;
+
+  __lsx_vstelm_w(res2, dst, 0, 0);
+  __lsx_vstelm_w(res2, dst + dst_stride, 0, 1);
+  __lsx_vstelm_w(res3, dst + dst_stride2, 0, 0);
+  __lsx_vstelm_w(res3, dst + dst_stride3, 0, 1);
+}
+
+static void common_hz_2t_4w_lsx(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  if (height == 4) {
+    common_hz_2t_4x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else if (height == 8) {
+    common_hz_2t_4x8_lsx(src, src_stride, dst, dst_stride, filter);
+  }
+}
+
+static void common_hz_2t_8x4_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  __m128i filt0, mask;
+  __m128i src0, src1, src2, src3;
+  __m128i vec0, vec1, vec2, vec3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* rearranging filter */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride3);
+
+  DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask,
+            src3, src3, mask, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+            vec0, vec1, vec2, vec3);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+            FILTER_BITS, vec0, vec1);
+
+  __lsx_vstelm_d(vec0, dst, 0, 0);
+  __lsx_vstelm_d(vec0, dst + dst_stride, 0, 1);
+  __lsx_vstelm_d(vec1, dst + dst_stride2, 0, 0);
+  __lsx_vstelm_d(vec1, dst + dst_stride3, 0, 1);
+}
+
+static void common_hz_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter, int32_t height) {
+  __m128i filt0, mask;
+  __m128i src0, src1, src2, src3, out0, out1;
+  __m128i vec0, vec1, vec2, vec3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* rearranging filter */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride3);
+  src += src_stride4;
+
+  DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask,
+            src3, src3, mask, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+            vec0, vec1, vec2, vec3);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+            FILTER_BITS, out0, out1);
+
+  __lsx_vstelm_d(out0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(out0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_d(out1, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(out1, dst, 0, 1);
+  dst += dst_stride;
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride3);
+  src += src_stride4;
+
+  DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask,
+            src3, src3, mask, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+            vec0, vec1, vec2, vec3);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+            FILTER_BITS, out0, out1);
+
+  __lsx_vstelm_d(out0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(out0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_d(out1, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(out1, dst, 0, 1);
+  dst += dst_stride;
+
+  if (height == 16) {
+    uint8_t *dst_tmp1 = dst + dst_stride4;
+
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+    src3 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2,
+              mask, src3, src3, mask, vec0, vec1, vec2, vec3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, vec0, vec1, vec2, vec3);
+    DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+              FILTER_BITS, out0, out1);
+
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0);
+    __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1);
+
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+    src3 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2,
+              mask, src3, src3, mask, vec0, vec1, vec2, vec3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, vec0, vec1, vec2, vec3);
+    DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+              FILTER_BITS, out0, out1);
+
+    __lsx_vstelm_d(out0, dst_tmp1, 0, 0);
+    __lsx_vstelm_d(out0, dst_tmp1 + dst_stride, 0, 1);
+    __lsx_vstelm_d(out1, dst_tmp1 + dst_stride2, 0, 0);
+    __lsx_vstelm_d(out1, dst_tmp1 + dst_stride3, 0, 1);
+  }
+}
+
+static void common_hz_2t_8w_lsx(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  if (height == 4) {
+    common_hz_2t_8x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else {
+    common_hz_2t_8x8mult_lsx(src, src_stride, dst, dst_stride, filter, height);
+  }
+}
+
+static void common_hz_2t_16w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 2) - 1;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i out0, out1, out2, out3, out4, out5, out6, out7;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  uint8_t *src_tmp1 = src + 8;
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4);
+  src6 = __lsx_vldx(src, src_stride3);
+  src1 = __lsx_vld(src_tmp1, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3,
+            src5);
+  src7 = __lsx_vldx(src_tmp1, src_stride3);
+  src += src_stride4;
+
+  DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask,
+            src3, src3, mask, vec0, vec1, vec2, vec3);
+  DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6, mask,
+            src7, src7, mask, vec4, vec5, vec6, vec7);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+            out0, out1, out2, out3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, filt0,
+            out4, out5, out6, out7);
+  DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+            FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, out0,
+            out1, out2, out3);
+
+  __lsx_vst(out0, dst, 0);
+  dst += dst_stride;
+  __lsx_vst(out1, dst, 0);
+  dst += dst_stride;
+  __lsx_vst(out2, dst, 0);
+  dst += dst_stride;
+  __lsx_vst(out3, dst, 0);
+  dst += dst_stride;
+
+  for (; loop_cnt--;) {
+    src_tmp1 += src_stride4;
+
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4);
+    src6 = __lsx_vldx(src, src_stride3);
+
+    src1 = __lsx_vld(src_tmp1, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3,
+              src5);
+    src7 = __lsx_vldx(src_tmp1, src_stride3);
+    src += src_stride4;
+
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2,
+              mask, src3, src3, mask, vec0, vec1, vec2, vec3);
+    DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6,
+              mask, src7, src7, mask, vec4, vec5, vec6, vec7);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, out0, out1, out2, out3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+              filt0, out4, out5, out6, out7);
+    DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+              FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
+              out0, out1, out2, out3);
+
+    __lsx_vst(out0, dst, 0);
+    dst += dst_stride;
+    __lsx_vst(out1, dst, 0);
+    dst += dst_stride;
+    __lsx_vst(out2, dst, 0);
+    dst += dst_stride;
+    __lsx_vst(out3, dst, 0);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_2t_32w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 1);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i out0, out1, out2, out3, out4, out5, out6, out7;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+  /* rearranging filter */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  for (; loop_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
+    src3 = __lsx_vld(src, 24);
+    src1 = __lsx_vshuf_b(src2, src0, shuff);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src4, src6);
+    src7 = __lsx_vld(src, 24);
+    src5 = __lsx_vshuf_b(src6, src4, shuff);
+    src += src_stride;
+
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2,
+              mask, src3, src3, mask, vec0, vec1, vec2, vec3);
+    DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6,
+              mask, src7, src7, mask, vec4, vec5, vec6, vec7);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, out0, out1, out2, out3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+              filt0, out4, out5, out6, out7);
+    DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+              FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
+              out0, out1, out2, out3);
+
+    __lsx_vst(out0, dst, 0);
+    __lsx_vst(out1, dst, 16);
+    dst += dst_stride;
+
+    __lsx_vst(out2, dst, 0);
+    __lsx_vst(out3, dst, 16);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_2t_64w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = height;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i out0, out1, out2, out3, out4, out5, out6, out7;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* rearranging filter */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  for (; loop_cnt--;) {
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src2, src4,
+              src6);
+    src7 = __lsx_vld(src, 56);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src0, shuff, src4, src2, shuff, src1, src3);
+    src5 = __lsx_vshuf_b(src6, src4, shuff);
+    src += src_stride;
+
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2,
+              mask, src3, src3, mask, vec0, vec1, vec2, vec3);
+    DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6,
+              mask, src7, src7, mask, vec4, vec5, vec6, vec7);
+
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, out0, out1, out2, out3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+              filt0, out4, out5, out6, out7);
+    DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+              FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
+              out0, out1, out2, out3);
+
+    __lsx_vst(out0, dst, 0);
+    __lsx_vst(out1, dst, 16);
+    __lsx_vst(out2, dst, 32);
+    __lsx_vst(out3, dst, 48);
+    dst += dst_stride;
+  }
+}
+
+void vpx_convolve8_horiz_lsx(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const InterpKernel *filter, int x0_q4,
+                             int x_step_q4, int y0_q4, int y_step_q4, int w,
+                             int h) {
+  const int16_t *const filter_x = filter[x0_q4];
+  int8_t cnt, filt_hor[8];
+
+  assert(x_step_q4 == 16);
+  assert(((const int32_t *)filter_x)[1] != 0x800000);
+
+  for (cnt = 0; cnt < 8; ++cnt) {
+    filt_hor[cnt] = filter_x[cnt];
+  }
+  if (vpx_get_filter_taps(filter_x) == 2) {
+    switch (w) {
+      case 4:
+        common_hz_2t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                            &filt_hor[3], h);
+        break;
+      case 8:
+        common_hz_2t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                            &filt_hor[3], h);
+        break;
+      case 16:
+        common_hz_2t_16w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             &filt_hor[3], h);
+        break;
+      case 32:
+        common_hz_2t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             &filt_hor[3], h);
+        break;
+      case 64:
+        common_hz_2t_64w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             &filt_hor[3], h);
+        break;
+      default:
+        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                              x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  } else {
+    switch (w) {
+      case 4:
+        common_hz_8t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                            filt_hor, h);
+        break;
+      case 8:
+        common_hz_8t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                            filt_hor, h);
+        break;
+
+      case 16:
+        common_hz_8t_16w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             filt_hor, h);
+        break;
+
+      case 32:
+        common_hz_8t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             filt_hor, h);
+        break;
+
+      case 64:
+        common_hz_8t_64w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             filt_hor, h);
+        break;
+      default:
+        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                              x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  }
+}
diff --git a/vpx_dsp/loongarch/vpx_convolve8_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_lsx.c
new file mode 100644
index 0000000000..9f5cd6cfe9
--- /dev/null
+++ b/vpx_dsp/loongarch/vpx_convolve8_lsx.c
@@ -0,0 +1,697 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"
+
+static const uint8_t mc_filt_mask_arr[16 * 3] = {
+  /* 8 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+  /* 4 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+  /* 4 width cases */
+  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
+static void common_hv_8ht_8vt_4w_lsx(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter_horiz, int8_t *filter_vert,
+                                     int32_t height) {
+  uint32_t loop_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+  __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+  __m128i out0, out1;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 16);
+  src -= (3 + 3 * src_stride);
+  DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4,
+            filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+
+  LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+  src += src_stride;
+  src4 = __lsx_vld(src, 0);
+  src += src_stride;
+  src5 = __lsx_vld(src, 0);
+  src += src_stride;
+  src6 = __lsx_vld(src, 0);
+  src += src_stride;
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+  src6 = __lsx_vxori_b(src6, 128);
+
+  tmp0 = horiz_8tap_filt(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  tmp2 = horiz_8tap_filt(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  tmp4 = horiz_8tap_filt(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  tmp5 = horiz_8tap_filt(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3);
+  DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
+            filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+  DUP2_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
+  tmp2 = __lsx_vpackev_b(tmp5, tmp4);
+
+  for (; loop_cnt--;) {
+    LSX_LD_4(src, src_stride, src7, src8, src9, src10);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+              src8, src9, src10);
+    tmp3 = horiz_8tap_filt(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
+                           filt_hz1, filt_hz2, filt_hz3);
+    tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff);
+    tmp4 = __lsx_vpackev_b(tmp3, tmp4);
+    out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    src1 = horiz_8tap_filt(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
+                           filt_hz1, filt_hz2, filt_hz3);
+    src0 = __lsx_vshuf_b(src1, tmp3, shuff);
+    src0 = __lsx_vpackev_b(src1, src0);
+    out1 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    out0 = __lsx_vssrarni_b_h(out1, out0, 7);
+    out0 = __lsx_vxori_b(out0, 128);
+    __lsx_vstelm_w(out0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 2);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 3);
+    dst += dst_stride;
+
+    tmp5 = src1;
+    tmp0 = tmp2;
+    tmp1 = tmp4;
+    tmp2 = src0;
+  }
+}
+
+static void common_hv_8ht_8vt_8w_lsx(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter_horiz, int8_t *filter_vert,
+                                     int32_t height) {
+  uint32_t loop_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+  __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
+  __m128i out0, out1;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+  src -= (3 + 3 * src_stride);
+  DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4,
+            filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+
+  LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+  src += src_stride;
+  src4 = __lsx_vld(src, 0);
+  src += src_stride;
+  src5 = __lsx_vld(src, 0);
+  src += src_stride;
+  src6 = __lsx_vld(src, 0);
+  src += src_stride;
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+  src6 = __lsx_vxori_b(src6, 128);
+
+  src0 = horiz_8tap_filt(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  src1 = horiz_8tap_filt(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  src2 = horiz_8tap_filt(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  src3 = horiz_8tap_filt(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  src4 = horiz_8tap_filt(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  src5 = horiz_8tap_filt(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  src6 = horiz_8tap_filt(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+
+  DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
+            filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+  DUP4_ARG2(__lsx_vpackev_b, src1, src0, src3, src2, src5, src4, src2, src1,
+            tmp0, tmp1, tmp2, tmp4);
+  DUP2_ARG2(__lsx_vpackev_b, src4, src3, src6, src5, tmp5, tmp6);
+
+  for (; loop_cnt--;) {
+    LSX_LD_4(src, src_stride, src7, src8, src9, src10);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+              src8, src9, src10);
+    src7 = horiz_8tap_filt(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
+                           filt_hz1, filt_hz2, filt_hz3);
+    tmp3 = __lsx_vpackev_b(src7, src6);
+    out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    src8 = horiz_8tap_filt(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
+                           filt_hz1, filt_hz2, filt_hz3);
+    src0 = __lsx_vpackev_b(src8, src7);
+    out1 = filt_8tap_dpadd_s_h(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    src9 = horiz_8tap_filt(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
+                           filt_hz1, filt_hz2, filt_hz3);
+    src1 = __lsx_vpackev_b(src9, src8);
+    src3 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    src10 = horiz_8tap_filt(src10, src10, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+    src2 = __lsx_vpackev_b(src10, src9);
+    src4 = filt_8tap_dpadd_s_h(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, src4, src3, 7, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(out0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(out1, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(out1, dst, 0, 1);
+    dst += dst_stride;
+
+    src6 = src10;
+    tmp0 = tmp2;
+    tmp1 = tmp3;
+    tmp2 = src1;
+    tmp4 = tmp6;
+    tmp5 = src0;
+    tmp6 = src2;
+  }
+}
+
+static void common_hv_8ht_8vt_16w_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t height) {
+  common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                           filter_vert, height);
+  src += 8;
+  dst += 8;
+
+  common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                           filter_vert, height);
+  src += 8;
+  dst += 8;
+}
+
+static void common_hv_8ht_8vt_32w_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t height) {
+  int32_t multiple8_cnt;
+  for (multiple8_cnt = 4; multiple8_cnt--;) {
+    common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                             filter_vert, height);
+    src += 8;
+    dst += 8;
+  }
+}
+
+static void common_hv_8ht_8vt_64w_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t height) {
+  int32_t multiple8_cnt;
+  for (multiple8_cnt = 8; multiple8_cnt--;) {
+    common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                             filter_vert, height);
+    src += 8;
+    dst += 8;
+  }
+}
+
+static void common_hv_2ht_2vt_4x4_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz,
+                                      int8_t *filter_vert) {
+  __m128i src0, src1, src2, src3, src4, mask;
+  __m128i filt_vt, filt_hz, vec0, vec1;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  mask = __lsx_vld(mc_filt_mask_arr, 16);
+
+  /* rearranging filter */
+  filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+  filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+  hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz);
+  hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz);
+  hz_out4 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+
+  hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff);
+  hz_out3 = __lsx_vpickod_d(hz_out4, hz_out2);
+
+  DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+  DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, tmp0, tmp0, FILTER_BITS, tmp1, tmp1,
+            FILTER_BITS, tmp0, tmp1);
+
+  __lsx_vstelm_w(tmp0, dst, 0, 0);
+  __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1);
+  __lsx_vstelm_w(tmp1, dst + dst_stride2, 0, 0);
+  __lsx_vstelm_w(tmp1, dst + dst_stride3, 0, 1);
+}
+
+static void common_hv_2ht_2vt_4x8_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz,
+                                      int8_t *filter_vert) {
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
+  __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  __m128i hz_out7, hz_out8, vec4, vec5, vec6, vec7;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 16);
+
+  /* rearranging filter */
+  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt);
+
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+  src += src_stride4;
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src5, src6, src7, src8);
+  src += src_stride4;
+
+  hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz);
+  hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz);
+  hz_out4 = horiz_2tap_filt_uh(src4, src5, mask, filt_hz);
+  hz_out6 = horiz_2tap_filt_uh(src6, src7, mask, filt_hz);
+  hz_out8 = horiz_2tap_filt_uh(src8, src8, mask, filt_hz);
+
+  DUP2_ARG3(__lsx_vshuf_b, hz_out2, hz_out0, shuff, hz_out4, hz_out2, shuff,
+            hz_out1, hz_out3);
+  hz_out5 = __lsx_vshuf_b(hz_out6, hz_out4, shuff);
+  hz_out7 = __lsx_vpickod_d(hz_out8, hz_out6);
+  DUP4_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, hz_out5,
+            hz_out4, hz_out7, hz_out6, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, vec2, filt_vt, vec3,
+            filt_vt, vec4, vec5, vec6, vec7);
+  DUP4_ARG3(__lsx_vssrarni_bu_h, vec4, vec4, FILTER_BITS, vec5, vec5,
+            FILTER_BITS, vec6, vec6, FILTER_BITS, vec7, vec7, FILTER_BITS, vec4,
+            vec5, vec6, vec7);
+
+  __lsx_vstelm_w(vec4, dst, 0, 0);
+  __lsx_vstelm_w(vec4, dst + dst_stride, 0, 1);
+  __lsx_vstelm_w(vec5, dst + dst_stride2, 0, 0);
+  __lsx_vstelm_w(vec5, dst + dst_stride3, 0, 1);
+  dst += dst_stride4;
+  __lsx_vstelm_w(vec6, dst, 0, 0);
+  __lsx_vstelm_w(vec6, dst + dst_stride, 0, 1);
+  __lsx_vstelm_w(vec7, dst + dst_stride2, 0, 0);
+  __lsx_vstelm_w(vec7, dst + dst_stride3, 0, 1);
+}
+
+static void common_hv_2ht_2vt_4w_lsx(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter_horiz, int8_t *filter_vert,
+                                     int32_t height) {
+  if (height == 4) {
+    common_hv_2ht_2vt_4x4_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                              filter_vert);
+  } else if (height == 8) {
+    common_hv_2ht_2vt_4x8_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                              filter_vert);
+  }
+}
+
+static void common_hv_2ht_2vt_8x4_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz,
+                                      int8_t *filter_vert) {
+  __m128i src0, src1, src2, src3, src4, mask;
+  __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+  __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* rearranging filter */
+  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt);
+
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+
+  hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+  hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+  vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+  tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+  hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
+  vec1 = __lsx_vpackev_b(hz_out0, hz_out1);
+  tmp1 = __lsx_vdp2_h_bu(vec1, filt_vt);
+
+  hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
+  vec2 = __lsx_vpackev_b(hz_out1, hz_out0);
+  tmp2 = __lsx_vdp2_h_bu(vec2, filt_vt);
+
+  hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+  vec3 = __lsx_vpackev_b(hz_out0, hz_out1);
+  tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt);
+
+  DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+            FILTER_BITS, tmp0, tmp1);
+
+  __lsx_vstelm_d(tmp0, dst, 0, 0);
+  __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
+  __lsx_vstelm_d(tmp1, dst + dst_stride2, 0, 0);
+  __lsx_vstelm_d(tmp1, dst + dst_stride3, 0, 1);
+}
+
+static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src,
+                                          int32_t src_stride, uint8_t *dst,
+                                          int32_t dst_stride,
+                                          int8_t *filter_horiz,
+                                          int8_t *filter_vert, int32_t height) {
+  uint32_t loop_cnt = (height >> 3);
+  __m128i src0, src1, src2, src3, src4, mask;
+  __m128i filt_hz, filt_vt, vec0;
+  __m128i hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* rearranging filter */
+  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt);
+
+  src0 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+
+  for (; loop_cnt--;) {
+    src1 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+    src4 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+    vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+    tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+    hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
+    vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+    tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+    hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
+    vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+    tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+    hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+    src1 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+    src4 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+    vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+    tmp4 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+    DUP2_ARG3(__lsx_vssrarni_bu_h, tmp2, tmp1, FILTER_BITS, tmp4, tmp3,
+              FILTER_BITS, tmp1, tmp2);
+
+    __lsx_vstelm_d(tmp1, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp1, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp2, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp2, dst, 0, 1);
+    dst += dst_stride;
+
+    hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+    vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+    tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+    hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
+    vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+    tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+    hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
+    vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+    tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+    hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+    vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+    tmp4 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+    DUP2_ARG3(__lsx_vssrarni_bu_h, tmp2, tmp1, FILTER_BITS, tmp4, tmp3,
+              FILTER_BITS, tmp1, tmp2);
+
+    __lsx_vstelm_d(tmp1, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp1, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp2, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp2, dst, 0, 1);
+    dst += dst_stride;
+  }
+}
+
+static void common_hv_2ht_2vt_8w_lsx(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter_horiz, int8_t *filter_vert,
+                                     int32_t height) {
+  if (height == 4) {
+    common_hv_2ht_2vt_8x4_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                              filter_vert);
+  } else {
+    common_hv_2ht_2vt_8x8mult_lsx(src, src_stride, dst, dst_stride,
+                                  filter_horiz, filter_vert, height);
+  }
+}
+
+static void common_hv_2ht_2vt_16w_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t height) {
+  uint32_t loop_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  __m128i filt_hz, filt_vt, vec0, vec1;
+  __m128i tmp, tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* rearranging filter */
+  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt);
+
+  DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+  src += src_stride;
+
+  hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+  hz_out2 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+
+  for (; loop_cnt--;) {
+    uint8_t *src_tmp0 = src + 8;
+
+    DUP2_ARG2(__lsx_vld, src, 0, src_tmp0, 0, src0, src1);
+    DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp0, src_stride, src,
+              src_stride2, src_tmp0, src_stride2, src2, src3, src4, src5);
+    DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp0, src_stride3, src6, src7);
+    src += src_stride4;
+
+    hz_out1 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+    hz_out3 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
+    tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+    dst += dst_stride;
+
+    hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
+    hz_out2 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
+    tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+    dst += dst_stride;
+
+    hz_out1 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+    hz_out3 = horiz_2tap_filt_uh(src5, src5, mask, filt_hz);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
+    tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+    dst += dst_stride;
+
+    hz_out0 = horiz_2tap_filt_uh(src6, src6, mask, filt_hz);
+    hz_out2 = horiz_2tap_filt_uh(src7, src7, mask, filt_hz);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
+    tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+    dst += dst_stride;
+  }
+}
+
+static void common_hv_2ht_2vt_32w_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t height) {
+  common_hv_2ht_2vt_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                            filter_vert, height);
+  src += 16;
+  dst += 16;
+
+  common_hv_2ht_2vt_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                            filter_vert, height);
+}
+
+static void common_hv_2ht_2vt_64w_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t height) {
+  int32_t multiple8_cnt;
+  for (multiple8_cnt = 4; multiple8_cnt--;) {
+    common_hv_2ht_2vt_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                              filter_vert, height);
+    src += 16;
+    dst += 16;
+  }
+}
+
+void vpx_convolve8_lsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                       ptrdiff_t dst_stride, const InterpKernel *filter,
+                       int x0_q4, int32_t x_step_q4, int y0_q4,
+                       int32_t y_step_q4, int32_t w, int32_t h) {
+  const int16_t *const filter_x = filter[x0_q4];
+  const int16_t *const filter_y = filter[y0_q4];
+  int8_t cnt, filt_hor[8], filt_ver[8];
+
+  assert(x_step_q4 == 16);
+  assert(y_step_q4 == 16);
+  assert(((const int32_t *)filter_x)[1] != 0x800000);
+  assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+  for (cnt = 0; cnt < 8; ++cnt) {
+    filt_hor[cnt] = filter_x[cnt];
+    filt_ver[cnt] = filter_y[cnt];
+  }
+
+  if (vpx_get_filter_taps(filter_x) == 2 &&
+      vpx_get_filter_taps(filter_y) == 2) {
+    switch (w) {
+      case 4:
+        common_hv_2ht_2vt_4w_lsx(src, (int32_t)src_stride, dst,
+                                 (int32_t)dst_stride, &filt_hor[3],
+                                 &filt_ver[3], (int32_t)h);
+        break;
+      case 8:
+        common_hv_2ht_2vt_8w_lsx(src, (int32_t)src_stride, dst,
+                                 (int32_t)dst_stride, &filt_hor[3],
+                                 &filt_ver[3], (int32_t)h);
+        break;
+      case 16:
+        common_hv_2ht_2vt_16w_lsx(src, (int32_t)src_stride, dst,
+                                  (int32_t)dst_stride, &filt_hor[3],
+                                  &filt_ver[3], (int32_t)h);
+        break;
+      case 32:
+        common_hv_2ht_2vt_32w_lsx(src, (int32_t)src_stride, dst,
+                                  (int32_t)dst_stride, &filt_hor[3],
+                                  &filt_ver[3], (int32_t)h);
+        break;
+      case 64:
+        common_hv_2ht_2vt_64w_lsx(src, (int32_t)src_stride, dst,
+                                  (int32_t)dst_stride, &filt_hor[3],
+                                  &filt_ver[3], (int32_t)h);
+        break;
+      default:
+        vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                        x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  } else if (vpx_get_filter_taps(filter_x) == 2 ||
+             vpx_get_filter_taps(filter_y) == 2) {
+    vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+                    y0_q4, y_step_q4, w, h);
+  } else {
+    switch (w) {
+      case 4:
+        common_hv_8ht_8vt_4w_lsx(src, (int32_t)src_stride, dst,
+                                 (int32_t)dst_stride, filt_hor, filt_ver,
+                                 (int32_t)h);
+        break;
+      case 8:
+        common_hv_8ht_8vt_8w_lsx(src, (int32_t)src_stride, dst,
+                                 (int32_t)dst_stride, filt_hor, filt_ver,
+                                 (int32_t)h);
+        break;
+      case 16:
+        common_hv_8ht_8vt_16w_lsx(src, (int32_t)src_stride, dst,
+                                  (int32_t)dst_stride, filt_hor, filt_ver,
+                                  (int32_t)h);
+        break;
+      case 32:
+        common_hv_8ht_8vt_32w_lsx(src, (int32_t)src_stride, dst,
+                                  (int32_t)dst_stride, filt_hor, filt_ver,
+                                  (int32_t)h);
+        break;
+      case 64:
+        common_hv_8ht_8vt_64w_lsx(src, (int32_t)src_stride, dst,
+                                  (int32_t)dst_stride, filt_hor, filt_ver,
+                                  (int32_t)h);
+        break;
+      default:
+        vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                        x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  }
+}
diff --git a/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c
new file mode 100644
index 0000000000..6022e43c83
--- /dev/null
+++ b/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c
@@ -0,0 +1,825 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"
+
+static void common_vt_8t_4w_lsx(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = height >> 2;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+  __m128i reg0, reg1, reg2, reg3, reg4;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i out0, out1;
+  uint8_t *_src = (uint8_t *)src - src_stride3;
+
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+  src0 = __lsx_vld(_src, 0);
+  DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(_src, src_stride3);
+  _src += src_stride4;
+  src4 = __lsx_vld(_src, 0);
+  DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
+  _src += src_stride3;
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, tmp0,
+            tmp1, tmp2, tmp3);
+  DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, tmp4, tmp5);
+  DUP2_ARG2(__lsx_vilvl_d, tmp3, tmp0, tmp4, tmp1, reg0, reg1);
+  reg2 = __lsx_vilvl_d(tmp5, tmp2);
+  DUP2_ARG2(__lsx_vxori_b, reg0, 128, reg1, 128, reg0, reg1);
+  reg2 = __lsx_vxori_b(reg2, 128);
+
+  for (; loop_cnt--;) {
+    src7 = __lsx_vld(_src, 0);
+    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
+    src10 = __lsx_vldx(_src, src_stride3);
+    _src += src_stride4;
+    DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+              tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4);
+    DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4);
+    out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, reg3, filter0, filter1,
+                               filter2, filter3);
+    out1 = filt_8tap_dpadd_s_h(reg1, reg2, reg3, reg4, filter0, filter1,
+                               filter2, filter3);
+    out0 = __lsx_vssrarni_b_h(out1, out0, 7);
+    out0 = __lsx_vxori_b(out0, 128);
+    __lsx_vstelm_w(out0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 2);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 3);
+    dst += dst_stride;
+
+    reg0 = reg2;
+    reg1 = reg3;
+    reg2 = reg4;
+    src6 = src10;
+  }
+}
+
+static void common_vt_8t_8w_lsx(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = height >> 2;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i out0, out1, out2, out3;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+  src = src - src_stride3;
+
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride3);
+  src += src_stride4;
+  src4 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6);
+  src += src_stride3;
+
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+  src6 = __lsx_vxori_b(src6, 128);
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, reg0,
+            reg1, reg2, reg3);
+  DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
+
+  for (; loop_cnt--;) {
+    src7 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src8, src9);
+    src10 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+    DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+              src8, src9, src10);
+    DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+              tmp0, tmp1, tmp2, tmp3);
+    out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, tmp0, filter0, filter1,
+                               filter2, filter3);
+    out1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, tmp1, filter0, filter1,
+                               filter2, filter3);
+    out2 = filt_8tap_dpadd_s_h(reg1, reg2, tmp0, tmp2, filter0, filter1,
+                               filter2, filter3);
+    out3 = filt_8tap_dpadd_s_h(reg4, reg5, tmp1, tmp3, filter0, filter1,
+                               filter2, filter3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(out0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(out1, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(out1, dst, 0, 1);
+    dst += dst_stride;
+
+    reg0 = reg2;
+    reg1 = tmp0;
+    reg2 = tmp2;
+    reg3 = reg5;
+    reg4 = tmp1;
+    reg5 = tmp3;
+    src6 = src10;
+  }
+}
+
+static void common_vt_8t_16w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = height >> 2;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5;
+  __m128i reg6, reg7, reg8, reg9, reg10, reg11;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+  // uint8_t *_src = (uint8_t *)src - src_stride3;
+  src -= src_stride3;
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+  src += src_stride4;
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6);
+  src += src_stride3;
+
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+  src6 = __lsx_vxori_b(src6, 128);
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, reg0,
+            reg1, reg2, reg3);
+  DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
+  DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1, reg6,
+            reg7, reg8, reg9);
+  DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11);
+
+  for (; loop_cnt--;) {
+    src7 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src8, src9);
+    src10 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+              src8, src9, src10);
+    DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+              src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9,
+              src4, src5, src7, src8);
+    tmp0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, src0, filter0, filter1,
+                               filter2, filter3);
+    tmp1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, src1, filter0, filter1,
+                               filter2, filter3);
+    tmp2 = filt_8tap_dpadd_s_h(reg6, reg7, reg8, src4, filter0, filter1,
+                               filter2, filter3);
+    tmp3 = filt_8tap_dpadd_s_h(reg9, reg10, reg11, src5, filter0, filter1,
+                               filter2, filter3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+    __lsx_vst(tmp0, dst, 0);
+    dst += dst_stride;
+    __lsx_vst(tmp1, dst, 0);
+    dst += dst_stride;
+    tmp0 = filt_8tap_dpadd_s_h(reg1, reg2, src0, src2, filter0, filter1,
+                               filter2, filter3);
+    tmp1 = filt_8tap_dpadd_s_h(reg4, reg5, src1, src3, filter0, filter1,
+                               filter2, filter3);
+    tmp2 = filt_8tap_dpadd_s_h(reg7, reg8, src4, src7, filter0, filter1,
+                               filter2, filter3);
+    tmp3 = filt_8tap_dpadd_s_h(reg10, reg11, src5, src8, filter0, filter1,
+                               filter2, filter3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+    __lsx_vst(tmp0, dst, 0);
+    dst += dst_stride;
+    __lsx_vst(tmp1, dst, 0);
+    dst += dst_stride;
+
+    reg0 = reg2;
+    reg1 = src0;
+    reg2 = src2;
+    reg3 = reg5;
+    reg4 = src1;
+    reg5 = src3;
+    reg6 = reg8;
+    reg7 = src4;
+    reg8 = src7;
+    reg9 = reg11;
+    reg10 = src5;
+    reg11 = src8;
+    src6 = src10;
+  }
+}
+
+static void common_vt_8t_16w_mult_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter, int32_t height,
+                                      int32_t width) {
+  uint8_t *src_tmp;
+  uint8_t *dst_tmp;
+  uint32_t cnt = width >> 4;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5;
+  __m128i reg6, reg7, reg8, reg9, reg10, reg11;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+  src -= src_stride3;
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  for (; cnt--;) {
+    uint32_t loop_cnt = height >> 2;
+
+    src_tmp = src;
+    dst_tmp = dst;
+
+    src0 = __lsx_vld(src_tmp, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src1,
+              src2);
+    src3 = __lsx_vldx(src_tmp, src_stride3);
+    src_tmp += src_stride4;
+    src4 = __lsx_vld(src_tmp, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5,
+              src6);
+    src_tmp += src_stride3;
+
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+    src6 = __lsx_vxori_b(src6, 128);
+    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
+              reg0, reg1, reg2, reg3);
+    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
+    DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
+              reg6, reg7, reg8, reg9);
+    DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11);
+
+    for (; loop_cnt--;) {
+      src7 = __lsx_vld(src_tmp, 0);
+      DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src8,
+                src9);
+      src10 = __lsx_vldx(src_tmp, src_stride3);
+      src_tmp += src_stride4;
+      DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
+                src7, src8, src9, src10);
+      DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+                src0, src1, src2, src3);
+      DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9,
+                src4, src5, src7, src8);
+      tmp0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, src0, filter0, filter1,
+                                 filter2, filter3);
+      tmp1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, src1, filter0, filter1,
+                                 filter2, filter3);
+      tmp2 = filt_8tap_dpadd_s_h(reg6, reg7, reg8, src4, filter0, filter1,
+                                 filter2, filter3);
+      tmp3 = filt_8tap_dpadd_s_h(reg9, reg10, reg11, src5, filter0, filter1,
+                                 filter2, filter3);
+      DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
+      DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+      __lsx_vst(tmp0, dst_tmp, 0);
+      __lsx_vstx(tmp1, dst_tmp, dst_stride);
+      tmp0 = filt_8tap_dpadd_s_h(reg1, reg2, src0, src2, filter0, filter1,
+                                 filter2, filter3);
+      tmp1 = filt_8tap_dpadd_s_h(reg4, reg5, src1, src3, filter0, filter1,
+                                 filter2, filter3);
+      tmp2 = filt_8tap_dpadd_s_h(reg7, reg8, src4, src7, filter0, filter1,
+                                 filter2, filter3);
+      tmp3 = filt_8tap_dpadd_s_h(reg10, reg11, src5, src8, filter0, filter1,
+                                 filter2, filter3);
+      DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
+      DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+      __lsx_vstx(tmp0, dst_tmp, dst_stride2);
+      __lsx_vstx(tmp1, dst_tmp, dst_stride3);
+      dst_tmp += dst_stride4;
+
+      reg0 = reg2;
+      reg1 = src0;
+      reg2 = src2;
+      reg3 = reg5;
+      reg4 = src1;
+      reg5 = src3;
+      reg6 = reg8;
+      reg7 = src4;
+      reg8 = src7;
+      reg9 = reg11;
+      reg10 = src5;
+      reg11 = src8;
+      src6 = src10;
+    }
+    src += 16;
+    dst += 16;
+  }
+}
+
+static void common_vt_8t_32w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  common_vt_8t_16w_mult_lsx(src, src_stride, dst, dst_stride, filter, height,
+                            32);
+}
+
+static void common_vt_8t_64w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  common_vt_8t_16w_mult_lsx(src, src_stride, dst, dst_stride, filter, height,
+                            64);
+}
+
+static void common_vt_2t_4x4_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  __m128i src0, src1, src2, src3, src4;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5;
+  __m128i filt0, tmp0, tmp1;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+  src += (src_stride4 + src_stride);
+
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, vec0,
+            vec1, vec2, vec3);
+  DUP2_ARG2(__lsx_vilvl_d, vec1, vec0, vec3, vec2, vec4, vec5);
+  DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+  tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+  __lsx_vstelm_w(tmp0, dst, 0, 0);
+  __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1);
+  __lsx_vstelm_w(tmp0, dst + dst_stride2, 0, 2);
+  __lsx_vstelm_w(tmp0, dst + dst_stride3, 0, 3);
+}
+
+static void common_vt_2t_4x8_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5;
+  __m128i vec6, vec7, vec8, vec9, vec10, vec11;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  __m128i filt0;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+  uint8_t *dst_tmp1 = dst + dst_stride4;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+  src += src_stride4;
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src5, src6, src7, src8);
+  src += (src_stride4 + src_stride);
+
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, vec0,
+            vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, vec4,
+            vec5, vec6, vec7);
+  DUP4_ARG2(__lsx_vilvl_d, vec1, vec0, vec3, vec2, vec5, vec4, vec7, vec6, vec8,
+            vec9, vec10, vec11);
+
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec8, filt0, vec9, filt0, vec10, filt0, vec11,
+            filt0, tmp0, tmp1, tmp2, tmp3);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+            FILTER_BITS, tmp0, tmp1);
+
+  __lsx_vstelm_w(tmp0, dst, 0, 0);
+  __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1);
+  __lsx_vstelm_w(tmp0, dst + dst_stride2, 0, 2);
+  __lsx_vstelm_w(tmp0, dst + dst_stride3, 0, 3);
+
+  __lsx_vstelm_w(tmp1, dst_tmp1, 0, 0);
+  __lsx_vstelm_w(tmp1, dst_tmp1 + dst_stride, 0, 1);
+  __lsx_vstelm_w(tmp1, dst_tmp1 + dst_stride2, 0, 2);
+  __lsx_vstelm_w(tmp1, dst_tmp1 + dst_stride3, 0, 3);
+}
+
+static void common_vt_2t_4w_lsx(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  if (height == 4) {
+    common_vt_2t_4x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else if (height == 8) {
+    common_vt_2t_4x8_lsx(src, src_stride, dst, dst_stride, filter);
+  }
+}
+
+static void common_vt_2t_8x4_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  __m128i src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
+  __m128i out0, out1, tmp0, tmp1, tmp2, tmp3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, vec0,
+            vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+            tmp0, tmp1, tmp2, tmp3);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+            FILTER_BITS, out0, out1);
+
+  __lsx_vstelm_d(out0, dst, 0, 0);
+  __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+  __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0);
+  __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1);
+}
+
+static void common_vt_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 3);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  __m128i out0, out1, tmp0, tmp1, tmp2, tmp3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+  src0 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  for (; loop_cnt--;) {
+    src1 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+    src4 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+    src5 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src6, src7)
+    src8 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
+              vec0, vec1, vec2, vec3);
+    DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
+              vec4, vec5, vec6, vec7);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+              FILTER_BITS, out0, out1);
+
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0);
+    __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1);
+    dst += dst_stride4;
+
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+              filt0, tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+              FILTER_BITS, out0, out1);
+
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0);
+    __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1);
+    dst += dst_stride4;
+
+    src0 = src8;
+  }
+}
+
+static void common_vt_2t_8w_lsx(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  if (height == 4) {
+    common_vt_2t_8x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else {
+    common_vt_2t_8x8mult_lsx(src, src_stride, dst, dst_stride, filter, height);
+  }
+}
+
+static void common_vt_2t_16w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, src4, tmp, tmp0, tmp1;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  for (; loop_cnt--;) {
+    src1 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+    src4 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+    dst += dst_stride;
+
+    DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+    dst += dst_stride;
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+    dst += dst_stride;
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+    dst += dst_stride;
+
+    src0 = src4;
+  }
+}
+
+static void common_vt_2t_32w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  __m128i tmp, tmp0, tmp1;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  uint8_t *src_tmp;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src5);
+  src += src_stride;
+  src_tmp = src + 16;
+
+  for (; loop_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src_tmp, 0, src1, src6);
+    DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src,
+              src_stride2, src_tmp, src_stride2, src2, src7, src3, src8);
+    DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, src4, src9);
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+    src += src_stride4;
+    src_tmp += src_stride4;
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vstx(tmp, dst, dst_stride);
+
+    DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vstx(tmp, dst, dst_stride2);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vstx(tmp, dst, dst_stride3);
+
+    DUP2_ARG2(__lsx_vilvl_b, src6, src5, src7, src6, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src6, src5, src7, src6, vec1, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 16);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    dst += dst_stride;
+    __lsx_vst(tmp, dst, 16);
+
+    DUP2_ARG2(__lsx_vilvl_b, src8, src7, src9, src8, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src8, src7, src9, src8, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    dst += dst_stride;
+    __lsx_vst(tmp, dst, 16);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    dst += dst_stride;
+    __lsx_vst(tmp, dst, 16);
+
+    dst += dst_stride;
+
+    src0 = src4;
+    src5 = src9;
+  }
+}
+
+static void common_vt_2t_64w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 1);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  __m128i tmp, tmp0, tmp1;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  uint8_t *dst_tmp1 = dst + dst_stride;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src3, src6,
+            src9);
+  src += src_stride;
+
+  for (; loop_cnt--;) {
+    uint8_t *src_tmp0 = src + src_stride;
+
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src1, src4, src7,
+              src10);
+    DUP4_ARG2(__lsx_vld, src_tmp0, 0, src_tmp0, 16, src_tmp0, 32, src_tmp0, 48,
+              src2, src5, src8, src11);
+    src += src_stride2;
+
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst_tmp1, 0);
+
+    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src5, src4, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src4, src3, src5, src4, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 16);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst_tmp1, 16);
+
+    DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, vec1, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 32);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst_tmp1, 32);
+
+    DUP2_ARG2(__lsx_vilvl_b, src10, src9, src11, src10, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src10, src9, src11, src10, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 48);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst_tmp1, 48);
+    dst += dst_stride2;
+    dst_tmp1 += dst_stride2;
+
+    src0 = src2;
+    src3 = src5;
+    src6 = src8;
+    src9 = src11;
+  }
+}
+
+void vpx_convolve8_vert_lsx(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const InterpKernel *filter, int x0_q4,
+                            int32_t x_step_q4, int y0_q4, int y_step_q4, int w,
+                            int h) {
+  const int16_t *const filter_y = filter[y0_q4];
+  int8_t cnt, filt_ver[8];
+
+  assert(y_step_q4 == 16);
+  assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+  for (cnt = 8; cnt--;) {
+    filt_ver[cnt] = filter_y[cnt];
+  }
+
+  if (vpx_get_filter_taps(filter_y) == 2) {
+    switch (w) {
+      case 4:
+        common_vt_2t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                            &filt_ver[3], h);
+        break;
+      case 8:
+        common_vt_2t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                            &filt_ver[3], h);
+        break;
+      case 16:
+        common_vt_2t_16w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             &filt_ver[3], h);
+        break;
+      case 32:
+        common_vt_2t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             &filt_ver[3], h);
+        break;
+      case 64:
+        common_vt_2t_64w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             &filt_ver[3], h);
+        break;
+      default:
+        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                             x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  } else {
+    switch (w) {
+      case 4:
+        common_vt_8t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                            filt_ver, h);
+        break;
+      case 8:
+        common_vt_8t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                            filt_ver, h);
+        break;
+      case 16:
+        common_vt_8t_16w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             filt_ver, h);
+        break;
+      case 32:
+        common_vt_8t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             filt_ver, h);
+        break;
+      case 64:
+        common_vt_8t_64w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             filt_ver, h);
+        break;
+      default:
+        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                             x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  }
+}
diff --git a/vpx_dsp/loongarch/vpx_convolve_avg_lsx.c b/vpx_dsp/loongarch/vpx_convolve_avg_lsx.c
new file mode 100644
index 0000000000..1dad29eeed
--- /dev/null
+++ b/vpx_dsp/loongarch/vpx_convolve_avg_lsx.c
@@ -0,0 +1,321 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+static void avg_width4_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst,
+                           int32_t dst_stride, int32_t height) {
+  int32_t cnt;
+  __m128i src0, src1;
+  __m128i dst0, dst1;
+
+  int32_t src_stride2 = src_stride << 1;
+
+  if ((height % 2) == 0) {
+    for (cnt = (height / 2); cnt--;) {
+      src0 = __lsx_vld(src, 0);
+      src1 = __lsx_vldx(src, src_stride);
+      src += src_stride2;
+
+      dst0 = __lsx_vld(dst, 0);
+      dst1 = __lsx_vldx(dst, dst_stride);
+      DUP2_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, dst0, dst1);
+
+      __lsx_vstelm_w(dst0, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_w(dst1, dst, 0, 0);
+      dst += dst_stride;
+    }
+  }
+}
+
+static void avg_width8_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst,
+                           int32_t dst_stride, int32_t height) {
+  int32_t cnt = (height / 4);
+  __m128i src0, src1, src2, src3;
+  __m128i dst0, dst1, dst2, dst3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+  for (; cnt--;) {
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+    src3 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    dst0 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2);
+    dst3 = __lsx_vldx(dst, dst_stride3);
+
+    DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+              dst0, dst1, dst2, dst3);
+
+    __lsx_vstelm_d(dst0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(dst1, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(dst2, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(dst3, dst, 0, 0);
+    dst += dst_stride;
+  }
+}
+
+static void avg_width16_lsx(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt = (height / 8);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+
+  for (; cnt--;) {
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+    src3 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+    src4 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6);
+    src7 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    dst0 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2);
+    dst3 = __lsx_vldx(dst, dst_stride3);
+    dst += dst_stride4;
+    dst4 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst5, dst6);
+    dst7 = __lsx_vldx(dst, dst_stride3);
+    dst -= dst_stride4;
+
+    DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+              dst0, dst1, dst2, dst3);
+    DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+              dst4, dst5, dst6, dst7);
+
+    __lsx_vst(dst0, dst, 0);
+    __lsx_vstx(dst1, dst, dst_stride);
+    __lsx_vstx(dst2, dst, dst_stride2);
+    __lsx_vstx(dst3, dst, dst_stride3);
+    dst += dst_stride4;
+    __lsx_vst(dst4, dst, 0);
+    __lsx_vstx(dst5, dst, dst_stride);
+    __lsx_vstx(dst6, dst, dst_stride2);
+    __lsx_vstx(dst7, dst, dst_stride3);
+    dst += dst_stride4;
+  }
+}
+
+static void avg_width32_lsx(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt = (height / 8);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i src8, src9, src10, src11, src12, src13, src14, src15;
+  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  __m128i dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+
+  for (; cnt--;) {
+    uint8_t *dst_tmp = dst;
+    uint8_t *dst_tmp1 = dst_tmp + 16;
+    uint8_t *src_tmp = src + 16;
+
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vld, src, 0, src_tmp, 0, src0, src1);
+    DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src,
+              src_stride2, src_tmp, src_stride2, src2, src3, src4, src5);
+    DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, src6, src7);
+    src += src_stride4;
+
+    DUP2_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp1, 0, dst0, dst1);
+    DUP4_ARG2(__lsx_vldx, dst_tmp, dst_stride, dst_tmp1, dst_stride, dst_tmp,
+              dst_stride2, dst_tmp1, dst_stride2, dst2, dst3, dst4, dst5);
+    DUP2_ARG2(__lsx_vldx, dst_tmp, dst_stride3, dst_tmp1, dst_stride3, dst6,
+              dst7);
+    dst_tmp += dst_stride4;
+    dst_tmp1 += dst_stride4;
+
+    src_tmp = src + 16;
+    DUP2_ARG2(__lsx_vld, src, 0, src_tmp, 0, src8, src9);
+    DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src,
+              src_stride2, src_tmp, src_stride2, src10, src11, src12, src13);
+    DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, src14, src15);
+    src += src_stride4;
+
+    DUP2_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp1, 0, dst8, dst9);
+    DUP4_ARG2(__lsx_vldx, dst_tmp, dst_stride, dst_tmp1, dst_stride, dst_tmp,
+              dst_stride2, dst_tmp1, dst_stride2, dst10, dst11, dst12, dst13);
+    DUP2_ARG2(__lsx_vldx, dst_tmp, dst_stride3, dst_tmp1, dst_stride3, dst14,
+              dst15);
+    DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+              dst0, dst1, dst2, dst3);
+    DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+              dst4, dst5, dst6, dst7);
+    DUP4_ARG2(__lsx_vavgr_bu, src8, dst8, src9, dst9, src10, dst10, src11,
+              dst11, dst8, dst9, dst10, dst11);
+    DUP4_ARG2(__lsx_vavgr_bu, src12, dst12, src13, dst13, src14, dst14, src15,
+              dst15, dst12, dst13, dst14, dst15);
+
+    dst_tmp = dst + 16;
+    __lsx_vst(dst0, dst, 0);
+    __lsx_vstx(dst2, dst, dst_stride);
+    __lsx_vstx(dst4, dst, dst_stride2);
+    __lsx_vstx(dst6, dst, dst_stride3);
+    __lsx_vst(dst1, dst_tmp, 0);
+    __lsx_vstx(dst3, dst_tmp, dst_stride);
+    __lsx_vstx(dst5, dst_tmp, dst_stride2);
+    __lsx_vstx(dst7, dst_tmp, dst_stride3);
+    dst += dst_stride4;
+
+    __lsx_vst(dst8, dst, 0);
+    __lsx_vstx(dst10, dst, dst_stride);
+    __lsx_vstx(dst12, dst, dst_stride2);
+    __lsx_vstx(dst14, dst, dst_stride3);
+    __lsx_vst(dst9, dst_tmp1, 0);
+    __lsx_vstx(dst11, dst_tmp1, dst_stride);
+    __lsx_vstx(dst13, dst_tmp1, dst_stride2);
+    __lsx_vstx(dst15, dst_tmp1, dst_stride3);
+    dst += dst_stride4;
+  }
+}
+
+static void avg_width64_lsx(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt = (height / 4);
+  uint8_t *dst_tmp = dst;
+
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i src8, src9, src10, src11, src12, src13, src14, src15;
+  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  __m128i dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+
+  for (; cnt--;) {
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+              src3);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src4, src5, src6,
+              src7);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src8, src9, src10,
+              src11);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src12, src13, src14,
+              src15);
+    src += src_stride;
+
+    DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48,
+              dst0, dst1, dst2, dst3);
+    dst_tmp += dst_stride;
+    DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48,
+              dst4, dst5, dst6, dst7);
+    dst_tmp += dst_stride;
+    DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48,
+              dst8, dst9, dst10, dst11);
+    dst_tmp += dst_stride;
+    DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48,
+              dst12, dst13, dst14, dst15);
+    dst_tmp += dst_stride;
+
+    DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+              dst0, dst1, dst2, dst3);
+    DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+              dst4, dst5, dst6, dst7);
+    DUP4_ARG2(__lsx_vavgr_bu, src8, dst8, src9, dst9, src10, dst10, src11,
+              dst11, dst8, dst9, dst10, dst11);
+    DUP4_ARG2(__lsx_vavgr_bu, src12, dst12, src13, dst13, src14, dst14, src15,
+              dst15, dst12, dst13, dst14, dst15);
+
+    __lsx_vst(dst0, dst, 0);
+    __lsx_vst(dst1, dst, 16);
+    __lsx_vst(dst2, dst, 32);
+    __lsx_vst(dst3, dst, 48);
+    dst += dst_stride;
+    __lsx_vst(dst4, dst, 0);
+    __lsx_vst(dst5, dst, 16);
+    __lsx_vst(dst6, dst, 32);
+    __lsx_vst(dst7, dst, 48);
+    dst += dst_stride;
+    __lsx_vst(dst8, dst, 0);
+    __lsx_vst(dst9, dst, 16);
+    __lsx_vst(dst10, dst, 32);
+    __lsx_vst(dst11, dst, 48);
+    dst += dst_stride;
+    __lsx_vst(dst12, dst, 0);
+    __lsx_vst(dst13, dst, 16);
+    __lsx_vst(dst14, dst, 32);
+    __lsx_vst(dst15, dst, 48);
+    dst += dst_stride;
+  }
+}
+
+void vpx_convolve_avg_lsx(const uint8_t *src, ptrdiff_t src_stride,
+                          uint8_t *dst, ptrdiff_t dst_stride,
+                          const InterpKernel *filter, int x0_q4,
+                          int32_t x_step_q4, int y0_q4, int32_t y_step_q4,
+                          int32_t w, int32_t h) {
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+  switch (w) {
+    case 4: {
+      avg_width4_lsx(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+
+    case 8: {
+      avg_width8_lsx(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 16: {
+      avg_width16_lsx(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 32: {
+      avg_width32_lsx(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 64: {
+      avg_width64_lsx(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    default: {
+      int32_t lp, cnt;
+      for (cnt = h; cnt--;) {
+        for (lp = 0; lp < w; ++lp) {
+          dst[lp] = (((dst[lp] + src[lp]) + 1) >> 1);
+        }
+        src += src_stride;
+        dst += dst_stride;
+      }
+      break;
+    }
+  }
+}
diff --git a/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c b/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c
new file mode 100644
index 0000000000..53dc7097ed
--- /dev/null
+++ b/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c
@@ -0,0 +1,437 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+static void copy_width8_lsx(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  if ((height % 12) == 0) {
+    for (cnt = (height / 12); cnt--;) {
+      src0 = __lsx_vld(src, 0);
+      DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+                src, src_stride4, src1, src2, src3, src4);
+      src += src_stride4;
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6);
+      src += src_stride2;
+      src7 = __lsx_vldx(src, src_stride);
+      src += src_stride2;
+
+      __lsx_vstelm_d(src0, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src1, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src2, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src3, dst, 0, 0);
+      dst += dst_stride;
+
+      __lsx_vstelm_d(src4, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src5, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src6, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src7, dst, 0, 0);
+      dst += dst_stride;
+
+      src0 = __lsx_vld(src, 0);
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+      src3 = __lsx_vldx(src, src_stride3);
+      src += src_stride4;
+
+      __lsx_vstelm_d(src0, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src1, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src2, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src3, dst, 0, 0);
+      dst += dst_stride;
+    }
+  } else if ((height % 8) == 0) {
+    for (cnt = height >> 3; cnt--;) {
+      src0 = __lsx_vld(src, 0);
+      DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+                src, src_stride4, src1, src2, src3, src4);
+      src += src_stride4;
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6);
+      src += src_stride2;
+      src7 = __lsx_vldx(src, src_stride);
+      src += src_stride2;
+
+      __lsx_vstelm_d(src0, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src1, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src2, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src3, dst, 0, 0);
+      dst += dst_stride;
+
+      __lsx_vstelm_d(src4, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src5, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src6, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src7, dst, 0, 0);
+      dst += dst_stride;
+    }
+  } else if ((height % 4) == 0) {
+    for (cnt = (height / 4); cnt--;) {
+      src0 = __lsx_vld(src, 0);
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+      src3 = __lsx_vldx(src, src_stride3);
+      src += src_stride4;
+
+      __lsx_vstelm_d(src0, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src1, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src2, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src3, dst, 0, 0);
+      dst += dst_stride;
+    }
+  } else if ((height % 2) == 0) {
+    for (cnt = (height / 2); cnt--;) {
+      src0 = __lsx_vld(src, 0);
+      src1 = __lsx_vldx(src, src_stride);
+      src += src_stride2;
+
+      __lsx_vstelm_d(src0, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src1, dst, 0, 0);
+      dst += dst_stride;
+    }
+  }
+}
+
+static void copy_16multx8mult_lsx(const uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  int32_t height, int32_t width) {
+  int32_t cnt, loop_cnt;
+  uint8_t *src_tmp;
+  uint8_t *dst_tmp;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  for (cnt = (width >> 4); cnt--;) {
+    src_tmp = (uint8_t *)src;
+    dst_tmp = dst;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+      src0 = __lsx_vld(src_tmp, 0);
+      DUP4_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src_tmp,
+                src_stride3, src_tmp, src_stride4, src1, src2, src3, src4);
+      src_tmp += src_stride4;
+      DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5,
+                src6);
+      src_tmp += src_stride2;
+      src7 = __lsx_vldx(src_tmp, src_stride);
+      src_tmp += src_stride2;
+
+      __lsx_vst(src0, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src1, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src2, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src3, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src4, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src5, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src6, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src7, dst_tmp, 0);
+      dst_tmp += dst_stride;
+    }
+    src += 16;
+    dst += 16;
+  }
+}
+
+static void copy_width16_lsx(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  if ((height % 12) == 0) {
+    for (cnt = (height / 12); cnt--;) {
+      src0 = __lsx_vld(src, 0);
+      DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+                src, src_stride4, src1, src2, src3, src4);
+      src += src_stride4;
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6);
+      src += src_stride2;
+      src7 = __lsx_vldx(src, src_stride);
+      src += src_stride2;
+
+      __lsx_vst(src0, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src1, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src2, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src3, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src4, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src5, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src6, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src7, dst, 0);
+      dst += dst_stride;
+
+      src0 = __lsx_vld(src, 0);
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+      src3 = __lsx_vldx(src, src_stride3);
+      src += src_stride4;
+
+      __lsx_vst(src0, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src1, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src2, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src3, dst, 0);
+      dst += dst_stride;
+    }
+  } else if ((height % 8) == 0) {
+    copy_16multx8mult_lsx(src, src_stride, dst, dst_stride, height, 16);
+  } else if ((height % 4) == 0) {
+    for (cnt = (height >> 2); cnt--;) {
+      src0 = __lsx_vld(src, 0);
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+      src3 = __lsx_vldx(src, src_stride3);
+      src += src_stride4;
+
+      __lsx_vst(src0, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src1, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src2, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src3, dst, 0);
+      dst += dst_stride;
+    }
+  }
+}
+
+static void copy_width32_lsx(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt;
+  uint8_t *src_tmp;
+  uint8_t *dst_tmp;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  if ((height % 12) == 0) {
+    for (cnt = (height / 12); cnt--;) {
+      src0 = __lsx_vld(src, 0);
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+      src3 = __lsx_vldx(src, src_stride3);
+
+      src_tmp = (uint8_t *)src + 16;
+      src4 = __lsx_vld(src_tmp, 0);
+      DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5,
+                src6);
+      src7 = __lsx_vldx(src_tmp, src_stride3);
+      src += src_stride4;
+
+      __lsx_vst(src0, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src1, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src2, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src3, dst, 0);
+      dst += dst_stride;
+
+      dst_tmp = dst + 16;
+      __lsx_vst(src4, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src5, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src6, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src7, dst_tmp, 0);
+      dst_tmp += dst_stride;
+
+      src0 = __lsx_vld(src, 0);
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+      src3 = __lsx_vldx(src, src_stride3);
+
+      src_tmp = (uint8_t *)src + 16;
+      src4 = __lsx_vld(src_tmp, 0);
+      DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5,
+                src6);
+      src7 = __lsx_vldx(src_tmp, src_stride3);
+      src += src_stride4;
+
+      __lsx_vst(src0, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src1, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src2, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src3, dst, 0);
+      dst += dst_stride;
+
+      dst_tmp = dst + 16;
+      __lsx_vst(src4, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src5, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src6, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src7, dst_tmp, 0);
+      dst_tmp += dst_stride;
+
+      src0 = __lsx_vld(src, 0);
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+      src3 = __lsx_vldx(src, src_stride3);
+
+      src_tmp = (uint8_t *)src + 16;
+      src4 = __lsx_vld(src_tmp, 0);
+      DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5,
+                src6);
+      src7 = __lsx_vldx(src_tmp, src_stride3);
+      src += src_stride4;
+
+      __lsx_vst(src0, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src1, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src2, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src3, dst, 0);
+      dst += dst_stride;
+
+      dst_tmp = dst + 16;
+      __lsx_vst(src4, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src5, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src6, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src7, dst_tmp, 0);
+      dst_tmp += dst_stride;
+    }
+  } else if ((height % 8) == 0) {
+    copy_16multx8mult_lsx(src, src_stride, dst, dst_stride, height, 32);
+  } else if ((height % 4) == 0) {
+    for (cnt = (height >> 2); cnt--;) {
+      src0 = __lsx_vld(src, 0);
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+      src3 = __lsx_vldx(src, src_stride3);
+
+      src_tmp = (uint8_t *)src + 16;
+      src4 = __lsx_vld(src_tmp, 0);
+      DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5,
+                src6);
+      src7 = __lsx_vldx(src_tmp, src_stride3);
+      src += src_stride4;
+
+      __lsx_vst(src0, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src1, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src2, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src3, dst, 0);
+      dst += dst_stride;
+
+      dst_tmp = dst + 16;
+      __lsx_vst(src4, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src5, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src6, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src7, dst_tmp, 0);
+      dst_tmp += dst_stride;
+    }
+  }
+}
+
+static void copy_width64_lsx(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride, int32_t height) {
+  copy_16multx8mult_lsx(src, src_stride, dst, dst_stride, height, 64);
+}
+
+void vpx_convolve_copy_lsx(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const InterpKernel *filter, int x0_q4,
+                           int32_t x_step_q4, int y0_q4, int32_t y_step_q4,
+                           int32_t w, int32_t h) {
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  switch (w) {
+    case 4: {
+      uint32_t cnt;
+      __m128i tmp;
+      for (cnt = h; cnt--;) {
+        tmp = __lsx_vldrepl_w(src, 0);
+        __lsx_vstelm_w(tmp, dst, 0, 0);
+        src += src_stride;
+        dst += dst_stride;
+      }
+      break;
+    }
+    case 8: {
+      copy_width8_lsx(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 16: {
+      copy_width16_lsx(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 32: {
+      copy_width32_lsx(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 64: {
+      copy_width64_lsx(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    default: {
+      uint32_t cnt;
+      for (cnt = h; cnt--;) {
+        memcpy(dst, src, w);
+        src += src_stride;
+        dst += dst_stride;
+      }
+      break;
+    }
+  }
+}
diff --git a/vpx_dsp/loongarch/vpx_convolve_lsx.h b/vpx_dsp/loongarch/vpx_convolve_lsx.h
new file mode 100644
index 0000000000..d886b00198
--- /dev/null
+++ b/vpx_dsp/loongarch/vpx_convolve_lsx.h
@@ -0,0 +1,138 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_
+#define VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_
+
+#include "./vpx_config.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+static INLINE __m128i filt_8tap_dpadd_s_h(__m128i _reg0, __m128i _reg1,
+                                          __m128i _reg2, __m128i _reg3,
+                                          __m128i _filter0, __m128i _filter1,
+                                          __m128i _filter2, __m128i _filter3) {
+  __m128i _vec0, _vec1;
+
+  _vec0 = __lsx_vdp2_h_b(_reg0, _filter0);
+  _vec0 = __lsx_vdp2add_h_b(_vec0, _reg1, _filter1);
+  _vec1 = __lsx_vdp2_h_b(_reg2, _filter2);
+  _vec1 = __lsx_vdp2add_h_b(_vec1, _reg3, _filter3);
+  return __lsx_vsadd_h(_vec0, _vec1);
+}
+
+static INLINE __m128i horiz_8tap_filt(__m128i _src0, __m128i _src1,
+                                      __m128i _mask0, __m128i _mask1,
+                                      __m128i _mask2, __m128i _mask3,
+                                      __m128i _filt_h0, __m128i _filt_h1,
+                                      __m128i _filt_h2, __m128i _filt_h3) {
+  __m128i _tmp0, _tmp1, _tmp2, _tmp3;
+  __m128i _out;
+
+  DUP4_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src1, _src0, _mask1, _src1,
+            _src0, _mask2, _src1, _src0, _mask3, _tmp0, _tmp1, _tmp2, _tmp3);
+  _out = filt_8tap_dpadd_s_h(_tmp0, _tmp1, _tmp2, _tmp3, _filt_h0, _filt_h1,
+                             _filt_h2, _filt_h3);
+  _out = __lsx_vsrari_h(_out, FILTER_BITS);
+  return __lsx_vsat_h(_out, 7);
+}
+
+static INLINE __m128i horiz_2tap_filt_uh(__m128i in0, __m128i in1, __m128i mask,
+                                         __m128i coeff) {
+  __m128i tmp0_m, tmp1_m;
+
+  tmp0_m = __lsx_vshuf_b(in1, in0, mask);
+  tmp1_m = __lsx_vdp2_h_bu(tmp0_m, coeff);
+  return __lsx_vsrari_h(tmp1_m, FILTER_BITS);
+}
+
+#define LSX_LD_4(_src, _stride, _src0, _src1, _src2, _src3) \
+  do {                                                      \
+    _src0 = __lsx_vld(_src, 0);                             \
+    _src += _stride;                                        \
+    _src1 = __lsx_vld(_src, 0);                             \
+    _src += _stride;                                        \
+    _src2 = __lsx_vld(_src, 0);                             \
+    _src += _stride;                                        \
+    _src3 = __lsx_vld(_src, 0);                             \
+  } while (0)
+
+#define HORIZ_8TAP_4WID_4VECS_FILT(_src0, _src1, _src2, _src3, _mask0, _mask1, \
+                                   _mask2, _mask3, _filter0, _filter1,         \
+                                   _filter2, _filter3, _out0, _out1)           \
+  do {                                                                         \
+    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7;            \
+    __m128i _reg0, _reg1, _reg2, _reg3;                                        \
+                                                                               \
+    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src3, _src2, _mask0,       \
+              _tmp0, _tmp1);                                                   \
+    DUP2_ARG2(__lsx_vdp2_h_b, _tmp0, _filter0, _tmp1, _filter0, _reg0, _reg1); \
+    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask1, _src3, _src2, _mask1,       \
+              _tmp2, _tmp3);                                                   \
+    DUP2_ARG3(__lsx_vdp2add_h_b, _reg0, _tmp2, _filter1, _reg1, _tmp3,         \
+              _filter1, _reg0, _reg1);                                         \
+    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask2, _src3, _src2, _mask2,       \
+              _tmp4, _tmp5);                                                   \
+    DUP2_ARG2(__lsx_vdp2_h_b, _tmp4, _filter2, _tmp5, _filter2, _reg2, _reg3); \
+    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask3, _src3, _src2, _mask3,       \
+              _tmp6, _tmp7);                                                   \
+    DUP2_ARG3(__lsx_vdp2add_h_b, _reg2, _tmp6, _filter3, _reg3, _tmp7,         \
+              _filter3, _reg2, _reg3);                                         \
+    DUP2_ARG2(__lsx_vsadd_h, _reg0, _reg2, _reg1, _reg3, _out0, _out1);        \
+  } while (0)
+
+#define HORIZ_8TAP_8WID_4VECS_FILT(                                            \
+    _src0, _src1, _src2, _src3, _mask0, _mask1, _mask2, _mask3, _filter0,      \
+    _filter1, _filter2, _filter3, _out0, _out1, _out2, _out3)                  \
+  do {                                                                         \
+    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7;            \
+    __m128i _reg0, _reg1, _reg2, _reg3, _reg4, _reg5, _reg6, _reg7;            \
+                                                                               \
+    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask0, _src1, _src1, _mask0,       \
+              _src2, _src2, _mask0, _src3, _src3, _mask0, _tmp0, _tmp1, _tmp2, \
+              _tmp3);                                                          \
+    DUP4_ARG2(__lsx_vdp2_h_b, _tmp0, _filter0, _tmp1, _filter0, _tmp2,         \
+              _filter0, _tmp3, _filter0, _reg0, _reg1, _reg2, _reg3);          \
+    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask2, _src1, _src1, _mask2,       \
+              _src2, _src2, _mask2, _src3, _src3, _mask2, _tmp0, _tmp1, _tmp2, \
+              _tmp3);                                                          \
+    DUP4_ARG2(__lsx_vdp2_h_b, _tmp0, _filter2, _tmp1, _filter2, _tmp2,         \
+              _filter2, _tmp3, _filter2, _reg4, _reg5, _reg6, _reg7);          \
+    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask1, _src1, _src1, _mask1,       \
+              _src2, _src2, _mask1, _src3, _src3, _mask1, _tmp4, _tmp5, _tmp6, \
+              _tmp7);                                                          \
+    DUP4_ARG3(__lsx_vdp2add_h_b, _reg0, _tmp4, _filter1, _reg1, _tmp5,         \
+              _filter1, _reg2, _tmp6, _filter1, _reg3, _tmp7, _filter1, _reg0, \
+              _reg1, _reg2, _reg3);                                            \
+    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask3, _src1, _src1, _mask3,       \
+              _src2, _src2, _mask3, _src3, _src3, _mask3, _tmp4, _tmp5, _tmp6, \
+              _tmp7);                                                          \
+    DUP4_ARG3(__lsx_vdp2add_h_b, _reg4, _tmp4, _filter3, _reg5, _tmp5,         \
+              _filter3, _reg6, _tmp6, _filter3, _reg7, _tmp7, _filter3, _reg4, \
+              _reg5, _reg6, _reg7);                                            \
+    DUP4_ARG2(__lsx_vsadd_h, _reg0, _reg4, _reg1, _reg5, _reg2, _reg6, _reg3,  \
+              _reg7, _out0, _out1, _out2, _out3);                              \
+  } while (0)
+
+#define AVG_ST4_D(in0, in1, dst0, dst1, pdst, stride)                \
+  do {                                                               \
+    __m128i tmp0_m, tmp1_m;                                          \
+                                                                     \
+    DUP2_ARG2(__lsx_vavgr_bu, in0, dst0, in1, dst1, tmp0_m, tmp1_m); \
+    __lsx_vstelm_d(tmp0_m, pdst, 0, 0);                              \
+    pdst += stride;                                                  \
+    __lsx_vstelm_d(tmp0_m, pdst, 0, 1);                              \
+    pdst += stride;                                                  \
+    __lsx_vstelm_d(tmp1_m, pdst, 0, 0);                              \
+    pdst += stride;                                                  \
+    __lsx_vstelm_d(tmp1_m, pdst, 0, 1);                              \
+  } while (0)
+
+#endif  // VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_
diff --git a/vpx_dsp/loopfilter.c b/vpx_dsp/loopfilter.c
index 9956028317..d6504aab1f 100644
--- a/vpx_dsp/loopfilter.c
+++ b/vpx_dsp/loopfilter.c
@@ -159,7 +159,7 @@ void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
   vpx_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);
 }
 
-static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat,
+static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat,
                            uint8_t *op3, uint8_t *op2, uint8_t *op1,
                            uint8_t *op0, uint8_t *oq0, uint8_t *oq1,
                            uint8_t *oq2, uint8_t *oq3) {
@@ -232,8 +232,8 @@ void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
   vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);
 }
 
-static INLINE void filter16(int8_t mask, uint8_t thresh, uint8_t flat,
-                            uint8_t flat2, uint8_t *op7, uint8_t *op6,
+static INLINE void filter16(int8_t mask, uint8_t thresh, int8_t flat,
+                            int8_t flat2, uint8_t *op7, uint8_t *op6,
                             uint8_t *op5, uint8_t *op4, uint8_t *op3,
                             uint8_t *op2, uint8_t *op1, uint8_t *op0,
                             uint8_t *oq0, uint8_t *oq1, uint8_t *oq2,
@@ -505,7 +505,7 @@ void vpx_highbd_lpf_vertical_4_dual_c(
                               bd);
 }
 
-static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat,
+static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat,
                                   uint16_t *op3, uint16_t *op2, uint16_t *op1,
                                   uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
                                   uint16_t *oq2, uint16_t *oq3, int bd) {
@@ -584,8 +584,8 @@ void vpx_highbd_lpf_vertical_8_dual_c(
                               bd);
 }
 
-static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, uint8_t flat,
-                                   uint8_t flat2, uint16_t *op7, uint16_t *op6,
+static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, int8_t flat,
+                                   int8_t flat2, uint16_t *op7, uint16_t *op6,
                                    uint16_t *op5, uint16_t *op4, uint16_t *op3,
                                    uint16_t *op2, uint16_t *op1, uint16_t *op0,
                                    uint16_t *oq0, uint16_t *oq1, uint16_t *oq2,
diff --git a/vpx_dsp/mips/macros_msa.h b/vpx_dsp/mips/macros_msa.h
index a3a5a4dfee..53462b59f4 100644
--- a/vpx_dsp/mips/macros_msa.h
+++ b/vpx_dsp/mips/macros_msa.h
@@ -83,31 +83,33 @@
     val_lh_m;                                                    \
   })
 
-#define LW(psrc)                                                 \
-  ({                                                             \
-    const uint8_t *psrc_lw_m = (const uint8_t *)(psrc);          \
-    uint32_t val_lw_m;                                           \
-                                                                 \
-    __asm__ __volatile__("ulw  %[val_lw_m],  %[psrc_lw_m]  \n\t" \
-                                                                 \
-                         : [val_lw_m] "=r"(val_lw_m)             \
-                         : [psrc_lw_m] "m"(*psrc_lw_m));         \
-                                                                 \
-    val_lw_m;                                                    \
+#define LW(psrc)                                        \
+  ({                                                    \
+    const uint8_t *psrc_lw_m = (const uint8_t *)(psrc); \
+    uint32_t val_lw_m;                                  \
+                                                        \
+    __asm__ __volatile__(                               \
+        "lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t"         \
+        "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t"         \
+        : [val_lw_m] "=&r"(val_lw_m)                    \
+        : [psrc_lw_m] "r"(psrc_lw_m));                  \
+                                                        \
+    val_lw_m;                                           \
   })
 
 #if (__mips == 64)
-#define LD(psrc)                                                 \
-  ({                                                             \
-    const uint8_t *psrc_ld_m = (const uint8_t *)(psrc);          \
-    uint64_t val_ld_m = 0;                                       \
-                                                                 \
-    __asm__ __volatile__("uld  %[val_ld_m],  %[psrc_ld_m]  \n\t" \
-                                                                 \
-                         : [val_ld_m] "=r"(val_ld_m)             \
-                         : [psrc_ld_m] "m"(*psrc_ld_m));         \
-                                                                 \
-    val_ld_m;                                                    \
+#define LD(psrc)                                        \
+  ({                                                    \
+    const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \
+    uint64_t val_ld_m = 0;                              \
+                                                        \
+    __asm__ __volatile__(                               \
+        "ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t"         \
+        "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t"         \
+        : [val_ld_m] "=&r"(val_ld_m)                    \
+        : [psrc_ld_m] "r"(psrc_ld_m));                  \
+                                                        \
+    val_ld_m;                                           \
   })
 #else  // !(__mips == 64)
 #define LD(psrc)                                                  \
@@ -772,16 +774,16 @@
    Details     : 4 signed word elements of 'in' vector are added together and
                  the resulting integer sum is returned
 */
-#define HADD_SW_S32(in)                            \
-  ({                                               \
-    v2i64 res0_m, res1_m;                          \
-    int32_t sum_m;                                 \
-                                                   \
-    res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \
-    res1_m = __msa_splati_d(res0_m, 1);            \
-    res0_m = res0_m + res1_m;                      \
-    sum_m = __msa_copy_s_w((v4i32)res0_m, 0);      \
-    sum_m;                                         \
+#define HADD_SW_S32(in)                                               \
+  ({                                                                  \
+    v2i64 hadd_sw_s32_res0_m, hadd_sw_s32_res1_m;                     \
+    int32_t hadd_sw_s32_sum_m;                                        \
+                                                                      \
+    hadd_sw_s32_res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in);        \
+    hadd_sw_s32_res1_m = __msa_splati_d(hadd_sw_s32_res0_m, 1);       \
+    hadd_sw_s32_res0_m = hadd_sw_s32_res0_m + hadd_sw_s32_res1_m;     \
+    hadd_sw_s32_sum_m = __msa_copy_s_w((v4i32)hadd_sw_s32_res0_m, 0); \
+    hadd_sw_s32_sum_m;                                                \
   })
 
 /* Description : Horizontal addition of 4 unsigned word elements
@@ -791,16 +793,16 @@
    Details     : 4 unsigned word elements of 'in' vector are added together and
                  the resulting integer sum is returned
 */
-#define HADD_UW_U32(in)                               \
-  ({                                                  \
-    v2u64 res0_m, res1_m;                             \
-    uint32_t sum_m;                                   \
-                                                      \
-    res0_m = __msa_hadd_u_d((v4u32)in, (v4u32)in);    \
-    res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \
-    res0_m += res1_m;                                 \
-    sum_m = __msa_copy_u_w((v4i32)res0_m, 0);         \
-    sum_m;                                            \
+#define HADD_UW_U32(in)                                                       \
+  ({                                                                          \
+    v2u64 hadd_uw_u32_res0_m, hadd_uw_u32_res1_m;                             \
+    uint32_t hadd_uw_u32_sum_m;                                               \
+                                                                              \
+    hadd_uw_u32_res0_m = __msa_hadd_u_d((v4u32)in, (v4u32)in);                \
+    hadd_uw_u32_res1_m = (v2u64)__msa_splati_d((v2i64)hadd_uw_u32_res0_m, 1); \
+    hadd_uw_u32_res0_m += hadd_uw_u32_res1_m;                                 \
+    hadd_uw_u32_sum_m = __msa_copy_u_w((v4i32)hadd_uw_u32_res0_m, 0);         \
+    hadd_uw_u32_sum_m;                                                        \
   })
 
 /* Description : Horizontal addition of 8 unsigned halfword elements
@@ -810,14 +812,14 @@
    Details     : 8 unsigned halfword elements of 'in' vector are added
                  together and the resulting integer sum is returned
 */
-#define HADD_UH_U32(in)                           \
-  ({                                              \
-    v4u32 res_m;                                  \
-    uint32_t sum_m;                               \
-                                                  \
-    res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \
-    sum_m = HADD_UW_U32(res_m);                   \
-    sum_m;                                        \
+#define HADD_UH_U32(in)                                       \
+  ({                                                          \
+    v4u32 hadd_uh_u32_res_m;                                  \
+    uint32_t hadd_uh_u32_sum_m;                               \
+                                                              \
+    hadd_uh_u32_res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \
+    hadd_uh_u32_sum_m = HADD_UW_U32(hadd_uh_u32_res_m);       \
+    hadd_uh_u32_sum_m;                                        \
   })
 
 /* Description : Horizontal addition of unsigned byte vector elements
diff --git a/vpx_dsp/mips/sad_mmi.c b/vpx_dsp/mips/sad_mmi.c
index 4368db5fdb..7f5882bca3 100644
--- a/vpx_dsp/mips/sad_mmi.c
+++ b/vpx_dsp/mips/sad_mmi.c
@@ -334,19 +334,6 @@
   "paddw      %[ftmp3],   %[ftmp3],       %[ftmp1]            \n\t"
 #endif /* _MIPS_SIM == _ABIO32 */
 
-// depending on call sites, pass **ref_array to avoid & in subsequent call and
-// de-dup with 4D below.
-#define sadMxNxK_mmi(m, n, k)                                                 \
-  void vpx_sad##m##x##n##x##k##_mmi(const uint8_t *src, int src_stride,       \
-                                    const uint8_t *ref_array, int ref_stride, \
-                                    uint32_t *sad_array) {                    \
-    int i;                                                                    \
-    for (i = 0; i < (k); ++i)                                                 \
-      sad_array[i] =                                                          \
-          vpx_sad##m##x##n##_mmi(src, src_stride, &ref_array[i], ref_stride); \
-  }
-
-// This appears to be equivalent to the above when k == 4 and refs is const
 #define sadMxNx4D_mmi(m, n)                                                  \
   void vpx_sad##m##x##n##x4d_mmi(const uint8_t *src, int src_stride,         \
                                  const uint8_t *const ref_array[],           \
@@ -364,8 +351,9 @@ static inline unsigned int vpx_sad64x(const uint8_t *src, int src_stride,
   double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
   mips_reg l_counter = counter;
 
+  /* clang-format off */
   __asm__ volatile (
-    "xor        %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
+    "pxor       %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
     "1:                                                         \n\t"
     // Include two loop body, to reduce loop time.
     SAD_SRC_REF_ABS_SUB_64
@@ -383,6 +371,7 @@ static inline unsigned int vpx_sad64x(const uint8_t *src, int src_stride,
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
   );
+  /* clang-format on */
 
   return sad;
 }
@@ -405,9 +394,11 @@ static inline unsigned int vpx_sad_avg64x(const uint8_t *src, int src_stride,
   unsigned int sad;
   double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
   mips_reg l_counter = counter;
+  mips_reg l_second_pred = (mips_reg)second_pred;
 
+  /* clang-format off */
   __asm__ volatile (
-    "xor        %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
+    "pxor       %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
     "1:                                                         \n\t"
     // Include two loop body, to reduce loop time.
     SAD_SRC_AVGREF_ABS_SUB_64
@@ -424,11 +415,12 @@ static inline unsigned int vpx_sad_avg64x(const uint8_t *src, int src_stride,
     : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
       [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
       [src]"+&r"(src), [ref]"+&r"(ref),
-      [second_pred]"+&r"((mips_reg)second_pred),
+      [second_pred]"+&r"(l_second_pred),
       [sad]"=&r"(sad)
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
   );
+  /* clang-format on */
 
   return sad;
 }
@@ -450,8 +442,9 @@ static inline unsigned int vpx_sad32x(const uint8_t *src, int src_stride,
   double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
   mips_reg l_counter = counter;
 
+  /* clang-format off */
   __asm__ volatile (
-    "xor        %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
+    "pxor       %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
     "1:                                                         \n\t"
     // Include two loop body, to reduce loop time.
     SAD_SRC_REF_ABS_SUB_32
@@ -469,6 +462,7 @@ static inline unsigned int vpx_sad32x(const uint8_t *src, int src_stride,
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
   );
+  /* clang-format on */
 
   return sad;
 }
@@ -493,9 +487,11 @@ static inline unsigned int vpx_sad_avg32x(const uint8_t *src, int src_stride,
   unsigned int sad;
   double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
   mips_reg l_counter = counter;
+  mips_reg l_second_pred = (mips_reg)second_pred;
 
+  /* clang-format off */
   __asm__ volatile (
-    "xor        %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
+    "pxor       %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
     "1:                                                         \n\t"
     // Include two loop body, to reduce loop time.
     SAD_SRC_AVGREF_ABS_SUB_32
@@ -512,11 +508,12 @@ static inline unsigned int vpx_sad_avg32x(const uint8_t *src, int src_stride,
     : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
       [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
       [src]"+&r"(src), [ref]"+&r"(ref),
-      [second_pred]"+&r"((mips_reg)second_pred),
+      [second_pred]"+&r"(l_second_pred),
       [sad]"=&r"(sad)
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
   );
+  /* clang-format on */
 
   return sad;
 }
@@ -539,8 +536,9 @@ static inline unsigned int vpx_sad16x(const uint8_t *src, int src_stride,
   double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
   mips_reg l_counter = counter;
 
+  /* clang-format off */
   __asm__ volatile (
-    "xor        %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
+    "pxor       %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
     "1:                                                         \n\t"
     // Include two loop body, to reduce loop time.
     SAD_SRC_REF_ABS_SUB_16
@@ -558,6 +556,7 @@ static inline unsigned int vpx_sad16x(const uint8_t *src, int src_stride,
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
   );
+  /* clang-format on */
 
   return sad;
 }
@@ -571,10 +570,6 @@ static inline unsigned int vpx_sad16x(const uint8_t *src, int src_stride,
 vpx_sad16xN(32);
 vpx_sad16xN(16);
 vpx_sad16xN(8);
-sadMxNxK_mmi(16, 16, 3);
-sadMxNxK_mmi(16, 16, 8);
-sadMxNxK_mmi(16, 8, 3);
-sadMxNxK_mmi(16, 8, 8);
 sadMxNx4D_mmi(16, 32);
 sadMxNx4D_mmi(16, 16);
 sadMxNx4D_mmi(16, 8);
@@ -586,9 +581,11 @@ static inline unsigned int vpx_sad_avg16x(const uint8_t *src, int src_stride,
   unsigned int sad;
   double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
   mips_reg l_counter = counter;
+  mips_reg l_second_pred = (mips_reg)second_pred;
 
+  /* clang-format off */
   __asm__ volatile (
-    "xor        %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
+    "pxor       %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
     "1:                                                         \n\t"
     // Include two loop body, to reduce loop time.
     SAD_SRC_AVGREF_ABS_SUB_16
@@ -605,11 +602,12 @@ static inline unsigned int vpx_sad_avg16x(const uint8_t *src, int src_stride,
     : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
       [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
       [src]"+&r"(src), [ref]"+&r"(ref),
-      [second_pred]"+&r"((mips_reg)second_pred),
+      [second_pred]"+&r"(l_second_pred),
       [sad]"=&r"(sad)
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
   );
+  /* clang-format on */
 
   return sad;
 }
@@ -632,8 +630,9 @@ static inline unsigned int vpx_sad8x(const uint8_t *src, int src_stride,
   double ftmp1, ftmp2, ftmp3;
   mips_reg l_counter = counter;
 
+  /* clang-format off */
   __asm__ volatile (
-    "xor        %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"
+    "pxor       %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"
     "1:                                                         \n\t"
     // Include two loop body, to reduce loop time.
     SAD_SRC_REF_ABS_SUB_8
@@ -651,6 +650,7 @@ static inline unsigned int vpx_sad8x(const uint8_t *src, int src_stride,
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
   );
+  /* clang-format on */
 
   return sad;
 }
@@ -664,10 +664,6 @@ static inline unsigned int vpx_sad8x(const uint8_t *src, int src_stride,
 vpx_sad8xN(16);
 vpx_sad8xN(8);
 vpx_sad8xN(4);
-sadMxNxK_mmi(8, 16, 3);
-sadMxNxK_mmi(8, 16, 8);
-sadMxNxK_mmi(8, 8, 3);
-sadMxNxK_mmi(8, 8, 8);
 sadMxNx4D_mmi(8, 16);
 sadMxNx4D_mmi(8, 8);
 sadMxNx4D_mmi(8, 4);
@@ -679,9 +675,11 @@ static inline unsigned int vpx_sad_avg8x(const uint8_t *src, int src_stride,
   unsigned int sad;
   double ftmp1, ftmp2, ftmp3;
   mips_reg l_counter = counter;
+  mips_reg l_second_pred = (mips_reg)second_pred;
 
+  /* clang-format off */
   __asm__ volatile (
-    "xor        %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"
+    "pxor       %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"
     "1:                                                         \n\t"
     // Include two loop body, to reduce loop time.
     SAD_SRC_AVGREF_ABS_SUB_8
@@ -697,11 +695,12 @@ static inline unsigned int vpx_sad_avg8x(const uint8_t *src, int src_stride,
     "mfc1       %[sad],     %[ftmp3]                            \n\t"
     : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
       [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref),
-      [second_pred]"+&r"((mips_reg)second_pred),
+      [second_pred]"+&r"(l_second_pred),
       [sad]"=&r"(sad)
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
   );
+  /* clang-format on */
 
   return sad;
 }
@@ -724,8 +723,9 @@ static inline unsigned int vpx_sad4x(const uint8_t *src, int src_stride,
   double ftmp1, ftmp2, ftmp3;
   mips_reg l_counter = counter;
 
+  /* clang-format off */
   __asm__ volatile (
-    "xor        %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"
+    "pxor       %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"
     "1:                                                         \n\t"
     // Include two loop body, to reduce loop time.
     SAD_SRC_REF_ABS_SUB_4
@@ -743,6 +743,7 @@ static inline unsigned int vpx_sad4x(const uint8_t *src, int src_stride,
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
   );
+  /* clang-format on */
 
   return sad;
 }
@@ -755,8 +756,6 @@ static inline unsigned int vpx_sad4x(const uint8_t *src, int src_stride,
 
 vpx_sad4xN(8);
 vpx_sad4xN(4);
-sadMxNxK_mmi(4, 4, 3);
-sadMxNxK_mmi(4, 4, 8);
 sadMxNx4D_mmi(4, 8);
 sadMxNx4D_mmi(4, 4);
 
@@ -767,9 +766,11 @@ static inline unsigned int vpx_sad_avg4x(const uint8_t *src, int src_stride,
   unsigned int sad;
   double ftmp1, ftmp2, ftmp3;
   mips_reg l_counter = counter;
+  mips_reg l_second_pred = (mips_reg)second_pred;
 
+  /* clang-format off */
   __asm__ volatile (
-    "xor        %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"
+    "pxor       %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"
     "1:                                                         \n\t"
     // Include two loop body, to reduce loop time.
     SAD_SRC_AVGREF_ABS_SUB_4
@@ -785,11 +786,12 @@ static inline unsigned int vpx_sad_avg4x(const uint8_t *src, int src_stride,
     "mfc1       %[sad],     %[ftmp3]                            \n\t"
     : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
       [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref),
-      [second_pred]"+&r"((mips_reg)second_pred),
+      [second_pred]"+&r"(l_second_pred),
       [sad]"=&r"(sad)
     : [src_stride]"r"((mips_reg)src_stride),
       [ref_stride]"r"((mips_reg)ref_stride)
   );
+  /* clang-format on */
 
   return sad;
 }
diff --git a/vpx_dsp/mips/sad_msa.c b/vpx_dsp/mips/sad_msa.c
index ab681ae9f8..b0f8ff1fd9 100644
--- a/vpx_dsp/mips/sad_msa.c
+++ b/vpx_dsp/mips/sad_msa.c
@@ -159,380 +159,6 @@ static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride,
   return sad;
 }
 
-static void sad_4width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
-                              const uint8_t *ref_ptr, int32_t ref_stride,
-                              int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  uint32_t src0, src1, src2, src3;
-  v16u8 src = { 0 };
-  v16u8 ref = { 0 };
-  v16u8 ref0, ref1, ref2, ref3, diff;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LW4(src_ptr, src_stride, src0, src1, src2, src3);
-    src_ptr += (4 * src_stride);
-    INSERT_W4_UB(src0, src1, src2, src3, src);
-
-    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
-    ref_ptr += (4 * ref_stride);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad0 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad1 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad2 += __msa_hadd_u_h(diff, diff);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-}
-
-static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride,
-                              const uint8_t *ref, int32_t ref_stride,
-                              int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, src2, src3;
-  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LD_UB4(src, src_stride, src0, src1, src2, src3);
-    src += (4 * src_stride);
-    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
-    ref += (4 * ref_stride);
-    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
-                ref0, ref1);
-    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-}
-
-static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
-                               const uint8_t *ref_ptr, int32_t ref_stride,
-                               int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  v16u8 src, ref, ref0, ref1, diff;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-
-  for (ht_cnt = (height >> 1); ht_cnt--;) {
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-
-    diff = __msa_asub_u_b(src, ref0);
-    sad0 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
-    diff = __msa_asub_u_b(src, ref);
-    sad1 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
-    diff = __msa_asub_u_b(src, ref);
-    sad2 += __msa_hadd_u_h(diff, diff);
-
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-
-    diff = __msa_asub_u_b(src, ref0);
-    sad0 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
-    diff = __msa_asub_u_b(src, ref);
-    sad1 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
-    diff = __msa_asub_u_b(src, ref);
-    sad2 += __msa_hadd_u_h(diff, diff);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-}
-
-static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
-                              const uint8_t *ref_ptr, int32_t ref_stride,
-                              int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  uint32_t src0, src1, src2, src3;
-  v16u8 ref0, ref1, ref2, ref3, diff;
-  v16u8 src = { 0 };
-  v16u8 ref = { 0 };
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-  v8u16 sad3 = { 0 };
-  v8u16 sad4 = { 0 };
-  v8u16 sad5 = { 0 };
-  v8u16 sad6 = { 0 };
-  v8u16 sad7 = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LW4(src_ptr, src_stride, src0, src1, src2, src3);
-    INSERT_W4_UB(src0, src1, src2, src3, src);
-    src_ptr += (4 * src_stride);
-    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
-    ref_ptr += (4 * ref_stride);
-
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad0 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad1 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad2 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad3 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad4 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad5 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad6 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad7 += __msa_hadd_u_h(diff, diff);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-  sad_array[3] = HADD_UH_U32(sad3);
-  sad_array[4] = HADD_UH_U32(sad4);
-  sad_array[5] = HADD_UH_U32(sad5);
-  sad_array[6] = HADD_UH_U32(sad6);
-  sad_array[7] = HADD_UH_U32(sad7);
-}
-
-static void sad_8width_x8_msa(const uint8_t *src, int32_t src_stride,
-                              const uint8_t *ref, int32_t ref_stride,
-                              int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, src2, src3;
-  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-  v8u16 sad3 = { 0 };
-  v8u16 sad4 = { 0 };
-  v8u16 sad5 = { 0 };
-  v8u16 sad6 = { 0 };
-  v8u16 sad7 = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LD_UB4(src, src_stride, src0, src1, src2, src3);
-    src += (4 * src_stride);
-    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
-    ref += (4 * ref_stride);
-    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
-                ref0, ref1);
-    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-  sad_array[3] = HADD_UH_U32(sad3);
-  sad_array[4] = HADD_UH_U32(sad4);
-  sad_array[5] = HADD_UH_U32(sad5);
-  sad_array[6] = HADD_UH_U32(sad6);
-  sad_array[7] = HADD_UH_U32(sad7);
-}
-
-static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
-                               const uint8_t *ref_ptr, int32_t ref_stride,
-                               int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  v16u8 src, ref0, ref1, ref;
-  v16u8 diff;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-  v8u16 sad3 = { 0 };
-  v8u16 sad4 = { 0 };
-  v8u16 sad5 = { 0 };
-  v8u16 sad6 = { 0 };
-  v8u16 sad7 = { 0 };
-
-  for (ht_cnt = (height >> 1); ht_cnt--;) {
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-
-    diff = __msa_asub_u_b(src, ref0);
-    sad0 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
-    diff = __msa_asub_u_b(src, ref);
-    sad1 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
-    diff = __msa_asub_u_b(src, ref);
-    sad2 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
-    diff = __msa_asub_u_b(src, ref);
-    sad3 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
-    diff = __msa_asub_u_b(src, ref);
-    sad4 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
-    diff = __msa_asub_u_b(src, ref);
-    sad5 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
-    diff = __msa_asub_u_b(src, ref);
-    sad6 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
-    diff = __msa_asub_u_b(src, ref);
-    sad7 += __msa_hadd_u_h(diff, diff);
-
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-
-    diff = __msa_asub_u_b(src, ref0);
-    sad0 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
-    diff = __msa_asub_u_b(src, ref);
-    sad1 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
-    diff = __msa_asub_u_b(src, ref);
-    sad2 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
-    diff = __msa_asub_u_b(src, ref);
-    sad3 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
-    diff = __msa_asub_u_b(src, ref);
-    sad4 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
-    diff = __msa_asub_u_b(src, ref);
-    sad5 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
-    diff = __msa_asub_u_b(src, ref);
-    sad6 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
-    diff = __msa_asub_u_b(src, ref);
-    sad7 += __msa_hadd_u_h(diff, diff);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-  sad_array[3] = HADD_UH_U32(sad3);
-  sad_array[4] = HADD_UH_U32(sad4);
-  sad_array[5] = HADD_UH_U32(sad5);
-  sad_array[6] = HADD_UH_U32(sad6);
-  sad_array[7] = HADD_UH_U32(sad7);
-}
-
 static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
@@ -1037,80 +663,38 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
     return sad_64width_msa(src, src_stride, ref, ref_stride, height);         \
   }
 
-#define VPX_SAD_4xHEIGHTx3_MSA(height)                                   \
-  void vpx_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
-                                 const uint8_t *ref, int32_t ref_stride, \
-                                 uint32_t *sads) {                       \
-    sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
-#define VPX_SAD_8xHEIGHTx3_MSA(height)                                   \
-  void vpx_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
-                                 const uint8_t *ref, int32_t ref_stride, \
-                                 uint32_t *sads) {                       \
-    sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
-#define VPX_SAD_16xHEIGHTx3_MSA(height)                                   \
-  void vpx_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
-                                  const uint8_t *ref, int32_t ref_stride, \
-                                  uint32_t *sads) {                       \
-    sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
-#define VPX_SAD_4xHEIGHTx8_MSA(height)                                   \
-  void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
-                                 const uint8_t *ref, int32_t ref_stride, \
-                                 uint32_t *sads) {                       \
-    sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
-#define VPX_SAD_8xHEIGHTx8_MSA(height)                                   \
-  void vpx_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
-                                 const uint8_t *ref, int32_t ref_stride, \
-                                 uint32_t *sads) {                       \
-    sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
-#define VPX_SAD_16xHEIGHTx8_MSA(height)                                   \
-  void vpx_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
-                                  const uint8_t *ref, int32_t ref_stride, \
-                                  uint32_t *sads) {                       \
-    sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
 #define VPX_SAD_4xHEIGHTx4D_MSA(height)                                   \
   void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
-                                  const uint8_t *const refs[],            \
-                                  int32_t ref_stride, uint32_t *sads) {   \
+                                  const uint8_t *const refs[4],           \
+                                  int32_t ref_stride, uint32_t sads[4]) { \
     sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
   }
 
 #define VPX_SAD_8xHEIGHTx4D_MSA(height)                                   \
   void vpx_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
-                                  const uint8_t *const refs[],            \
-                                  int32_t ref_stride, uint32_t *sads) {   \
+                                  const uint8_t *const refs[4],           \
+                                  int32_t ref_stride, uint32_t sads[4]) { \
     sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
   }
 
 #define VPX_SAD_16xHEIGHTx4D_MSA(height)                                   \
   void vpx_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
-                                   const uint8_t *const refs[],            \
-                                   int32_t ref_stride, uint32_t *sads) {   \
+                                   const uint8_t *const refs[4],           \
+                                   int32_t ref_stride, uint32_t sads[4]) { \
     sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
   }
 
 #define VPX_SAD_32xHEIGHTx4D_MSA(height)                                   \
   void vpx_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
-                                   const uint8_t *const refs[],            \
-                                   int32_t ref_stride, uint32_t *sads) {   \
+                                   const uint8_t *const refs[4],           \
+                                   int32_t ref_stride, uint32_t sads[4]) { \
     sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
   }
 
 #define VPX_SAD_64xHEIGHTx4D_MSA(height)                                   \
   void vpx_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
-                                   const uint8_t *const refs[],            \
-                                   int32_t ref_stride, uint32_t *sads) {   \
+                                   const uint8_t *const refs[4],           \
+                                   int32_t ref_stride, uint32_t sads[4]) { \
     sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
   }
 
@@ -1186,29 +770,21 @@ VPX_AVGSAD_16xHEIGHT_MSA(32);
 
 // 16x16
 VPX_SAD_16xHEIGHT_MSA(16);
-VPX_SAD_16xHEIGHTx3_MSA(16);
-VPX_SAD_16xHEIGHTx8_MSA(16);
 VPX_SAD_16xHEIGHTx4D_MSA(16);
 VPX_AVGSAD_16xHEIGHT_MSA(16);
 
 // 16x8
 VPX_SAD_16xHEIGHT_MSA(8);
-VPX_SAD_16xHEIGHTx3_MSA(8);
-VPX_SAD_16xHEIGHTx8_MSA(8);
 VPX_SAD_16xHEIGHTx4D_MSA(8);
 VPX_AVGSAD_16xHEIGHT_MSA(8);
 
 // 8x16
 VPX_SAD_8xHEIGHT_MSA(16);
-VPX_SAD_8xHEIGHTx3_MSA(16);
-VPX_SAD_8xHEIGHTx8_MSA(16);
 VPX_SAD_8xHEIGHTx4D_MSA(16);
 VPX_AVGSAD_8xHEIGHT_MSA(16);
 
 // 8x8
 VPX_SAD_8xHEIGHT_MSA(8);
-VPX_SAD_8xHEIGHTx3_MSA(8);
-VPX_SAD_8xHEIGHTx8_MSA(8);
 VPX_SAD_8xHEIGHTx4D_MSA(8);
 VPX_AVGSAD_8xHEIGHT_MSA(8);
 
@@ -1224,7 +800,5 @@ VPX_AVGSAD_4xHEIGHT_MSA(8);
 
 // 4x4
 VPX_SAD_4xHEIGHT_MSA(4);
-VPX_SAD_4xHEIGHTx3_MSA(4);
-VPX_SAD_4xHEIGHTx8_MSA(4);
 VPX_SAD_4xHEIGHTx4D_MSA(4);
 VPX_AVGSAD_4xHEIGHT_MSA(4);
diff --git a/vpx_dsp/mips/subtract_mmi.c b/vpx_dsp/mips/subtract_mmi.c
index 9f361704a8..8bd7e6977c 100644
--- a/vpx_dsp/mips/subtract_mmi.c
+++ b/vpx_dsp/mips/subtract_mmi.c
@@ -24,7 +24,7 @@ void vpx_subtract_block_mmi(int rows, int cols, int16_t *diff,
     switch (rows) {
       case 4:
         __asm__ volatile(
-            "xor        %[ftmp0],   %[ftmp0],           %[ftmp0]        \n\t"
+            "pxor       %[ftmp0],   %[ftmp0],           %[ftmp0]        \n\t"
 #if _MIPS_SIM == _ABIO32
             "ulw        %[tmp0],    0x00(%[src])                        \n\t"
             "mtc1       %[tmp0],    %[ftmp1]                            \n\t"
@@ -118,7 +118,7 @@ void vpx_subtract_block_mmi(int rows, int cols, int16_t *diff,
         break;
       case 8:
         __asm__ volatile(
-            "xor        %[ftmp0],   %[ftmp0],           %[ftmp0]        \n\t"
+            "pxor       %[ftmp0],   %[ftmp0],           %[ftmp0]        \n\t"
             "li         %[tmp0],    0x02                                \n\t"
             "1:                                                         \n\t"
             "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t"
@@ -206,7 +206,7 @@ void vpx_subtract_block_mmi(int rows, int cols, int16_t *diff,
         break;
       case 16:
         __asm__ volatile(
-            "xor        %[ftmp0],   %[ftmp0],           %[ftmp0]        \n\t"
+            "pxor       %[ftmp0],   %[ftmp0],           %[ftmp0]        \n\t"
             "li         %[tmp0],    0x08                                \n\t"
             "1:                                                         \n\t"
             "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t"
diff --git a/vpx_dsp/mips/variance_mmi.c b/vpx_dsp/mips/variance_mmi.c
index c1780c33af..c2adcfa018 100644
--- a/vpx_dsp/mips/variance_mmi.c
+++ b/vpx_dsp/mips/variance_mmi.c
@@ -150,7 +150,7 @@ static const uint8_t bilinear_filters[8][2] = {
   "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp6]            \n\t" \
                                                                     \
   /* store: temp2[0] ~ temp2[3] */                                  \
-  "and        %[ftmp2],   %[ftmp2],       %[mask]             \n\t" \
+  "pand       %[ftmp2],   %[ftmp2],       %[mask]             \n\t" \
   "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t" \
   "gssdrc1    %[ftmp2],   0x00(%[temp2_ptr])                  \n\t"
 
@@ -163,7 +163,7 @@ static const uint8_t bilinear_filters[8][2] = {
   "psrlh      %[ftmp4],   %[ftmp4],       %[ftmp6]            \n\t" \
                                                                     \
   /* store: temp2[0] ~ temp2[3] */                                  \
-  "and        %[ftmp4],   %[ftmp4],       %[mask]             \n\t" \
+  "pand       %[ftmp4],   %[ftmp4],       %[mask]             \n\t" \
   "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t" \
   "gssdrc1    %[ftmp4],   0x00(%[temp2_ptr])                  \n\t"
 
@@ -225,8 +225,8 @@ static const uint8_t bilinear_filters[8][2] = {
   "psrlh      %[ftmp3],   %[ftmp3],       %[ftmp14]           \n\t" \
                                                                     \
   /* store: temp2[0] ~ temp2[7] */                                  \
-  "and        %[ftmp2],   %[ftmp2],       %[mask]             \n\t" \
-  "and        %[ftmp3],   %[ftmp3],       %[mask]             \n\t" \
+  "pand       %[ftmp2],   %[ftmp2],       %[mask]             \n\t" \
+  "pand       %[ftmp3],   %[ftmp3],       %[mask]             \n\t" \
   "packushb   %[ftmp2],   %[ftmp2],       %[ftmp3]            \n\t" \
   "gssdlc1    %[ftmp2],   0x07(%[temp2_ptr])                  \n\t" \
   "gssdrc1    %[ftmp2],   0x00(%[temp2_ptr])                  \n\t"
@@ -247,8 +247,8 @@ static const uint8_t bilinear_filters[8][2] = {
   "psrlh      %[ftmp9],   %[ftmp9],       %[ftmp14]           \n\t" \
                                                                     \
   /* store: temp2[0] ~ temp2[7] */                                  \
-  "and        %[ftmp8],   %[ftmp8],       %[mask]             \n\t" \
-  "and        %[ftmp9],   %[ftmp9],       %[mask]             \n\t" \
+  "pand       %[ftmp8],   %[ftmp8],       %[mask]             \n\t" \
+  "pand       %[ftmp9],   %[ftmp9],       %[mask]             \n\t" \
   "packushb   %[ftmp8],   %[ftmp8],       %[ftmp9]            \n\t" \
   "gssdlc1    %[ftmp8],   0x07(%[temp2_ptr])                  \n\t" \
   "gssdrc1    %[ftmp8],   0x00(%[temp2_ptr])                  \n\t"
@@ -319,8 +319,8 @@ static const uint8_t bilinear_filters[8][2] = {
   "psrlh      %[ftmp5],   %[ftmp5],       %[ftmp14]           \n\t" \
                                                                     \
   /* store: temp2[8] ~ temp2[15] */                                 \
-  "and        %[ftmp4],   %[ftmp4],       %[mask]             \n\t" \
-  "and        %[ftmp5],   %[ftmp5],       %[mask]             \n\t" \
+  "pand       %[ftmp4],   %[ftmp4],       %[mask]             \n\t" \
+  "pand       %[ftmp5],   %[ftmp5],       %[mask]             \n\t" \
   "packushb   %[ftmp4],   %[ftmp4],       %[ftmp5]            \n\t" \
   "gssdlc1    %[ftmp4],   0x0f(%[temp2_ptr])                  \n\t" \
   "gssdrc1    %[ftmp4],   0x08(%[temp2_ptr])                  \n\t"
@@ -343,8 +343,8 @@ static const uint8_t bilinear_filters[8][2] = {
   "psrlh      %[ftmp11],  %[ftmp11],      %[ftmp14]           \n\t" \
                                                                     \
   /* store: temp2[8] ~ temp2[15] */                                 \
-  "and        %[ftmp10],  %[ftmp10],      %[mask]             \n\t" \
-  "and        %[ftmp11],  %[ftmp11],      %[mask]             \n\t" \
+  "pand       %[ftmp10],  %[ftmp10],      %[mask]             \n\t" \
+  "pand       %[ftmp11],  %[ftmp11],      %[mask]             \n\t" \
   "packushb   %[ftmp10],  %[ftmp10],      %[ftmp11]           \n\t" \
   "gssdlc1    %[ftmp10],  0x0f(%[temp2_ptr])                  \n\t" \
   "gssdrc1    %[ftmp10],  0x08(%[temp2_ptr])                  \n\t"
@@ -414,13 +414,14 @@ static inline uint32_t vpx_variance64x(const uint8_t *src_ptr, int src_stride,
 
   *sse = 0;
 
+  /* clang-format off */
   __asm__ volatile (
     "li         %[tmp0],    0x20                                \n\t"
     "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
     MMI_L(%[tmp0], %[high], 0x00)
-    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
-    "xor        %[ftmp9],   %[ftmp9],       %[ftmp9]            \n\t"
-    "xor        %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    "pxor       %[ftmp9],   %[ftmp9],       %[ftmp9]            \n\t"
+    "pxor       %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"
     "1:                                                         \n\t"
     "gsldlc1    %[ftmp1],   0x07(%[src_ptr])                    \n\t"
     "gsldrc1    %[ftmp1],   0x00(%[src_ptr])                    \n\t"
@@ -478,7 +479,7 @@ static inline uint32_t vpx_variance64x(const uint8_t *src_ptr, int src_stride,
     "mfc1       %[tmp1],    %[ftmp9]                            \n\t"
     "mfhc1      %[tmp2],    %[ftmp9]                            \n\t"
     "addu       %[sum],     %[tmp1],        %[tmp2]             \n\t"
-    "dsrl       %[ftmp1],   %[ftmp10],      %[ftmp11]           \n\t"
+    "ssrld      %[ftmp1],   %[ftmp10],      %[ftmp11]           \n\t"
     "paddw      %[ftmp1],   %[ftmp1],       %[ftmp10]           \n\t"
     "swc1       %[ftmp1],   0x00(%[sse])                        \n\t"
     : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
@@ -496,6 +497,7 @@ static inline uint32_t vpx_variance64x(const uint8_t *src_ptr, int src_stride,
       [high]"r"(&high), [sse]"r"(sse)
     : "memory"
   );
+  /* clang-format on */
 
   return *sse - (((int64_t)sum * sum) / (64 * high));
 }
@@ -519,13 +521,14 @@ uint32_t vpx_variance32x64_mmi(const uint8_t *src_ptr, int src_stride,
 
   *sse = 0;
 
+  /* clang-format off */
   __asm__ volatile (
     "li         %[tmp0],    0x20                                \n\t"
     "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
     "li         %[tmp0],    0x40                                \n\t"
-    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
-    "xor        %[ftmp9],   %[ftmp9],       %[ftmp9]            \n\t"
-    "xor        %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    "pxor       %[ftmp9],   %[ftmp9],       %[ftmp9]            \n\t"
+    "pxor       %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"
     "1:                                                         \n\t"
     "gsldlc1    %[ftmp1],   0x07(%[src_ptr])                    \n\t"
     "gsldrc1    %[ftmp1],   0x00(%[src_ptr])                    \n\t"
@@ -559,7 +562,7 @@ uint32_t vpx_variance32x64_mmi(const uint8_t *src_ptr, int src_stride,
     "mfc1       %[tmp1],    %[ftmp9]                            \n\t"
     "mfhc1      %[tmp2],    %[ftmp9]                            \n\t"
     "addu       %[sum],     %[tmp1],        %[tmp2]             \n\t"
-    "dsrl       %[ftmp1],   %[ftmp10],      %[ftmp11]           \n\t"
+    "ssrld      %[ftmp1],   %[ftmp10],      %[ftmp11]           \n\t"
     "paddw      %[ftmp1],   %[ftmp1],       %[ftmp10]           \n\t"
     "swc1       %[ftmp1],   0x00(%[sse])                        \n\t"
     : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
@@ -577,6 +580,7 @@ uint32_t vpx_variance32x64_mmi(const uint8_t *src_ptr, int src_stride,
       [sse]"r"(sse)
     : "memory"
   );
+  /* clang-format on */
 
   return *sse - (((int64_t)sum * sum) / 2048);
 }
@@ -590,14 +594,15 @@ static inline uint32_t vpx_variance32x(const uint8_t *src_ptr, int src_stride,
 
   *sse = 0;
 
+  /* clang-format off */
   __asm__ volatile (
     "li         %[tmp0],    0x20                                \n\t"
     "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
     MMI_L(%[tmp0], %[high], 0x00)
-    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
-    "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
-    "xor        %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"
-    "xor        %[ftmp12],  %[ftmp12],      %[ftmp12]           \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    "pxor       %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
+    "pxor       %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"
+    "pxor       %[ftmp12],  %[ftmp12],      %[ftmp12]           \n\t"
     "1:                                                         \n\t"
     "gsldlc1    %[ftmp1],   0x07(%[src_ptr])                    \n\t"
     "gsldrc1    %[ftmp1],   0x00(%[src_ptr])                    \n\t"
@@ -625,7 +630,7 @@ static inline uint32_t vpx_variance32x(const uint8_t *src_ptr, int src_stride,
     MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
     "bnez       %[tmp0],    1b                                  \n\t"
 
-    "dsrl       %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"
+    "ssrld      %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"
     "paddw      %[ftmp9],   %[ftmp9],       %[ftmp8]            \n\t"
     "swc1       %[ftmp9],   0x00(%[sse])                        \n\t"
 
@@ -636,7 +641,7 @@ static inline uint32_t vpx_variance32x(const uint8_t *src_ptr, int src_stride,
     "paddw      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"
     "psubw      %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
     "psubw      %[ftmp3],   %[ftmp3],       %[ftmp6]            \n\t"
-    "dsrl       %[ftmp0],   %[ftmp3],       %[ftmp11]           \n\t"
+    "ssrld      %[ftmp0],   %[ftmp3],       %[ftmp11]           \n\t"
     "paddw      %[ftmp0],   %[ftmp0],       %[ftmp3]            \n\t"
     "swc1       %[ftmp0],   0x00(%[sum])                        \n\t"
 
@@ -653,6 +658,7 @@ static inline uint32_t vpx_variance32x(const uint8_t *src_ptr, int src_stride,
       [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
     : "memory"
   );
+  /* clang-format on */
 
   return *sse - (((int64_t)sum * sum) / (32 * high));
 }
@@ -676,14 +682,15 @@ static inline uint32_t vpx_variance16x(const uint8_t *src_ptr, int src_stride,
 
   *sse = 0;
 
+  /* clang-format off */
   __asm__ volatile (
     "li         %[tmp0],    0x20                                \n\t"
     "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
     MMI_L(%[tmp0], %[high], 0x00)
-    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
-    "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
-    "xor        %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"
-    "xor        %[ftmp12],  %[ftmp12],      %[ftmp12]           \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    "pxor       %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
+    "pxor       %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"
+    "pxor       %[ftmp12],  %[ftmp12],      %[ftmp12]           \n\t"
     "1:                                                         \n\t"
     "gsldlc1    %[ftmp1],   0x07(%[src_ptr])                    \n\t"
     "gsldrc1    %[ftmp1],   0x00(%[src_ptr])                    \n\t"
@@ -701,7 +708,7 @@ static inline uint32_t vpx_variance16x(const uint8_t *src_ptr, int src_stride,
     MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
     "bnez       %[tmp0],    1b                                  \n\t"
 
-    "dsrl       %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"
+    "ssrld      %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"
     "paddw      %[ftmp9],   %[ftmp9],       %[ftmp8]            \n\t"
     "swc1       %[ftmp9],   0x00(%[sse])                        \n\t"
 
@@ -712,7 +719,7 @@ static inline uint32_t vpx_variance16x(const uint8_t *src_ptr, int src_stride,
     "paddw      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"
     "psubw      %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
     "psubw      %[ftmp3],   %[ftmp3],       %[ftmp6]            \n\t"
-    "dsrl       %[ftmp0],   %[ftmp3],       %[ftmp11]           \n\t"
+    "ssrld      %[ftmp0],   %[ftmp3],       %[ftmp11]           \n\t"
     "paddw      %[ftmp0],   %[ftmp0],       %[ftmp3]            \n\t"
     "swc1       %[ftmp0],   0x00(%[sum])                        \n\t"
 
@@ -729,6 +736,7 @@ static inline uint32_t vpx_variance16x(const uint8_t *src_ptr, int src_stride,
       [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
     : "memory"
   );
+  /* clang-format on */
 
   return *sse - (((int64_t)sum * sum) / (16 * high));
 }
@@ -753,14 +761,15 @@ static inline uint32_t vpx_variance8x(const uint8_t *src_ptr, int src_stride,
 
   *sse = 0;
 
+  /* clang-format off */
   __asm__ volatile (
     "li         %[tmp0],    0x20                                \n\t"
     "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
     MMI_L(%[tmp0], %[high], 0x00)
-    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
-    "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
-    "xor        %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"
-    "xor        %[ftmp12],  %[ftmp12],      %[ftmp12]           \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    "pxor       %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
+    "pxor       %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"
+    "pxor       %[ftmp12],  %[ftmp12],      %[ftmp12]           \n\t"
     "1:                                                         \n\t"
     "gsldlc1    %[ftmp1],   0x07(%[src_ptr])                    \n\t"
     "gsldrc1    %[ftmp1],   0x00(%[src_ptr])                    \n\t"
@@ -773,7 +782,7 @@ static inline uint32_t vpx_variance8x(const uint8_t *src_ptr, int src_stride,
     MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
     "bnez       %[tmp0],    1b                                  \n\t"
 
-    "dsrl       %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"
+    "ssrld      %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"
     "paddw      %[ftmp9],   %[ftmp9],       %[ftmp8]            \n\t"
     "swc1       %[ftmp9],   0x00(%[sse])                        \n\t"
 
@@ -784,7 +793,7 @@ static inline uint32_t vpx_variance8x(const uint8_t *src_ptr, int src_stride,
     "paddw      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"
     "psubw      %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
     "psubw      %[ftmp3],   %[ftmp3],       %[ftmp6]            \n\t"
-    "dsrl       %[ftmp0],   %[ftmp3],       %[ftmp11]           \n\t"
+    "ssrld      %[ftmp0],   %[ftmp3],       %[ftmp11]           \n\t"
     "paddw      %[ftmp0],   %[ftmp0],       %[ftmp3]            \n\t"
     "swc1       %[ftmp0],   0x00(%[sum])                        \n\t"
 
@@ -801,6 +810,7 @@ static inline uint32_t vpx_variance8x(const uint8_t *src_ptr, int src_stride,
       [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
     : "memory"
   );
+  /* clang-format on */
 
   return *sse - (((int64_t)sum * sum) / (8 * high));
 }
@@ -825,14 +835,15 @@ static inline uint32_t vpx_variance4x(const uint8_t *src_ptr, int src_stride,
 
   *sse = 0;
 
+  /* clang-format off */
   __asm__ volatile (
     "li         %[tmp0],    0x20                                \n\t"
     "mtc1       %[tmp0],    %[ftmp10]                           \n\t"
     MMI_L(%[tmp0], %[high], 0x00)
-    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
-    "xor        %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
-    "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]            \n\t"
-    "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    "pxor       %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+    "pxor       %[ftmp7],   %[ftmp7],       %[ftmp7]            \n\t"
+    "pxor       %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
     "1:                                                         \n\t"
     "gsldlc1    %[ftmp1],   0x07(%[src_ptr])                    \n\t"
     "gsldrc1    %[ftmp1],   0x00(%[src_ptr])                    \n\t"
@@ -845,7 +856,7 @@ static inline uint32_t vpx_variance4x(const uint8_t *src_ptr, int src_stride,
     MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
     "bnez       %[tmp0],    1b                                  \n\t"
 
-    "dsrl       %[ftmp9],   %[ftmp6],       %[ftmp10]           \n\t"
+    "ssrld      %[ftmp9],   %[ftmp6],       %[ftmp10]           \n\t"
     "paddw      %[ftmp9],   %[ftmp9],       %[ftmp6]            \n\t"
     "swc1       %[ftmp9],   0x00(%[sse])                        \n\t"
 
@@ -856,7 +867,7 @@ static inline uint32_t vpx_variance4x(const uint8_t *src_ptr, int src_stride,
     "paddw      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"
     "psubw      %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
     "psubw      %[ftmp3],   %[ftmp3],       %[ftmp6]            \n\t"
-    "dsrl       %[ftmp0],   %[ftmp3],       %[ftmp10]           \n\t"
+    "ssrld      %[ftmp0],   %[ftmp3],       %[ftmp10]           \n\t"
     "paddw      %[ftmp0],   %[ftmp0],       %[ftmp3]            \n\t"
     "swc1       %[ftmp0],   0x00(%[sum])                        \n\t"
     : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
@@ -872,6 +883,7 @@ static inline uint32_t vpx_variance4x(const uint8_t *src_ptr, int src_stride,
       [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
     : "memory"
   );
+  /* clang-format on */
 
   return *sse - (((int64_t)sum * sum) / (4 * high));
 }
@@ -894,12 +906,13 @@ static inline uint32_t vpx_mse16x(const uint8_t *src_ptr, int src_stride,
 
   *sse = 0;
 
+  /* clang-format off */
   __asm__ volatile (
     "li         %[tmp0],    0x20                                \n\t"
     "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
     MMI_L(%[tmp0], %[high], 0x00)
-    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
-    "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    "pxor       %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
 
     "1:                                                         \n\t"
     VARIANCE_SSE_16
@@ -909,7 +922,7 @@ static inline uint32_t vpx_mse16x(const uint8_t *src_ptr, int src_stride,
     MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
     "bnez       %[tmp0],    1b                                  \n\t"
 
-    "dsrl       %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"
+    "ssrld      %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"
     "paddw      %[ftmp9],   %[ftmp9],       %[ftmp8]            \n\t"
     "swc1       %[ftmp9],   0x00(%[sse])                        \n\t"
     : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
@@ -925,6 +938,7 @@ static inline uint32_t vpx_mse16x(const uint8_t *src_ptr, int src_stride,
       [high]"r"(&high), [sse]"r"(sse)
     : "memory"
   );
+  /* clang-format on */
 
   return *sse;
 }
@@ -947,12 +961,13 @@ static inline uint32_t vpx_mse8x(const uint8_t *src_ptr, int src_stride,
 
   *sse = 0;
 
+  /* clang-format off */
   __asm__ volatile (
     "li         %[tmp0],    0x20                                \n\t"
     "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
     MMI_L(%[tmp0], %[high], 0x00)
-    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
-    "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    "pxor       %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
 
     "1:                                                         \n\t"
     VARIANCE_SSE_8
@@ -962,7 +977,7 @@ static inline uint32_t vpx_mse8x(const uint8_t *src_ptr, int src_stride,
     MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
     "bnez       %[tmp0],    1b                                  \n\t"
 
-    "dsrl       %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"
+    "ssrld      %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"
     "paddw      %[ftmp9],   %[ftmp9],       %[ftmp8]            \n\t"
     "swc1       %[ftmp9],   0x00(%[sse])                        \n\t"
     : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
@@ -978,6 +993,7 @@ static inline uint32_t vpx_mse8x(const uint8_t *src_ptr, int src_stride,
       [high]"r"(&high), [sse]"r"(sse)
     : "memory"
   );
+  /* clang-format on */
 
   return *sse;
 }
@@ -1021,22 +1037,39 @@ static inline void var_filter_block2d_bil_16x(const uint8_t *src_ptr,
   uint8_t *temp2_ptr = temp2;
   mips_reg l_counter = counter;
   double ftmp[15];
+  double ff_ph_40, mask;
+  double filter_x0, filter_x1, filter_y0, filter_y1;
   mips_reg tmp[2];
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
-  DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
+  uint64_t x0, x1, y0, y1, all;
 
   const uint8_t *filter_x = bilinear_filters[x_offset];
   const uint8_t *filter_y = bilinear_filters[y_offset];
+  x0 = (uint64_t)filter_x[0];
+  x1 = (uint64_t)filter_x[1];
+  y0 = (uint64_t)filter_y[0];
+  y1 = (uint64_t)filter_y[1];
+  all = x0 | x1 << 8 | y0 << 16 | y1 << 24;
 
+  /* clang-format off */
   __asm__ volatile (
-    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    MMI_MTC1(%[all], %[ftmp14])
+    "punpcklbh  %[ftmp14],  %[ftmp14],      %[ftmp0]            \n\t"
+    "pshufh     %[filter_x0], %[ftmp14],    %[ftmp0]            \n\t"
+    MMI_LI(%[tmp0], 0x10)
+    MMI_MTC1(%[tmp0], %[mask])
+    "ssrld      %[ftmp14],  %[ftmp14],      %[mask]             \n\t"
+    "pshufh     %[filter_x1], %[ftmp14],    %[ftmp0]            \n\t"
+    "ssrld      %[ftmp14],  %[ftmp14],      %[mask]             \n\t"
+    "pshufh     %[filter_y0], %[ftmp14],    %[ftmp0]            \n\t"
+    "ssrld      %[ftmp14],  %[ftmp14],      %[mask]             \n\t"
+    "pshufh     %[filter_y1], %[ftmp14],    %[ftmp0]            \n\t"
     MMI_LI(%[tmp0], 0x07)
     MMI_MTC1(%[tmp0], %[ftmp14])
-    "pshufh     %[filter_x0], %[filter_x0], %[ftmp0]            \n\t"
-    "pshufh     %[filter_x1], %[filter_x1], %[ftmp0]            \n\t"
-    "pshufh     %[filter_y0], %[filter_y0], %[ftmp0]            \n\t"
-    "pshufh     %[filter_y1], %[filter_y1], %[ftmp0]            \n\t"
-
+    MMI_LI(%[tmp0], 0x0040004000400040)
+    MMI_MTC1(%[tmp0], %[ff_ph_40])
+    MMI_LI(%[tmp0], 0x00ff00ff00ff00ff)
+    MMI_MTC1(%[tmp0], %[mask])
     // fdata3: fdata3[0] ~ fdata3[15]
     VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A
 
@@ -1072,15 +1105,13 @@ static inline void var_filter_block2d_bil_16x(const uint8_t *src_ptr,
       [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
       [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]),
       [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr),
-      [counter]"+&r"(l_counter)
-    : [filter_x0] "f"((uint64_t)filter_x[0]),
-      [filter_x1] "f"((uint64_t)filter_x[1]),
-      [filter_y0] "f"((uint64_t)filter_y[0]),
-      [filter_y1] "f"((uint64_t)filter_y[1]),
-      [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40),
-      [mask] "f"(mask)
+      [counter]"+&r"(l_counter), [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask),
+      [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1),
+      [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1)
+    : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all)
     : "memory"
   );
+  /* clang-format on */
 }
 
 #define SUBPIX_VAR16XN(H)                                                      \
@@ -1105,19 +1136,38 @@ static inline void var_filter_block2d_bil_8x(const uint8_t *src_ptr,
   mips_reg l_counter = counter;
   double ftmp[15];
   mips_reg tmp[2];
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
-  DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
+  double ff_ph_40, mask;
+  uint64_t x0, x1, y0, y1, all;
+  double filter_x0, filter_x1, filter_y0, filter_y1;
   const uint8_t *filter_x = bilinear_filters[x_offset];
   const uint8_t *filter_y = bilinear_filters[y_offset];
+  x0 = (uint64_t)filter_x[0];
+  x1 = (uint64_t)filter_x[1];
+  y0 = (uint64_t)filter_y[0];
+  y1 = (uint64_t)filter_y[1];
+  all = x0 | x1 << 8 | y0 << 16 | y1 << 24;
 
+  /* clang-format off */
   __asm__ volatile (
-    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    MMI_MTC1(%[all], %[ftmp14])
+    "punpcklbh  %[ftmp14],  %[ftmp14],      %[ftmp0]            \n\t"
+    "pshufh     %[filter_x0], %[ftmp14],    %[ftmp0]            \n\t"
+    MMI_LI(%[tmp0], 0x10)
+    MMI_MTC1(%[tmp0], %[mask])
+    "ssrld      %[ftmp14],  %[ftmp14],      %[mask]             \n\t"
+    "pshufh     %[filter_x1], %[ftmp14],    %[ftmp0]            \n\t"
+    "ssrld      %[ftmp14],  %[ftmp14],      %[mask]             \n\t"
+    "pshufh     %[filter_y0], %[ftmp14],    %[ftmp0]            \n\t"
+    "ssrld      %[ftmp14],  %[ftmp14],      %[mask]             \n\t"
+    "pshufh     %[filter_y1], %[ftmp14],    %[ftmp0]            \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
     MMI_LI(%[tmp0], 0x07)
     MMI_MTC1(%[tmp0], %[ftmp14])
-    "pshufh     %[filter_x0], %[filter_x0], %[ftmp0]            \n\t"
-    "pshufh     %[filter_x1], %[filter_x1], %[ftmp0]            \n\t"
-    "pshufh     %[filter_y0], %[filter_y0], %[ftmp0]            \n\t"
-    "pshufh     %[filter_y1], %[filter_y1], %[ftmp0]            \n\t"
+    MMI_LI(%[tmp0], 0x0040004000400040)
+    MMI_MTC1(%[tmp0], %[ff_ph_40])
+    MMI_LI(%[tmp0], 0x00ff00ff00ff00ff)
+    MMI_MTC1(%[tmp0], %[mask])
 
     // fdata3: fdata3[0] ~ fdata3[7]
     VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A
@@ -1154,15 +1204,13 @@ static inline void var_filter_block2d_bil_8x(const uint8_t *src_ptr,
       [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
       [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]),
       [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr),
-      [counter]"+&r"(l_counter)
-    : [filter_x0] "f"((uint64_t)filter_x[0]),
-      [filter_x1] "f"((uint64_t)filter_x[1]),
-      [filter_y0] "f"((uint64_t)filter_y[0]),
-      [filter_y1] "f"((uint64_t)filter_y[1]),
-      [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40),
-      [mask] "f"(mask)
+      [counter]"+&r"(l_counter), [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask),
+      [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1),
+      [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1)
+    : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all)
     : "memory"
   );
+  /* clang-format on */
 }
 
 #define SUBPIX_VAR8XN(H)                                                      \
@@ -1188,19 +1236,38 @@ static inline void var_filter_block2d_bil_4x(const uint8_t *src_ptr,
   mips_reg l_counter = counter;
   double ftmp[7];
   mips_reg tmp[2];
-  DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
-  DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
+  double ff_ph_40, mask;
+  uint64_t x0, x1, y0, y1, all;
+  double filter_x0, filter_x1, filter_y0, filter_y1;
   const uint8_t *filter_x = bilinear_filters[x_offset];
   const uint8_t *filter_y = bilinear_filters[y_offset];
+  x0 = (uint64_t)filter_x[0];
+  x1 = (uint64_t)filter_x[1];
+  y0 = (uint64_t)filter_y[0];
+  y1 = (uint64_t)filter_y[1];
+  all = x0 | x1 << 8 | y0 << 16 | y1 << 24;
 
+  /* clang-format off */
   __asm__ volatile (
-    "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    MMI_MTC1(%[all], %[ftmp6])
+    "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+    "pshufh     %[filter_x0], %[ftmp6],     %[ftmp0]            \n\t"
+    MMI_LI(%[tmp0], 0x10)
+    MMI_MTC1(%[tmp0], %[mask])
+    "ssrld      %[ftmp6],   %[ftmp6],       %[mask]             \n\t"
+    "pshufh     %[filter_x1], %[ftmp6],     %[ftmp0]            \n\t"
+    "ssrld      %[ftmp6],   %[ftmp6],       %[mask]             \n\t"
+    "pshufh     %[filter_y0], %[ftmp6],     %[ftmp0]            \n\t"
+    "ssrld      %[ftmp6],   %[ftmp6],       %[mask]             \n\t"
+    "pshufh     %[filter_y1], %[ftmp6],     %[ftmp0]            \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
     MMI_LI(%[tmp0], 0x07)
     MMI_MTC1(%[tmp0], %[ftmp6])
-    "pshufh     %[filter_x0], %[filter_x0], %[ftmp0]            \n\t"
-    "pshufh     %[filter_x1], %[filter_x1], %[ftmp0]            \n\t"
-    "pshufh     %[filter_y0], %[filter_y0], %[ftmp0]            \n\t"
-    "pshufh     %[filter_y1], %[filter_y1], %[ftmp0]            \n\t"
+    MMI_LI(%[tmp0], 0x0040004000400040)
+    MMI_MTC1(%[tmp0], %[ff_ph_40])
+    MMI_LI(%[tmp0], 0x00ff00ff00ff00ff)
+    MMI_MTC1(%[tmp0], %[mask])
     // fdata3: fdata3[0] ~ fdata3[3]
     VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A
 
@@ -1232,15 +1299,14 @@ static inline void var_filter_block2d_bil_4x(const uint8_t *src_ptr,
     : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
       [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
       [ftmp6] "=&f"(ftmp[6]), [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr),
-      [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter)
-    : [filter_x0] "f"((uint64_t)filter_x[0]),
-      [filter_x1] "f"((uint64_t)filter_x[1]),
-      [filter_y0] "f"((uint64_t)filter_y[0]),
-      [filter_y1] "f"((uint64_t)filter_y[1]),
-      [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40),
-      [mask] "f"(mask)
+      [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter),
+      [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask),
+      [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1),
+      [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1)
+    : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all)
     : "memory"
   );
+  /* clang-format on */
 }
 
 #define SUBPIX_VAR4XN(H)                                                      \
diff --git a/vpx_dsp/mips/vpx_convolve8_mmi.c b/vpx_dsp/mips/vpx_convolve8_mmi.c
index ba9ceb8665..cb7bca5589 100644
--- a/vpx_dsp/mips/vpx_convolve8_mmi.c
+++ b/vpx_dsp/mips/vpx_convolve8_mmi.c
@@ -105,7 +105,7 @@ static void convolve_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride,
   /* clang-format off */
   __asm__ volatile(
     "move       %[tmp1],    %[width]                   \n\t"
-    "xor        %[ftmp0],   %[ftmp0],    %[ftmp0]      \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],    %[ftmp0]      \n\t"
     "gsldlc1    %[filter1], 0x03(%[filter])            \n\t"
     "gsldrc1    %[filter1], 0x00(%[filter])            \n\t"
     "gsldlc1    %[filter2], 0x0b(%[filter])            \n\t"
@@ -178,7 +178,7 @@ static void convolve_vert_mmi(const uint8_t *src, ptrdiff_t src_stride,
   (void)y_step_q4;
 
   __asm__ volatile(
-    "xor        %[ftmp0],    %[ftmp0],   %[ftmp0]      \n\t"
+    "pxor       %[ftmp0],    %[ftmp0],   %[ftmp0]      \n\t"
     "gsldlc1    %[ftmp4],    0x03(%[filter])           \n\t"
     "gsldrc1    %[ftmp4],    0x00(%[filter])           \n\t"
     "gsldlc1    %[ftmp5],    0x0b(%[filter])           \n\t"
@@ -271,7 +271,7 @@ static void convolve_avg_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride,
 
   __asm__ volatile(
     "move       %[tmp1],    %[width]                   \n\t"
-    "xor        %[ftmp0],   %[ftmp0],    %[ftmp0]      \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],    %[ftmp0]      \n\t"
     "gsldlc1    %[filter1], 0x03(%[filter])            \n\t"
     "gsldrc1    %[filter1], 0x00(%[filter])            \n\t"
     "gsldlc1    %[filter2], 0x0b(%[filter])            \n\t"
@@ -354,7 +354,7 @@ static void convolve_avg_vert_mmi(const uint8_t *src, ptrdiff_t src_stride,
   (void)y_step_q4;
 
   __asm__ volatile(
-    "xor        %[ftmp0],    %[ftmp0],   %[ftmp0]      \n\t"
+    "pxor       %[ftmp0],    %[ftmp0],   %[ftmp0]      \n\t"
     "gsldlc1    %[ftmp4],    0x03(%[filter])           \n\t"
     "gsldrc1    %[ftmp4],    0x00(%[filter])           \n\t"
     "gsldlc1    %[ftmp5],    0x0b(%[filter])           \n\t"
@@ -467,7 +467,7 @@ void vpx_convolve_avg_mmi(const uint8_t *src, ptrdiff_t src_stride,
 
     __asm__ volatile(
       "move       %[tmp1],    %[width]                  \n\t"
-      "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]      \n\t"
+      "pxor       %[ftmp0],   %[ftmp0],   %[ftmp0]      \n\t"
       "li         %[tmp0],    0x10001                   \n\t"
       MMI_MTC1(%[tmp0],    %[ftmp3])
       "punpcklhw  %[ftmp3],   %[ftmp3],   %[ftmp3]      \n\t"
diff --git a/vpx_dsp/ppc/quantize_vsx.c b/vpx_dsp/ppc/quantize_vsx.c
index d85e63bd14..ab71f6e235 100644
--- a/vpx_dsp/ppc/quantize_vsx.c
+++ b/vpx_dsp/ppc/quantize_vsx.c
@@ -78,11 +78,10 @@ static INLINE int16x8_t dequantize_coeff_32(int16x8_t qcoeff,
   return (int16x8_t)vec_perm(dqcoeffe, dqcoeffo, vec_perm_odd_even_pack);
 }
 
-static INLINE int16x8_t nonzero_scanindex(int16x8_t qcoeff, bool16x8_t mask,
+static INLINE int16x8_t nonzero_scanindex(int16x8_t qcoeff,
                                           const int16_t *iscan_ptr, int index) {
   int16x8_t scan = vec_vsx_ld(index, iscan_ptr);
   bool16x8_t zero_coeff = vec_cmpeq(qcoeff, vec_zeros_s16);
-  scan = vec_sub(scan, mask);
   return vec_andc(scan, zero_coeff);
 }
 
@@ -95,8 +94,8 @@ static INLINE int16x8_t vec_max_across(int16x8_t a) {
 }
 
 void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                        int skip_block, const int16_t *zbin_ptr,
-                        const int16_t *round_ptr, const int16_t *quant_ptr,
+                        const int16_t *zbin_ptr, const int16_t *round_ptr,
+                        const int16_t *quant_ptr,
                         const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                         uint16_t *eob_ptr, const int16_t *scan_ptr,
@@ -122,8 +121,6 @@ void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   zero_mask1 = vec_cmpge(coeff1_abs, zbin);
 
   (void)scan_ptr;
-  (void)skip_block;
-  assert(!skip_block);
 
   qcoeff0 =
       quantize_coeff(coeff0, coeff0_abs, round, quant, quant_shift, zero_mask0);
@@ -141,8 +138,8 @@ void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16);
   vec_vsx_st(dqcoeff1, 16, dqcoeff_ptr);
 
-  eob = vec_max(nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, 0),
-                nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, 16));
+  eob = vec_max(nonzero_scanindex(qcoeff0, iscan_ptr, 0),
+                nonzero_scanindex(qcoeff1, iscan_ptr, 16));
 
   if (n_coeffs > 16) {
     int index = 16;
@@ -179,10 +176,9 @@ void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
       vec_vsx_st(dqcoeff1, off1, dqcoeff_ptr);
       vec_vsx_st(dqcoeff2, off2, dqcoeff_ptr);
 
-      eob =
-          vec_max(eob, nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, off0));
-      eob2 = vec_max(nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, off1),
-                     nonzero_scanindex(qcoeff2, zero_mask2, iscan_ptr, off2));
+      eob = vec_max(eob, nonzero_scanindex(qcoeff0, iscan_ptr, off0));
+      eob2 = vec_max(nonzero_scanindex(qcoeff1, iscan_ptr, off1),
+                     nonzero_scanindex(qcoeff2, iscan_ptr, off2));
       eob = vec_max(eob, eob2);
 
       index += 24;
@@ -196,12 +192,14 @@ void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   *eob_ptr = eob[0];
 }
 
-void vpx_quantize_b_32x32_vsx(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
-    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
-    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+void vpx_quantize_b_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                              const int16_t *zbin_ptr, const int16_t *round_ptr,
+                              const int16_t *quant_ptr,
+                              const int16_t *quant_shift_ptr,
+                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                              const int16_t *scan_ptr,
+                              const int16_t *iscan_ptr) {
   // In stage 1, we quantize 16 coeffs (DC + 15 AC)
   // In stage 2, we loop 42 times and quantize 24 coeffs per iteration
   // (32 * 32 - 16) / 24 = 42
@@ -227,9 +225,7 @@ void vpx_quantize_b_32x32_vsx(
   int16x8_t coeff1_abs = vec_abs(coeff1);
 
   (void)scan_ptr;
-  (void)skip_block;
   (void)n_coeffs;
-  assert(!skip_block);
 
   // 32x32 quantization requires that zbin and round be divided by 2
   zbin = vec_sra(vec_add(zbin, vec_ones_s16), vec_ones_u16);
@@ -254,8 +250,8 @@ void vpx_quantize_b_32x32_vsx(
   dequant = vec_splat(dequant, 1);  // remove DC from dequant
   vec_vsx_st(dequantize_coeff_32(qcoeff1, dequant), 16, dqcoeff_ptr);
 
-  eob = vec_max(nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, 0),
-                nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, 16));
+  eob = vec_max(nonzero_scanindex(qcoeff0, iscan_ptr, 0),
+                nonzero_scanindex(qcoeff1, iscan_ptr, 16));
 
   do {
     int16x8_t coeff2, coeff2_abs, qcoeff2, eob2;
@@ -288,9 +284,9 @@ void vpx_quantize_b_32x32_vsx(
     vec_vsx_st(dequantize_coeff_32(qcoeff1, dequant), off1, dqcoeff_ptr);
     vec_vsx_st(dequantize_coeff_32(qcoeff2, dequant), off2, dqcoeff_ptr);
 
-    eob = vec_max(eob, nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, off0));
-    eob2 = vec_max(nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, off1),
-                   nonzero_scanindex(qcoeff2, zero_mask2, iscan_ptr, off2));
+    eob = vec_max(eob, nonzero_scanindex(qcoeff0, iscan_ptr, off0));
+    eob2 = vec_max(nonzero_scanindex(qcoeff1, iscan_ptr, off1),
+                   nonzero_scanindex(qcoeff2, iscan_ptr, off2));
     eob = vec_max(eob, eob2);
 
     // 24 int16_t is 48 bytes
diff --git a/vpx_dsp/ppc/variance_vsx.c b/vpx_dsp/ppc/variance_vsx.c
index be9614a358..6c6bc9a301 100644
--- a/vpx_dsp/ppc/variance_vsx.c
+++ b/vpx_dsp/ppc/variance_vsx.c
@@ -225,7 +225,7 @@ static INLINE void variance(const uint8_t *src_ptr, int src_stride,
   }
 
 /* Identical to the variance call except it does not calculate the
- * sse - sum^2 / w*h and returns sse in addtion to modifying the passed in
+ * sse - sum^2 / w*h and returns sse in addition to modifying the passed in
  * variable.
  */
 #define MSE(W, H)                                                         \
diff --git a/vpx_dsp/psnr.c b/vpx_dsp/psnr.c
index 48bac04508..4ee4130a21 100644
--- a/vpx_dsp/psnr.c
+++ b/vpx_dsp/psnr.c
@@ -26,57 +26,44 @@ double vpx_sse_to_psnr(double samples, double peak, double sse) {
 /* TODO(yaowu): The block_variance calls the unoptimized versions of variance()
  * and highbd_8_variance(). It should not.
  */
-static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b,
-                             int b_stride, int w, int h, unsigned int *sse,
-                             int *sum) {
+static int64_t encoder_sse(const uint8_t *a, int a_stride, const uint8_t *b,
+                           int b_stride, int w, int h) {
   int i, j;
-
-  *sum = 0;
-  *sse = 0;
+  int64_t sse = 0;
 
   for (i = 0; i < h; i++) {
     for (j = 0; j < w; j++) {
       const int diff = a[j] - b[j];
-      *sum += diff;
-      *sse += diff * diff;
+      sse += diff * diff;
     }
 
     a += a_stride;
     b += b_stride;
   }
+
+  return sse;
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static void encoder_highbd_variance64(const uint8_t *a8, int a_stride,
-                                      const uint8_t *b8, int b_stride, int w,
-                                      int h, uint64_t *sse, int64_t *sum) {
+static int64_t encoder_highbd_sse(const uint8_t *a8, int a_stride,
+                                  const uint8_t *b8, int b_stride, int w,
+                                  int h) {
   int i, j;
+  int64_t sse = 0;
 
-  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-  *sum = 0;
-  *sse = 0;
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
 
   for (i = 0; i < h; i++) {
     for (j = 0; j < w; j++) {
       const int diff = a[j] - b[j];
-      *sum += diff;
-      *sse += diff * diff;
+      sse += diff * diff;
     }
     a += a_stride;
     b += b_stride;
   }
-}
 
-static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride,
-                                      const uint8_t *b8, int b_stride, int w,
-                                      int h, unsigned int *sse, int *sum) {
-  uint64_t sse_long = 0;
-  int64_t sum_long = 0;
-  encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long,
-                            &sum_long);
-  *sse = (unsigned int)sse_long;
-  *sum = (int)sum_long;
+  return sse;
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
@@ -85,29 +72,24 @@ static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
   const int dw = width % 16;
   const int dh = height % 16;
   int64_t total_sse = 0;
-  unsigned int sse = 0;
-  int sum = 0;
   int x, y;
 
   if (dw > 0) {
-    encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride, dw,
-                     height, &sse, &sum);
-    total_sse += sse;
+    total_sse += encoder_sse(&a[width - dw], a_stride, &b[width - dw], b_stride,
+                             dw, height);
   }
 
   if (dh > 0) {
-    encoder_variance(&a[(height - dh) * a_stride], a_stride,
-                     &b[(height - dh) * b_stride], b_stride, width - dw, dh,
-                     &sse, &sum);
-    total_sse += sse;
+    total_sse +=
+        encoder_sse(&a[(height - dh) * a_stride], a_stride,
+                    &b[(height - dh) * b_stride], b_stride, width - dw, dh);
   }
 
   for (y = 0; y < height / 16; ++y) {
     const uint8_t *pa = a;
     const uint8_t *pb = b;
     for (x = 0; x < width / 16; ++x) {
-      vpx_mse16x16(pa, a_stride, pb, b_stride, &sse);
-      total_sse += sse;
+      total_sse += vpx_sse(pa, a_stride, pb, b_stride, 16, 16);
 
       pa += 16;
       pb += 16;
@@ -146,25 +128,20 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
   int x, y;
   const int dw = width % 16;
   const int dh = height % 16;
-  unsigned int sse = 0;
-  int sum = 0;
   if (dw > 0) {
-    encoder_highbd_8_variance(&a[width - dw], a_stride, &b[width - dw],
-                              b_stride, dw, height, &sse, &sum);
-    total_sse += sse;
+    total_sse += encoder_highbd_sse(&a[width - dw], a_stride, &b[width - dw],
+                                    b_stride, dw, height);
   }
   if (dh > 0) {
-    encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride,
-                              &b[(height - dh) * b_stride], b_stride,
-                              width - dw, dh, &sse, &sum);
-    total_sse += sse;
+    total_sse += encoder_highbd_sse(&a[(height - dh) * a_stride], a_stride,
+                                    &b[(height - dh) * b_stride], b_stride,
+                                    width - dw, dh);
   }
   for (y = 0; y < height / 16; ++y) {
     const uint8_t *pa = a;
     const uint8_t *pb = b;
     for (x = 0; x < width / 16; ++x) {
-      vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse);
-      total_sse += sse;
+      total_sse += vpx_highbd_sse(pa, a_stride, pb, b_stride, 16, 16);
       pa += 16;
       pb += 16;
     }
diff --git a/vpx_dsp/psnr.h b/vpx_dsp/psnr.h
index 9ebb64dd52..7c57aa429f 100644
--- a/vpx_dsp/psnr.h
+++ b/vpx_dsp/psnr.h
@@ -26,7 +26,7 @@ typedef struct vpx_psnr_pkt PSNR_STATS;
 
 /*!\brief Converts SSE to PSNR
  *
- * Converts sum of squared errros (SSE) to peak signal-to-noise ratio (PNSR).
+ * Converts sum of squared errros (SSE) to peak signal-to-noise ratio (PSNR).
  *
  * \param[in]    samples       Number of samples
  * \param[in]    peak          Max sample value
diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c
index 61818f692e..fac9136f8c 100644
--- a/vpx_dsp/quantize.c
+++ b/vpx_dsp/quantize.c
@@ -14,8 +14,10 @@
 #include "vpx_dsp/quantize.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
 
-void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
+void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
                      const int16_t *round_ptr, const int16_t quant,
                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                      const int16_t dequant, uint16_t *eob_ptr) {
@@ -28,28 +30,26 @@ void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
-  if (!skip_block) {
-    tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
-    tmp = (tmp * quant) >> 16;
-    qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
-    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant;
-    if (tmp) eob = 0;
-  }
+  tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+  tmp = (tmp * quant) >> 16;
+  qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+  dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant;
+  if (tmp) eob = 0;
+
   *eob_ptr = eob + 1;
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
-                            int skip_block, const int16_t *round_ptr,
-                            const int16_t quant, tran_low_t *qcoeff_ptr,
-                            tran_low_t *dqcoeff_ptr, const int16_t dequant,
-                            uint16_t *eob_ptr) {
+                            const int16_t *round_ptr, const int16_t quant,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const int16_t dequant, uint16_t *eob_ptr) {
   int eob = -1;
 
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
-  if (!skip_block) {
+  {
     const int coeff = coeff_ptr[0];
     const int coeff_sign = (coeff >> 31);
     const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
@@ -59,11 +59,12 @@ void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
     dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant;
     if (abs_qcoeff) eob = 0;
   }
+
   *eob_ptr = eob + 1;
 }
 #endif
 
-void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr,
                            const int16_t *round_ptr, const int16_t quant,
                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                            const int16_t dequant, uint16_t *eob_ptr) {
@@ -77,19 +78,18 @@ void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
-  if (!skip_block) {
-    tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
-                INT16_MIN, INT16_MAX);
-    tmp = (tmp * quant) >> 15;
-    qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
-    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / 2;
-    if (tmp) eob = 0;
-  }
+  tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1), INT16_MIN,
+              INT16_MAX);
+  tmp = (tmp * quant) >> 15;
+  qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+  dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / 2;
+  if (tmp) eob = 0;
+
   *eob_ptr = eob + 1;
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
                                   const int16_t *round_ptr, const int16_t quant,
                                   tran_low_t *qcoeff_ptr,
                                   tran_low_t *dqcoeff_ptr,
@@ -100,7 +100,7 @@ void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
-  if (!skip_block) {
+  {
     const int coeff = coeff_ptr[0];
     const int coeff_sign = (coeff >> 31);
     const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
@@ -110,23 +110,23 @@ void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
     dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant / 2;
     if (abs_qcoeff) eob = 0;
   }
+
   *eob_ptr = eob + 1;
 }
 #endif
 
 void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                      int skip_block, const int16_t *zbin_ptr,
-                      const int16_t *round_ptr, const int16_t *quant_ptr,
-                      const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-                      tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                      uint16_t *eob_ptr, const int16_t *scan,
-                      const int16_t *iscan) {
+                      const struct macroblock_plane *const mb_plane,
+                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                      const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                      const struct ScanOrder *const scan_order) {
   int i, non_zero_count = (int)n_coeffs, eob = -1;
-  const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
+  const int zbins[2] = { mb_plane->zbin[0], mb_plane->zbin[1] };
   const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
-  (void)iscan;
-  (void)skip_block;
-  assert(!skip_block);
+  const int16_t *round_ptr = mb_plane->round;
+  const int16_t *quant_ptr = mb_plane->quant;
+  const int16_t *quant_shift_ptr = mb_plane->quant_shift;
+  const int16_t *scan = scan_order->scan;
 
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
@@ -166,18 +166,17 @@ void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                             int skip_block, const int16_t *zbin_ptr,
-                             const int16_t *round_ptr, const int16_t *quant_ptr,
-                             const int16_t *quant_shift_ptr,
+                             const struct macroblock_plane *const mb_plane,
                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                             const int16_t *scan, const int16_t *iscan) {
+                             const struct ScanOrder *const scan_order) {
   int i, non_zero_count = (int)n_coeffs, eob = -1;
-  const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
+  const int zbins[2] = { mb_plane->zbin[0], mb_plane->zbin[1] };
   const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
-  (void)iscan;
-  (void)skip_block;
-  assert(!skip_block);
+  const int16_t *round_ptr = mb_plane->round;
+  const int16_t *quant_ptr = mb_plane->quant;
+  const int16_t *quant_shift_ptr = mb_plane->quant_shift;
+  const int16_t *scan = scan_order->scan;
 
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
@@ -214,23 +213,23 @@ void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
 }
 #endif
 
-void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                            int skip_block, const int16_t *zbin_ptr,
-                            const int16_t *round_ptr, const int16_t *quant_ptr,
-                            const int16_t *quant_shift_ptr,
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
+                            const struct macroblock_plane *const mb_plane,
                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                             const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                            const int16_t *scan, const int16_t *iscan) {
-  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
-                         ROUND_POWER_OF_TWO(zbin_ptr[1], 1) };
+                            const struct ScanOrder *const scan_order) {
+  const int n_coeffs = 32 * 32;
+  const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1),
+                         ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) };
   const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+  const int16_t *round_ptr = mb_plane->round;
+  const int16_t *quant_ptr = mb_plane->quant;
+  const int16_t *quant_shift_ptr = mb_plane->quant_shift;
+  const int16_t *scan = scan_order->scan;
 
   int idx = 0;
-  int idx_arr[1024];
+  int idx_arr[32 * 32 /* n_coeffs */];
   int i, eob = -1;
-  (void)iscan;
-  (void)skip_block;
-  assert(!skip_block);
 
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
@@ -277,21 +276,21 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void vpx_highbd_quantize_b_32x32_c(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
-    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
-    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan, const int16_t *iscan) {
-  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
-                         ROUND_POWER_OF_TWO(zbin_ptr[1], 1) };
+    const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+    uint16_t *eob_ptr, const struct ScanOrder *const scan_order) {
+  const intptr_t n_coeffs = 32 * 32;
+  const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1),
+                         ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) };
   const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+  const int16_t *round_ptr = mb_plane->round;
+  const int16_t *quant_ptr = mb_plane->quant;
+  const int16_t *quant_shift_ptr = mb_plane->quant_shift;
+  const int16_t *scan = scan_order->scan;
 
   int idx = 0;
   int idx_arr[1024];
   int i, eob = -1;
-  (void)iscan;
-  (void)skip_block;
-  assert(!skip_block);
 
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
diff --git a/vpx_dsp/quantize.h b/vpx_dsp/quantize.h
index 7cac140e9d..8e138445e2 100644
--- a/vpx_dsp/quantize.h
+++ b/vpx_dsp/quantize.h
@@ -18,22 +18,21 @@
 extern "C" {
 #endif
 
-void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
+void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
                      const int16_t *round_ptr, const int16_t quant,
                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                      const int16_t dequant, uint16_t *eob_ptr);
-void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr,
                            const int16_t *round_ptr, const int16_t quant,
                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                            const int16_t dequant, uint16_t *eob_ptr);
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
-                            int skip_block, const int16_t *round_ptr,
-                            const int16_t quant, tran_low_t *qcoeff_ptr,
-                            tran_low_t *dqcoeff_ptr, const int16_t dequant,
-                            uint16_t *eob_ptr);
-void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+                            const int16_t *round_ptr, const int16_t quant,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const int16_t dequant, uint16_t *eob_ptr);
+void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
                                   const int16_t *round_ptr, const int16_t quant,
                                   tran_low_t *qcoeff_ptr,
                                   tran_low_t *dqcoeff_ptr,
diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c
index 769322019e..2a4c81d588 100644
--- a/vpx_dsp/sad.c
+++ b/vpx_dsp/sad.c
@@ -40,32 +40,35 @@ static INLINE unsigned int sad(const uint8_t *src_ptr, int src_stride,
   unsigned int vpx_sad##m##x##n##_avg_c(                                      \
       const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
       int ref_stride, const uint8_t *second_pred) {                           \
-    DECLARE_ALIGNED(16, uint8_t, comp_pred[m * n]);                           \
+    DECLARE_ALIGNED(32, uint8_t, comp_pred[m * n]);                           \
     vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref_ptr, ref_stride);   \
     return sad(src_ptr, src_stride, comp_pred, m, m, n);                      \
+  }                                                                           \
+  unsigned int vpx_sad_skip_##m##x##n##_c(                                    \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
+      int ref_stride) {                                                       \
+    return 2 * sad(src_ptr, 2 * src_stride, ref_ptr, 2 * ref_stride, (m),     \
+                   (n / 2));                                                  \
   }
 
-// depending on call sites, pass **ref_array to avoid & in subsequent call and
-// de-dup with 4D below.
-#define sadMxNxK(m, n, k)                                                     \
-  void vpx_sad##m##x##n##x##k##_c(const uint8_t *src_ptr, int src_stride,     \
-                                  const uint8_t *ref_ptr, int ref_stride,     \
-                                  uint32_t *sad_array) {                      \
-    int i;                                                                    \
-    for (i = 0; i < k; ++i)                                                   \
-      sad_array[i] =                                                          \
-          vpx_sad##m##x##n##_c(src_ptr, src_stride, &ref_ptr[i], ref_stride); \
-  }
-
-// This appears to be equivalent to the above when k == 4 and refs is const
+// Compare |src_ptr| to 4 distinct references in |ref_array[4]|
 #define sadMxNx4D(m, n)                                                        \
   void vpx_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride,         \
-                               const uint8_t *const ref_array[],               \
-                               int ref_stride, uint32_t *sad_array) {          \
+                               const uint8_t *const ref_array[4],              \
+                               int ref_stride, uint32_t sad_array[4]) {        \
     int i;                                                                     \
     for (i = 0; i < 4; ++i)                                                    \
       sad_array[i] =                                                           \
           vpx_sad##m##x##n##_c(src_ptr, src_stride, ref_array[i], ref_stride); \
+  }                                                                            \
+  void vpx_sad_skip_##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride,   \
+                                     const uint8_t *const ref_array[4],        \
+                                     int ref_stride, uint32_t sad_array[4]) {  \
+    int i;                                                                     \
+    for (i = 0; i < 4; ++i) {                                                  \
+      sad_array[i] = 2 * sad(src_ptr, 2 * src_stride, ref_array[i],            \
+                             2 * ref_stride, (m), (n / 2));                    \
+    }                                                                          \
   }
 
 /* clang-format off */
@@ -83,7 +86,6 @@ sadMxNx4D(32, 64)
 
 // 32x32
 sadMxN(32, 32)
-sadMxNxK(32, 32, 8)
 sadMxNx4D(32, 32)
 
 // 32x16
@@ -96,26 +98,18 @@ sadMxNx4D(16, 32)
 
 // 16x16
 sadMxN(16, 16)
-sadMxNxK(16, 16, 3)
-sadMxNxK(16, 16, 8)
 sadMxNx4D(16, 16)
 
 // 16x8
 sadMxN(16, 8)
-sadMxNxK(16, 8, 3)
-sadMxNxK(16, 8, 8)
 sadMxNx4D(16, 8)
 
 // 8x16
 sadMxN(8, 16)
-sadMxNxK(8, 16, 3)
-sadMxNxK(8, 16, 8)
 sadMxNx4D(8, 16)
 
 // 8x8
 sadMxN(8, 8)
-sadMxNxK(8, 8, 3)
-sadMxNxK(8, 8, 8)
 sadMxNx4D(8, 8)
 
 // 8x4
@@ -128,8 +122,6 @@ sadMxNx4D(4, 8)
 
 // 4x4
 sadMxN(4, 4)
-sadMxNxK(4, 4, 3)
-sadMxNxK(4, 4, 8)
 sadMxNx4D(4, 4)
 /* clang-format on */
 
@@ -179,17 +171,32 @@ static INLINE unsigned int highbd_sadb(const uint8_t *src8_ptr, int src_stride,
     vpx_highbd_comp_avg_pred_c(comp_pred, CONVERT_TO_SHORTPTR(second_pred), m, \
                                n, CONVERT_TO_SHORTPTR(ref_ptr), ref_stride);   \
     return highbd_sadb(src_ptr, src_stride, comp_pred, m, m, n);               \
+  }                                                                            \
+  unsigned int vpx_highbd_sad_skip_##m##x##n##_c(                              \
+      const uint8_t *src, int src_stride, const uint8_t *ref,                  \
+      int ref_stride) {                                                        \
+    return 2 *                                                                 \
+           highbd_sad(src, 2 * src_stride, ref, 2 * ref_stride, (m), (n / 2)); \
   }
 
-#define highbd_sadMxNx4D(m, n)                                                \
-  void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \
-                                      const uint8_t *const ref_array[],       \
-                                      int ref_stride, uint32_t *sad_array) {  \
-    int i;                                                                    \
-    for (i = 0; i < 4; ++i) {                                                 \
-      sad_array[i] = vpx_highbd_sad##m##x##n##_c(src_ptr, src_stride,         \
-                                                 ref_array[i], ref_stride);   \
-    }                                                                         \
+#define highbd_sadMxNx4D(m, n)                                                 \
+  void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride,  \
+                                      const uint8_t *const ref_array[4],       \
+                                      int ref_stride, uint32_t sad_array[4]) { \
+    int i;                                                                     \
+    for (i = 0; i < 4; ++i) {                                                  \
+      sad_array[i] = vpx_highbd_sad##m##x##n##_c(src_ptr, src_stride,          \
+                                                 ref_array[i], ref_stride);    \
+    }                                                                          \
+  }                                                                            \
+  void vpx_highbd_sad_skip_##m##x##n##x4d_c(                                   \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4],   \
+      int ref_stride, uint32_t sad_array[4]) {                                 \
+    int i;                                                                     \
+    for (i = 0; i < 4; ++i) {                                                  \
+      sad_array[i] = vpx_highbd_sad_skip_##m##x##n##_c(                        \
+          src, src_stride, ref_array[i], ref_stride);                          \
+    }                                                                          \
   }
 
 /* clang-format off */
diff --git a/vpx_dsp/sse.c b/vpx_dsp/sse.c
new file mode 100644
index 0000000000..c9d751859d
--- /dev/null
+++ b/vpx_dsp/sse.c
@@ -0,0 +1,59 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * Sum the square of the difference between every corresponding element of the
+ * buffers.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+int64_t vpx_sse_c(const uint8_t *a, int a_stride, const uint8_t *b,
+                  int b_stride, int width, int height) {
+  int y, x;
+  int64_t sse = 0;
+
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x++) {
+      const int32_t diff = abs(a[x] - b[x]);
+      sse += diff * diff;
+    }
+
+    a += a_stride;
+    b += b_stride;
+  }
+  return sse;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+int64_t vpx_highbd_sse_c(const uint8_t *a8, int a_stride, const uint8_t *b8,
+                         int b_stride, int width, int height) {
+  int y, x;
+  int64_t sse = 0;
+  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x++) {
+      const int32_t diff = (int32_t)(a[x]) - (int32_t)(b[x]);
+      sse += diff * diff;
+    }
+
+    a += a_stride;
+    b += b_stride;
+  }
+  return sse;
+}
+#endif
diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c
index 30b55dcb40..1c476542fa 100644
--- a/vpx_dsp/variance.c
+++ b/vpx_dsp/variance.c
@@ -156,7 +156,7 @@ static void var_filter_block2d_bil_second_pass(
       const uint8_t *second_pred) {                                          \
     uint16_t fdata3[(H + 1) * W];                                            \
     uint8_t temp2[H * W];                                                    \
-    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                              \
+    DECLARE_ALIGNED(32, uint8_t, temp3[H * W]);                              \
                                                                              \
     var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, H + 1, \
                                       W, bilinear_filters[x_offset]);        \
@@ -180,7 +180,7 @@ static void var_filter_block2d_bil_second_pass(
   }
 
 /* Identical to the variance call except it does not calculate the
- * sse - sum^2 / w*h and returns sse in addtion to modifying the passed in
+ * sse - sum^2 / w*h and returns sse in addition to modifying the passed in
  * variable.
  */
 #define MSE(W, H)                                                        \
@@ -549,9 +549,9 @@ HIGHBD_MSE(16, 8)
 HIGHBD_MSE(8, 16)
 HIGHBD_MSE(8, 8)
 
-void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint16_t *pred,
-                              int width, int height, const uint16_t *ref,
-                              int ref_stride) {
+void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint16_t *pred,
+                                int width, int height, const uint16_t *ref,
+                                int ref_stride) {
   int i, j;
   for (i = 0; i < height; ++i) {
     for (j = 0; j < width; ++j) {
diff --git a/vpx_dsp/variance.h b/vpx_dsp/variance.h
index f8b44f03d1..ccdb2f90ba 100644
--- a/vpx_dsp/variance.h
+++ b/vpx_dsp/variance.h
@@ -59,8 +59,6 @@ typedef struct variance_vtable {
   vpx_sad_fn_t sdf;
   vpx_variance_fn_t vf;
   vpx_subpixvariance_fn_t svf;
-  vpx_sad_multi_fn_t sdx3f;
-  vpx_sad_multi_fn_t sdx8f;
   vpx_sad_multi_d_fn_t sdx4df;
 #if VPX_ARCH_X86 || VPX_ARCH_X86_64
   vp8_copy32xn_fn_t copymem;
@@ -71,12 +69,15 @@ typedef struct variance_vtable {
 #if CONFIG_VP9
 typedef struct vp9_variance_vtable {
   vpx_sad_fn_t sdf;
+  // Same as normal sad, but downsample the rows by a factor of 2.
+  vpx_sad_fn_t sdsf;
   vpx_sad_avg_fn_t sdaf;
   vpx_variance_fn_t vf;
   vpx_subpixvariance_fn_t svf;
   vpx_subp_avg_variance_fn_t svaf;
   vpx_sad_multi_d_fn_t sdx4df;
-  vpx_sad_multi_fn_t sdx8f;
+  // Same as sadx4, but downsample the rows by a factor of 2.
+  vpx_sad_multi_d_fn_t sdsx4df;
 } vp9_variance_fn_ptr_t;
 #endif  // CONFIG_VP9
 
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 01653102ef..93abf39ff6 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -31,10 +31,15 @@ DSP_SRCS-yes += bitwriter_buffer.c
 DSP_SRCS-yes += bitwriter_buffer.h
 DSP_SRCS-yes += psnr.c
 DSP_SRCS-yes += psnr.h
+DSP_SRCS-yes += sse.c
 DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.c
 DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.h
 DSP_SRCS-$(CONFIG_INTERNAL_STATS) += psnrhvs.c
 DSP_SRCS-$(CONFIG_INTERNAL_STATS) += fastssim.c
+DSP_SRCS-$(HAVE_NEON) += arm/sse_neon.c
+DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/sse_neon_dotprod.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/sse_sse4.c
+DSP_SRCS-$(HAVE_AVX2) += x86/sse_avx2.c
 endif
 
 ifeq ($(CONFIG_DECODERS),yes)
@@ -74,6 +79,7 @@ endif # CONFIG_POSTPROC
 DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM)
 DSP_SRCS-$(HAVE_NEON) += arm/intrapred_neon.c
 DSP_SRCS-$(HAVE_MSA) += mips/intrapred_msa.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/intrapred_lsx.c
 DSP_SRCS-$(HAVE_DSPR2)  += mips/intrapred4_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2)  += mips/intrapred8_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2)  += mips/intrapred16_dspr2.c
@@ -132,6 +138,10 @@ DSP_SRCS-yes += arm/vpx_convolve_copy_neon.c
 DSP_SRCS-yes += arm/vpx_convolve8_neon.c
 DSP_SRCS-yes += arm/vpx_convolve_avg_neon.c
 DSP_SRCS-yes += arm/vpx_convolve_neon.c
+DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/vpx_convolve8_neon_dotprod.c
+DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/vpx_convolve_neon_dotprod.c
+DSP_SRCS-$(HAVE_NEON_I8MM) += arm/vpx_convolve8_neon_i8mm.c
+DSP_SRCS-$(HAVE_NEON_I8MM) += arm/vpx_convolve_neon_i8mm.c
 endif  # HAVE_NEON
 endif  # HAVE_NEON_ASM
 
@@ -162,6 +172,17 @@ DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve8_vert_dspr2.c
 
 DSP_SRCS-$(HAVE_VSX)  += ppc/vpx_convolve_vsx.c
 
+# common (lsx)
+DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_avg_horiz_lsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_avg_vert_lsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_horiz_lsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_vert_lsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_lsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_avg_lsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve_avg_lsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve_copy_lsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve_lsx.h
+
 # loop filters
 DSP_SRCS-yes += loopfilter.c
 
@@ -188,6 +209,11 @@ DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_mb_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_mb_horiz_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_mb_vert_dspr2.c
 
+DSP_SRCS-$(HAVE_LSX)    += loongarch/loopfilter_lsx.h
+DSP_SRCS-$(HAVE_LSX)    += loongarch/loopfilter_16_lsx.c
+DSP_SRCS-$(HAVE_LSX)    += loongarch/loopfilter_8_lsx.c
+DSP_SRCS-$(HAVE_LSX)    += loongarch/loopfilter_4_lsx.c
+
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_NEON)   += arm/highbd_loopfilter_neon.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_loopfilter_sse2.c
@@ -197,6 +223,7 @@ endif # CONFIG_VP9
 DSP_SRCS-yes            += txfm_common.h
 DSP_SRCS-$(HAVE_SSE2)   += x86/txfm_common_sse2.h
 DSP_SRCS-$(HAVE_MSA)    += mips/txfm_macros_msa.h
+DSP_SRCS-$(HAVE_LSX)    += loongarch/txfm_macros_lsx.h
 # forward transform
 ifeq ($(CONFIG_VP9_ENCODER),yes)
 DSP_SRCS-yes            += fwd_txfm.c
@@ -208,18 +235,21 @@ DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_dct32x32_impl_sse2.h
 ifeq ($(VPX_ARCH_X86_64),yes)
 DSP_SRCS-$(HAVE_SSSE3)  += x86/fwd_txfm_ssse3_x86_64.asm
 endif
-DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_txfm_avx2.c
 DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_dct32x32_impl_avx2.h
-DSP_SRCS-$(HAVE_NEON)   += arm/fdct_neon.c
+DSP_SRCS-$(HAVE_NEON)   += arm/fdct4x4_neon.c
+DSP_SRCS-$(HAVE_NEON)   += arm/fdct8x8_neon.c
 DSP_SRCS-$(HAVE_NEON)   += arm/fdct16x16_neon.c
 DSP_SRCS-$(HAVE_NEON)   += arm/fdct32x32_neon.c
 DSP_SRCS-$(HAVE_NEON)   += arm/fdct_partial_neon.c
-DSP_SRCS-$(HAVE_NEON)   += arm/fwd_txfm_neon.c
 DSP_SRCS-$(HAVE_MSA)    += mips/fwd_txfm_msa.h
 DSP_SRCS-$(HAVE_MSA)    += mips/fwd_txfm_msa.c
+DSP_SRCS-$(HAVE_LSX)    += loongarch/fwd_txfm_lsx.h
+DSP_SRCS-$(HAVE_LSX)    += loongarch/fwd_txfm_lsx.c
 
 ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_txfm_avx2.c
 DSP_SRCS-$(HAVE_MSA)    += mips/fwd_dct32x32_msa.c
+DSP_SRCS-$(HAVE_LSX)    += loongarch/fwd_dct32x32_lsx.c
 endif  # !CONFIG_VP9_HIGHBITDEPTH
 
 DSP_SRCS-$(HAVE_VSX)    += ppc/fdct32x32_vsx.c
@@ -231,6 +261,7 @@ DSP_SRCS-yes            += inv_txfm.h
 DSP_SRCS-yes            += inv_txfm.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/inv_txfm_sse2.h
 DSP_SRCS-$(HAVE_SSE2)   += x86/inv_txfm_sse2.c
+DSP_SRCS-$(HAVE_AVX2)   += x86/inv_txfm_avx2.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/inv_wht_sse2.asm
 DSP_SRCS-$(HAVE_SSSE3)  += x86/inv_txfm_ssse3.h
 DSP_SRCS-$(HAVE_SSSE3)  += x86/inv_txfm_ssse3.c
@@ -252,6 +283,8 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/itrans8_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2) += mips/itrans16_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c
+
+DSP_SRCS-$(HAVE_LSX)   += loongarch/idct32x32_lsx.c
 else  # CONFIG_VP9_HIGHBITDEPTH
 DSP_SRCS-$(HAVE_NEON)  += arm/highbd_idct4x4_add_neon.c
 DSP_SRCS-$(HAVE_NEON)  += arm/highbd_idct8x8_add_neon.c
@@ -303,10 +336,14 @@ DSP_SRCS-$(HAVE_SSE2)   += x86/quantize_sse2.h
 DSP_SRCS-$(HAVE_SSSE3)  += x86/quantize_ssse3.c
 DSP_SRCS-$(HAVE_SSSE3)  += x86/quantize_ssse3.h
 DSP_SRCS-$(HAVE_AVX)    += x86/quantize_avx.c
+DSP_SRCS-$(HAVE_AVX2)   += x86/quantize_avx2.c
 DSP_SRCS-$(HAVE_NEON)   += arm/quantize_neon.c
 DSP_SRCS-$(HAVE_VSX)    += ppc/quantize_vsx.c
+DSP_SRCS-$(HAVE_LSX)    += loongarch/quantize_lsx.c
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_quantize_intrin_sse2.c
+DSP_SRCS-$(HAVE_AVX2)   += x86/highbd_quantize_intrin_avx2.c
+DSP_SRCS-$(HAVE_NEON)   += arm/highbd_quantize_neon.c
 endif
 
 # avg
@@ -315,7 +352,12 @@ DSP_SRCS-$(HAVE_SSE2)  += x86/avg_intrin_sse2.c
 DSP_SRCS-$(HAVE_AVX2)  += x86/avg_intrin_avx2.c
 DSP_SRCS-$(HAVE_NEON)  += arm/avg_neon.c
 DSP_SRCS-$(HAVE_NEON)  += arm/hadamard_neon.c
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_NEON)  += arm/highbd_hadamard_neon.c
+DSP_SRCS-$(HAVE_NEON)  += arm/highbd_avg_neon.c
+endif
 DSP_SRCS-$(HAVE_MSA)   += mips/avg_msa.c
+DSP_SRCS-$(HAVE_LSX)   += loongarch/avg_lsx.c
 ifeq ($(VPX_ARCH_X86_64),yes)
 DSP_SRCS-$(HAVE_SSSE3) += x86/avg_ssse3_x86_64.asm
 endif
@@ -336,20 +378,22 @@ DSP_SRCS-$(HAVE_SSE2)   += x86/sum_squares_sse2.c
 DSP_SRCS-$(HAVE_MSA)    += mips/sum_squares_msa.c
 
 DSP_SRCS-$(HAVE_NEON)   += arm/sad4d_neon.c
+DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/sad4d_neon_dotprod.c
 DSP_SRCS-$(HAVE_NEON)   += arm/sad_neon.c
+DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/sad_neon_dotprod.c
 DSP_SRCS-$(HAVE_NEON)   += arm/subtract_neon.c
 
 DSP_SRCS-$(HAVE_MSA)    += mips/sad_msa.c
 DSP_SRCS-$(HAVE_MSA)    += mips/subtract_msa.c
 
+DSP_SRCS-$(HAVE_LSX)    += loongarch/sad_lsx.c
+
 DSP_SRCS-$(HAVE_MMI)    += mips/sad_mmi.c
 DSP_SRCS-$(HAVE_MMI)    += mips/subtract_mmi.c
 
-DSP_SRCS-$(HAVE_SSE3)   += x86/sad_sse3.asm
-DSP_SRCS-$(HAVE_SSSE3)  += x86/sad_ssse3.asm
-DSP_SRCS-$(HAVE_SSE4_1) += x86/sad_sse4.asm
 DSP_SRCS-$(HAVE_AVX2)   += x86/sad4d_avx2.c
 DSP_SRCS-$(HAVE_AVX2)   += x86/sad_avx2.c
+DSP_SRCS-$(HAVE_AVX2)   += x86/subtract_avx2.c
 DSP_SRCS-$(HAVE_AVX512) += x86/sad4d_avx512.c
 
 DSP_SRCS-$(HAVE_SSE2)   += x86/sad4d_sse2.asm
@@ -359,9 +403,15 @@ DSP_SRCS-$(HAVE_SSE2)   += x86/subtract_sse2.asm
 DSP_SRCS-$(HAVE_VSX) += ppc/sad_vsx.c
 DSP_SRCS-$(HAVE_VSX) += ppc/subtract_vsx.c
 
+DSP_SRCS-$(HAVE_LSX)    += loongarch/subtract_lsx.c
+
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad4d_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad_neon.c
+DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad4d_avx2.c
+DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad_avx2.c
 endif  # CONFIG_VP9_HIGHBITDEPTH
 
 endif  # CONFIG_ENCODERS
@@ -373,13 +423,20 @@ DSP_SRCS-yes            += variance.h
 DSP_SRCS-$(HAVE_NEON)   += arm/avg_pred_neon.c
 DSP_SRCS-$(HAVE_NEON)   += arm/subpel_variance_neon.c
 DSP_SRCS-$(HAVE_NEON)   += arm/variance_neon.c
+DSP_SRCS-$(HAVE_NEON_DOTPROD)   += arm/variance_neon_dotprod.c
 
 DSP_SRCS-$(HAVE_MSA)    += mips/variance_msa.c
 DSP_SRCS-$(HAVE_MSA)    += mips/sub_pixel_variance_msa.c
 
+DSP_SRCS-$(HAVE_LSX)    += loongarch/variance_lsx.h
+DSP_SRCS-$(HAVE_LSX)    += loongarch/variance_lsx.c
+DSP_SRCS-$(HAVE_LSX)    += loongarch/sub_pixel_variance_lsx.c
+DSP_SRCS-$(HAVE_LSX)    += loongarch/avg_pred_lsx.c
+
 DSP_SRCS-$(HAVE_MMI)    += mips/variance_mmi.c
 
 DSP_SRCS-$(HAVE_SSE2)   += x86/avg_pred_sse2.c
+DSP_SRCS-$(HAVE_AVX2)   += x86/avg_pred_avx2.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/variance_sse2.c  # Contains SSE2 and SSSE3
 DSP_SRCS-$(HAVE_AVX2)   += x86/variance_avx2.c
 DSP_SRCS-$(HAVE_VSX)    += ppc/variance_vsx.c
@@ -394,6 +451,10 @@ ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_variance_sse2.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_variance_impl_sse2.asm
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_subpel_variance_impl_sse2.asm
+DSP_SRCS-$(HAVE_NEON)   += arm/highbd_avg_pred_neon.c
+DSP_SRCS-$(HAVE_NEON)   += arm/highbd_sse_neon.c
+DSP_SRCS-$(HAVE_NEON)   += arm/highbd_variance_neon.c
+DSP_SRCS-$(HAVE_NEON)   += arm/highbd_subpel_variance_neon.c
 endif  # CONFIG_VP9_HIGHBITDEPTH
 endif  # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
 
@@ -413,6 +474,9 @@ DSP_SRCS-$(HAVE_VSX)  += ppc/bitdepth_conversion_vsx.h
 DSP_SRCS-$(HAVE_SSE2) += x86/mem_sse2.h
 DSP_SRCS-$(HAVE_SSE2) += x86/transpose_sse2.h
 
+# LSX utilities
+DSP_SRCS-$(HAVE_LSX)  += loongarch/bitdepth_conversion_lsx.h
+
 DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)
 
 DSP_SRCS-yes += vpx_dsp_rtcd.c
diff --git a/vpx_dsp/vpx_dsp_common.h b/vpx_dsp/vpx_dsp_common.h
index 2de4495465..4b946d7560 100644
--- a/vpx_dsp/vpx_dsp_common.h
+++ b/vpx_dsp/vpx_dsp_common.h
@@ -45,9 +45,21 @@ typedef int16_t tran_low_t;
 
 typedef int16_t tran_coef_t;
 
+// Visual Studio 2022 (cl.exe) targeting AArch64 with optimizations enabled
+// produces invalid code for clip_pixel() when the return type is uint8_t.
+// See:
+// https://developercommunity.visualstudio.com/t/Misoptimization-for-ARM64-in-VS-2022-17/10363361
+// TODO(jzern): check the compiler version after a fix for the issue is
+// released.
+#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__)
+static INLINE int clip_pixel(int val) {
+  return (val > 255) ? 255 : (val < 0) ? 0 : val;
+}
+#else
 static INLINE uint8_t clip_pixel(int val) {
   return (val > 255) ? 255 : (val < 0) ? 0 : val;
 }
+#endif
 
 static INLINE int clamp(int value, int low, int high) {
   return value < low ? low : (value > high ? high : value);
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index fd7eefdad0..18087e25d9 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -17,6 +17,10 @@ ()
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_dsp/vpx_filter.h"
+#if CONFIG_VP9_ENCODER
+ struct macroblock_plane;
+ struct ScanOrder;
+#endif
 
 EOF
 }
@@ -38,7 +42,7 @@ ()
 #
 
 add_proto qw/void vpx_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d207_predictor_4x4 sse2/;
+specialize qw/vpx_d207_predictor_4x4 neon sse2/;
 
 add_proto qw/void vpx_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d45_predictor_4x4 neon sse2/;
@@ -46,7 +50,7 @@ ()
 add_proto qw/void vpx_d45e_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 
 add_proto qw/void vpx_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d63_predictor_4x4 ssse3/;
+specialize qw/vpx_d63_predictor_4x4 neon ssse3/;
 
 add_proto qw/void vpx_d63e_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 
@@ -57,12 +61,13 @@ ()
 add_proto qw/void vpx_he_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 
 add_proto qw/void vpx_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d117_predictor_4x4 neon/;
 
 add_proto qw/void vpx_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d135_predictor_4x4 neon/;
 
 add_proto qw/void vpx_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d153_predictor_4x4 ssse3/;
+specialize qw/vpx_d153_predictor_4x4 neon ssse3/;
 
 add_proto qw/void vpx_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_v_predictor_4x4 neon msa sse2/;
@@ -86,7 +91,7 @@ ()
 specialize qw/vpx_dc_128_predictor_4x4 msa neon sse2/;
 
 add_proto qw/void vpx_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d207_predictor_8x8 ssse3/;
+specialize qw/vpx_d207_predictor_8x8 neon ssse3/;
 
 add_proto qw/void vpx_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 # TODO(crbug.com/webm/1522): Re-enable vsx implementation.
@@ -94,19 +99,20 @@ ()
 
 add_proto qw/void vpx_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 # TODO(crbug.com/webm/1522): Re-enable vsx implementation.
-specialize qw/vpx_d63_predictor_8x8 ssse3/;
+specialize qw/vpx_d63_predictor_8x8 neon ssse3/;
 
 add_proto qw/void vpx_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 # TODO(crbug.com/webm/1522): Re-enable vsx implementation.
 specialize qw/vpx_h_predictor_8x8 neon dspr2 msa sse2/;
 
 add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d117_predictor_8x8 neon/;
 
 add_proto qw/void vpx_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d135_predictor_8x8 neon/;
 
 add_proto qw/void vpx_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d153_predictor_8x8 ssse3/;
+specialize qw/vpx_d153_predictor_8x8 neon ssse3/;
 
 add_proto qw/void vpx_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_v_predictor_8x8 neon msa sse2/;
@@ -117,7 +123,7 @@ ()
 
 add_proto qw/void vpx_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 # TODO(crbug.com/webm/1522): Re-enable vsx implementation.
-specialize qw/vpx_dc_predictor_8x8 dspr2 neon msa sse2/;
+specialize qw/vpx_dc_predictor_8x8 dspr2 neon msa sse2 lsx/;
 
 add_proto qw/void vpx_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_top_predictor_8x8 neon msa sse2/;
@@ -129,24 +135,25 @@ ()
 specialize qw/vpx_dc_128_predictor_8x8 neon msa sse2/;
 
 add_proto qw/void vpx_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d207_predictor_16x16 ssse3/;
+specialize qw/vpx_d207_predictor_16x16 neon ssse3/;
 
 add_proto qw/void vpx_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d45_predictor_16x16 neon ssse3 vsx/;
 
 add_proto qw/void vpx_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d63_predictor_16x16 ssse3 vsx/;
+specialize qw/vpx_d63_predictor_16x16 neon ssse3 vsx/;
 
 add_proto qw/void vpx_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_h_predictor_16x16 neon dspr2 msa sse2 vsx/;
 
 add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d117_predictor_16x16 neon/;
 
 add_proto qw/void vpx_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d135_predictor_16x16 neon/;
 
 add_proto qw/void vpx_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d153_predictor_16x16 ssse3/;
+specialize qw/vpx_d153_predictor_16x16 neon ssse3/;
 
 add_proto qw/void vpx_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_v_predictor_16x16 neon msa sse2 vsx/;
@@ -155,7 +162,7 @@ ()
 specialize qw/vpx_tm_predictor_16x16 neon msa sse2 vsx/;
 
 add_proto qw/void vpx_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_predictor_16x16 dspr2 neon msa sse2 vsx/;
+specialize qw/vpx_dc_predictor_16x16 dspr2 neon msa sse2 vsx lsx/;
 
 add_proto qw/void vpx_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_top_predictor_16x16 neon msa sse2 vsx/;
@@ -167,24 +174,25 @@ ()
 specialize qw/vpx_dc_128_predictor_16x16 neon msa sse2 vsx/;
 
 add_proto qw/void vpx_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d207_predictor_32x32 ssse3/;
+specialize qw/vpx_d207_predictor_32x32 neon ssse3/;
 
 add_proto qw/void vpx_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d45_predictor_32x32 neon ssse3 vsx/;
 
 add_proto qw/void vpx_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d63_predictor_32x32 ssse3 vsx/;
+specialize qw/vpx_d63_predictor_32x32 neon ssse3 vsx/;
 
 add_proto qw/void vpx_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_h_predictor_32x32 neon msa sse2 vsx/;
 
 add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d117_predictor_32x32 neon/;
 
 add_proto qw/void vpx_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d135_predictor_32x32 neon/;
 
 add_proto qw/void vpx_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d153_predictor_32x32 ssse3/;
+specialize qw/vpx_d153_predictor_32x32 neon ssse3/;
 
 add_proto qw/void vpx_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_v_predictor_32x32 neon msa sse2 vsx/;
@@ -207,25 +215,25 @@ ()
 # High bitdepth functions
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vpx_highbd_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_d207_predictor_4x4 sse2/;
+  specialize qw/vpx_highbd_d207_predictor_4x4 neon sse2/;
 
   add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d45_predictor_4x4 neon ssse3/;
 
   add_proto qw/void vpx_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_d63_predictor_4x4 sse2/;
+  specialize qw/vpx_highbd_d63_predictor_4x4 neon sse2/;
 
   add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_h_predictor_4x4 neon sse2/;
 
   add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_d117_predictor_4x4 sse2/;
+  specialize qw/vpx_highbd_d117_predictor_4x4 neon sse2/;
 
   add_proto qw/void vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d135_predictor_4x4 neon sse2/;
 
   add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_d153_predictor_4x4 sse2/;
+  specialize qw/vpx_highbd_d153_predictor_4x4 neon sse2/;
 
   add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_v_predictor_4x4 neon sse2/;
@@ -246,25 +254,25 @@ ()
   specialize qw/vpx_highbd_dc_128_predictor_4x4 neon sse2/;
 
   add_proto qw/void vpx_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_d207_predictor_8x8 ssse3/;
+  specialize qw/vpx_highbd_d207_predictor_8x8 neon ssse3/;
 
   add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d45_predictor_8x8 neon ssse3/;
 
   add_proto qw/void vpx_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_d63_predictor_8x8 ssse3/;
+  specialize qw/vpx_highbd_d63_predictor_8x8 neon ssse3/;
 
   add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_h_predictor_8x8 neon sse2/;
 
   add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_d117_predictor_8x8 ssse3/;
+  specialize qw/vpx_highbd_d117_predictor_8x8 neon ssse3/;
 
   add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d135_predictor_8x8 neon ssse3/;
 
   add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_d153_predictor_8x8 ssse3/;
+  specialize qw/vpx_highbd_d153_predictor_8x8 neon ssse3/;
 
   add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_v_predictor_8x8 neon sse2/;
@@ -285,25 +293,25 @@ ()
   specialize qw/vpx_highbd_dc_128_predictor_8x8 neon sse2/;
 
   add_proto qw/void vpx_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_d207_predictor_16x16 ssse3/;
+  specialize qw/vpx_highbd_d207_predictor_16x16 neon ssse3/;
 
   add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d45_predictor_16x16 neon ssse3/;
 
   add_proto qw/void vpx_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_d63_predictor_16x16 ssse3/;
+  specialize qw/vpx_highbd_d63_predictor_16x16 neon ssse3/;
 
   add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_h_predictor_16x16 neon sse2/;
 
   add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_d117_predictor_16x16 ssse3/;
+  specialize qw/vpx_highbd_d117_predictor_16x16 neon ssse3/;
 
   add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d135_predictor_16x16 neon ssse3/;
 
   add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_d153_predictor_16x16 ssse3/;
+  specialize qw/vpx_highbd_d153_predictor_16x16 neon ssse3/;
 
   add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_v_predictor_16x16 neon sse2/;
@@ -324,25 +332,25 @@ ()
   specialize qw/vpx_highbd_dc_128_predictor_16x16 neon sse2/;
 
   add_proto qw/void vpx_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_d207_predictor_32x32 ssse3/;
+  specialize qw/vpx_highbd_d207_predictor_32x32 neon ssse3/;
 
   add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d45_predictor_32x32 neon ssse3/;
 
   add_proto qw/void vpx_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_d63_predictor_32x32 ssse3/;
+  specialize qw/vpx_highbd_d63_predictor_32x32 neon ssse3/;
 
   add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_h_predictor_32x32 neon sse2/;
 
   add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_d117_predictor_32x32 ssse3/;
+  specialize qw/vpx_highbd_d117_predictor_32x32 neon ssse3/;
 
   add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d135_predictor_32x32 neon ssse3/;
 
   add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_d153_predictor_32x32 ssse3/;
+  specialize qw/vpx_highbd_d153_predictor_32x32 neon ssse3/;
 
   add_proto qw/void vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_v_predictor_32x32 neon sse2/;
@@ -368,28 +376,28 @@ ()
 # Sub Pixel Filters
 #
 add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve_copy neon dspr2 msa sse2 vsx/;
+specialize qw/vpx_convolve_copy neon dspr2 msa sse2 vsx lsx/;
 
 add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve_avg neon dspr2 msa sse2 vsx mmi/;
+specialize qw/vpx_convolve_avg neon dspr2 msa sse2 vsx mmi lsx/;
 
 add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon dspr2 msa vsx mmi/;
+specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon neon_dotprod neon_i8mm dspr2 msa vsx mmi lsx/;
 
 add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_horiz sse2 ssse3 avx2 neon dspr2 msa vsx mmi/;
+specialize qw/vpx_convolve8_horiz sse2 ssse3 avx2 neon neon_dotprod neon_i8mm dspr2 msa vsx mmi lsx/;
 
 add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon dspr2 msa vsx mmi/;
+specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon neon_dotprod neon_i8mm dspr2 msa vsx mmi lsx/;
 
 add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg sse2 ssse3 avx2 neon dspr2 msa vsx mmi/;
+specialize qw/vpx_convolve8_avg sse2 ssse3 avx2 neon neon_dotprod neon_i8mm dspr2 msa vsx mmi lsx/;
 
 add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 avx2 neon dspr2 msa vsx mmi/;
+specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 avx2 neon neon_dotprod neon_i8mm dspr2 msa vsx mmi lsx/;
 
 add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg_vert sse2 ssse3 avx2 neon dspr2 msa vsx mmi/;
+specialize qw/vpx_convolve8_avg_vert sse2 ssse3 avx2 neon neon_dotprod neon_i8mm dspr2 msa vsx mmi lsx/;
 
 add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
 specialize qw/vpx_scaled_2d ssse3 neon msa/;
@@ -442,37 +450,37 @@ ()
 specialize qw/vpx_lpf_vertical_16 sse2 neon dspr2 msa/;
 
 add_proto qw/void vpx_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vpx_lpf_vertical_16_dual sse2 neon dspr2 msa/;
+specialize qw/vpx_lpf_vertical_16_dual sse2 neon dspr2 msa lsx/;
 
 add_proto qw/void vpx_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vpx_lpf_vertical_8 sse2 neon dspr2 msa/;
+specialize qw/vpx_lpf_vertical_8 sse2 neon dspr2 msa lsx/;
 
 add_proto qw/void vpx_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vpx_lpf_vertical_8_dual sse2 neon dspr2 msa/;
+specialize qw/vpx_lpf_vertical_8_dual sse2 neon dspr2 msa lsx/;
 
 add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vpx_lpf_vertical_4 sse2 neon dspr2 msa/;
+specialize qw/vpx_lpf_vertical_4 sse2 neon dspr2 msa lsx/;
 
 add_proto qw/void vpx_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vpx_lpf_vertical_4_dual sse2 neon dspr2 msa/;
+specialize qw/vpx_lpf_vertical_4_dual sse2 neon dspr2 msa lsx/;
 
 add_proto qw/void vpx_lpf_horizontal_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
 specialize qw/vpx_lpf_horizontal_16 sse2 avx2 neon dspr2 msa/;
 
 add_proto qw/void vpx_lpf_horizontal_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vpx_lpf_horizontal_16_dual sse2 avx2 neon dspr2 msa/;
+specialize qw/vpx_lpf_horizontal_16_dual sse2 avx2 neon dspr2 msa lsx/;
 
 add_proto qw/void vpx_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vpx_lpf_horizontal_8 sse2 neon dspr2 msa/;
+specialize qw/vpx_lpf_horizontal_8 sse2 neon dspr2 msa lsx/;
 
 add_proto qw/void vpx_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vpx_lpf_horizontal_8_dual sse2 neon dspr2 msa/;
+specialize qw/vpx_lpf_horizontal_8_dual sse2 neon dspr2 msa lsx/;
 
 add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vpx_lpf_horizontal_4 sse2 neon dspr2 msa/;
+specialize qw/vpx_lpf_horizontal_4 sse2 neon dspr2 msa lsx/;
 
 add_proto qw/void vpx_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vpx_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
+specialize qw/vpx_lpf_horizontal_4_dual sse2 neon dspr2 msa lsx/;
 } #CONFIG_VP9
 
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
@@ -527,6 +535,8 @@ ()
 
   add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vpx_fdct4x4_1 sse2 neon/;
+  specialize qw/vpx_highbd_fdct4x4_1 neon/;
+  $vpx_highbd_fdct4x4_1_neon=vpx_fdct4x4_1_neon;
 
   add_proto qw/void vpx_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vpx_fdct8x8 neon sse2/;
@@ -550,51 +560,53 @@ ()
   specialize qw/vpx_fdct32x32_1 sse2 neon/;
 
   add_proto qw/void vpx_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_highbd_fdct4x4 sse2/;
+  specialize qw/vpx_highbd_fdct4x4 sse2 neon/;
 
   add_proto qw/void vpx_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_highbd_fdct8x8 sse2/;
+  specialize qw/vpx_highbd_fdct8x8 sse2 neon/;
 
   add_proto qw/void vpx_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vpx_highbd_fdct8x8_1 neon/;
   $vpx_highbd_fdct8x8_1_neon=vpx_fdct8x8_1_neon;
 
   add_proto qw/void vpx_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_highbd_fdct16x16 sse2/;
+  specialize qw/vpx_highbd_fdct16x16 sse2 neon/;
 
   add_proto qw/void vpx_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_highbd_fdct16x16_1 neon/;
 
   add_proto qw/void vpx_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_highbd_fdct32x32 sse2/;
+  specialize qw/vpx_highbd_fdct32x32 sse2 neon/;
 
   add_proto qw/void vpx_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_highbd_fdct32x32_rd sse2/;
+  specialize qw/vpx_highbd_fdct32x32_rd sse2 neon/;
 
   add_proto qw/void vpx_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_highbd_fdct32x32_1 neon/;
 } else {
   add_proto qw/void vpx_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct4x4 neon sse2 msa/;
+  specialize qw/vpx_fdct4x4 neon sse2 msa lsx/;
 
   add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vpx_fdct4x4_1 sse2 neon/;
 
   add_proto qw/void vpx_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct8x8 sse2 neon msa/, "$ssse3_x86_64";
+  specialize qw/vpx_fdct8x8 sse2 neon msa lsx/, "$ssse3_x86_64";
 
   add_proto qw/void vpx_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vpx_fdct8x8_1 sse2 neon msa/;
 
   add_proto qw/void vpx_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct16x16 neon sse2 msa/;
+  specialize qw/vpx_fdct16x16 neon sse2 avx2 msa lsx/;
 
   add_proto qw/void vpx_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vpx_fdct16x16_1 sse2 neon msa/;
 
   add_proto qw/void vpx_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct32x32 neon sse2 avx2 msa/;
+  specialize qw/vpx_fdct32x32 neon sse2 avx2 msa lsx/;
 
   add_proto qw/void vpx_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct32x32_rd sse2 avx2 neon msa vsx/;
+  specialize qw/vpx_fdct32x32_rd sse2 avx2 neon msa vsx lsx/;
 
   add_proto qw/void vpx_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vpx_fdct32x32_1 sse2 neon msa/;
@@ -629,12 +641,12 @@ ()
   specialize qw/vpx_idct8x8_64_add neon sse2 vsx/;
   specialize qw/vpx_idct8x8_12_add neon sse2 ssse3/;
   specialize qw/vpx_idct8x8_1_add neon sse2/;
-  specialize qw/vpx_idct16x16_256_add neon sse2 vsx/;
+  specialize qw/vpx_idct16x16_256_add neon sse2 avx2 vsx/;
   specialize qw/vpx_idct16x16_38_add neon sse2/;
   specialize qw/vpx_idct16x16_10_add neon sse2/;
   specialize qw/vpx_idct16x16_1_add neon sse2/;
-  specialize qw/vpx_idct32x32_1024_add neon sse2 vsx/;
-  specialize qw/vpx_idct32x32_135_add neon sse2 ssse3/;
+  specialize qw/vpx_idct32x32_1024_add neon sse2 avx2 vsx/;
+  specialize qw/vpx_idct32x32_135_add neon sse2 ssse3 avx2/;
   specialize qw/vpx_idct32x32_34_add neon sse2 ssse3/;
   specialize qw/vpx_idct32x32_1_add neon sse2/;
   specialize qw/vpx_iwht4x4_16_add sse2 vsx/;
@@ -652,12 +664,13 @@ ()
     $vpx_idct16x16_38_add_msa=vpx_idct16x16_256_add_msa;
     specialize qw/vpx_idct16x16_10_add dspr2 msa/;
     specialize qw/vpx_idct16x16_1_add dspr2 msa/;
-    specialize qw/vpx_idct32x32_1024_add dspr2 msa/;
+    specialize qw/vpx_idct32x32_1024_add dspr2 msa lsx/;
     specialize qw/vpx_idct32x32_135_add dspr2 msa/;
     $vpx_idct32x32_135_add_dspr2=vpx_idct32x32_1024_add_dspr2;
     $vpx_idct32x32_135_add_msa=vpx_idct32x32_1024_add_msa;
-    specialize qw/vpx_idct32x32_34_add dspr2 msa/;
-    specialize qw/vpx_idct32x32_1_add dspr2 msa/;
+    $vpx_idct32x32_135_add_lsx=vpx_idct32x32_1024_add_lsx;
+    specialize qw/vpx_idct32x32_34_add dspr2 msa lsx/;
+    specialize qw/vpx_idct32x32_1_add dspr2 msa lsx/;
     specialize qw/vpx_iwht4x4_16_add msa/;
     specialize qw/vpx_iwht4x4_1_add msa/;
   } # !CONFIG_VP9_HIGHBITDEPTH
@@ -709,18 +722,18 @@ ()
 # Quantization
 #
 if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
-  add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vpx_quantize_b neon sse2 ssse3 avx vsx/;
+  add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order";
+  specialize qw/vpx_quantize_b neon sse2 ssse3 avx avx2 vsx lsx/;
 
-  add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vpx_quantize_b_32x32 neon ssse3 avx vsx/;
+  add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order";
+  specialize qw/vpx_quantize_b_32x32 neon ssse3 avx avx2 vsx lsx/;
 
   if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-    add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-    specialize qw/vpx_highbd_quantize_b sse2/;
+    add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order";
+    specialize qw/vpx_highbd_quantize_b neon sse2 avx2/;
 
-    add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-    specialize qw/vpx_highbd_quantize_b_32x32 sse2/;
+    add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order";
+    specialize qw/vpx_highbd_quantize_b_32x32 neon sse2 avx2/;
   }  # CONFIG_VP9_HIGHBITDEPTH
 }  # CONFIG_VP9_ENCODER
 
@@ -729,40 +742,43 @@ ()
 # Block subtraction
 #
 add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
-specialize qw/vpx_subtract_block neon msa mmi sse2 vsx/;
+specialize qw/vpx_subtract_block neon msa mmi sse2 avx2 vsx lsx/;
+
+add_proto qw/int64_t/, "vpx_sse", "const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height";
+specialize qw/vpx_sse sse4_1 avx2 neon neon_dotprod/;
 
 #
 # Single block SAD
 #
 add_proto qw/unsigned int vpx_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad64x64 neon avx2 msa sse2 vsx mmi/;
+specialize qw/vpx_sad64x64 neon neon_dotprod avx2 msa sse2 vsx mmi lsx/;
 
 add_proto qw/unsigned int vpx_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad64x32 neon avx2 msa sse2 vsx mmi/;
+specialize qw/vpx_sad64x32 neon neon_dotprod avx2 msa sse2 vsx mmi/;
 
 add_proto qw/unsigned int vpx_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad32x64 neon avx2 msa sse2 vsx mmi/;
+specialize qw/vpx_sad32x64 neon neon_dotprod avx2 msa sse2 vsx mmi/;
 
 add_proto qw/unsigned int vpx_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad32x32 neon avx2 msa sse2 vsx mmi/;
+specialize qw/vpx_sad32x32 neon neon_dotprod avx2 msa sse2 vsx mmi lsx/;
 
 add_proto qw/unsigned int vpx_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad32x16 neon avx2 msa sse2 vsx mmi/;
+specialize qw/vpx_sad32x16 neon neon_dotprod avx2 msa sse2 vsx mmi/;
 
 add_proto qw/unsigned int vpx_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad16x32 neon msa sse2 vsx mmi/;
+specialize qw/vpx_sad16x32 neon neon_dotprod msa sse2 vsx mmi/;
 
 add_proto qw/unsigned int vpx_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad16x16 neon msa sse2 vsx mmi/;
+specialize qw/vpx_sad16x16 neon neon_dotprod msa sse2 vsx mmi lsx/;
 
 add_proto qw/unsigned int vpx_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad16x8 neon msa sse2 vsx mmi/;
+specialize qw/vpx_sad16x8 neon neon_dotprod msa sse2 vsx mmi/;
 
 add_proto qw/unsigned int vpx_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
 specialize qw/vpx_sad8x16 neon msa sse2 vsx mmi/;
 
 add_proto qw/unsigned int vpx_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad8x8 neon msa sse2 vsx mmi/;
+specialize qw/vpx_sad8x8 neon msa sse2 vsx mmi lsx/;
 
 add_proto qw/unsigned int vpx_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
 specialize qw/vpx_sad8x4 neon msa sse2 vsx mmi/;
@@ -773,6 +789,45 @@ ()
 add_proto qw/unsigned int vpx_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
 specialize qw/vpx_sad4x4 neon msa sse2 mmi/;
 
+add_proto qw/unsigned int vpx_sad_skip_64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_64x64 neon neon_dotprod avx2 sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_64x32 neon neon_dotprod avx2 sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_32x64 neon neon_dotprod avx2 sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_32x32 neon neon_dotprod avx2 sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_32x16 neon neon_dotprod avx2 sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_16x32 neon neon_dotprod sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_16x16 neon neon_dotprod sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_16x8 neon neon_dotprod sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_8x16 neon sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_8x8 neon sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_8x4 neon/;
+
+add_proto qw/unsigned int vpx_sad_skip_4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_4x8 neon sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_4x4 neon/;
+
 #
 # Avg
 #
@@ -788,75 +843,74 @@ ()
 
   if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
     add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
-    specialize qw/vpx_hadamard_8x8 sse2 neon vsx/, "$ssse3_x86_64";
+    specialize qw/vpx_hadamard_8x8 sse2 neon vsx lsx/, "$ssse3_x86_64";
 
     add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
-    specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx/;
+    specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx lsx/;
 
     add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
-    specialize qw/vpx_hadamard_32x32 sse2 avx2/;
+    specialize qw/vpx_hadamard_32x32 sse2 avx2 neon/;
 
     add_proto qw/void vpx_highbd_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
-    specialize qw/vpx_highbd_hadamard_8x8 avx2/;
+    specialize qw/vpx_highbd_hadamard_8x8 avx2 neon/;
 
     add_proto qw/void vpx_highbd_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
-    specialize qw/vpx_highbd_hadamard_16x16 avx2/;
+    specialize qw/vpx_highbd_hadamard_16x16 avx2 neon/;
 
     add_proto qw/void vpx_highbd_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
-    specialize qw/vpx_highbd_hadamard_32x32 avx2/;
+    specialize qw/vpx_highbd_hadamard_32x32 avx2 neon/;
 
     add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length";
     specialize qw/vpx_satd avx2 sse2 neon/;
 
     add_proto qw/int vpx_highbd_satd/, "const tran_low_t *coeff, int length";
-    specialize qw/vpx_highbd_satd avx2/;
+    specialize qw/vpx_highbd_satd avx2 neon/;
   } else {
     add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
-    specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx/, "$ssse3_x86_64";
+    specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx lsx/, "$ssse3_x86_64";
 
     add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
-    specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx/;
+    specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx lsx/;
 
     add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
-    specialize qw/vpx_hadamard_32x32 sse2 avx2/;
+    specialize qw/vpx_hadamard_32x32 sse2 avx2 neon/;
 
     add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";
     specialize qw/vpx_satd avx2 sse2 neon msa/;
   }
 
-  add_proto qw/void vpx_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height";
-  specialize qw/vpx_int_pro_row sse2 neon msa/;
-
+  add_proto qw/void vpx_int_pro_row/, "int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height";
+  specialize qw/vpx_int_pro_row neon sse2 msa/;
   add_proto qw/int16_t vpx_int_pro_col/, "const uint8_t *ref, const int width";
-  specialize qw/vpx_int_pro_col sse2 neon msa/;
+  specialize qw/vpx_int_pro_col neon sse2 msa/;
 
   add_proto qw/int vpx_vector_var/, "const int16_t *ref, const int16_t *src, const int bwl";
   specialize qw/vpx_vector_var neon sse2 msa/;
 }  # CONFIG_VP9_ENCODER
 
 add_proto qw/unsigned int vpx_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad64x64_avg neon avx2 msa sse2 vsx mmi/;
+specialize qw/vpx_sad64x64_avg neon neon_dotprod avx2 msa sse2 vsx mmi lsx/;
 
 add_proto qw/unsigned int vpx_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad64x32_avg neon avx2 msa sse2 vsx mmi/;
+specialize qw/vpx_sad64x32_avg neon neon_dotprod avx2 msa sse2 vsx mmi/;
 
 add_proto qw/unsigned int vpx_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad32x64_avg neon avx2 msa sse2 vsx mmi/;
+specialize qw/vpx_sad32x64_avg neon neon_dotprod avx2 msa sse2 vsx mmi/;
 
 add_proto qw/unsigned int vpx_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad32x32_avg neon avx2 msa sse2 vsx mmi/;
+specialize qw/vpx_sad32x32_avg neon neon_dotprod avx2 msa sse2 vsx mmi lsx/;
 
 add_proto qw/unsigned int vpx_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad32x16_avg neon avx2 msa sse2 vsx mmi/;
+specialize qw/vpx_sad32x16_avg neon neon_dotprod avx2 msa sse2 vsx mmi/;
 
 add_proto qw/unsigned int vpx_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad16x32_avg neon msa sse2 vsx mmi/;
+specialize qw/vpx_sad16x32_avg neon neon_dotprod msa sse2 vsx mmi/;
 
 add_proto qw/unsigned int vpx_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad16x16_avg neon msa sse2 vsx mmi/;
+specialize qw/vpx_sad16x16_avg neon neon_dotprod msa sse2 vsx mmi/;
 
 add_proto qw/unsigned int vpx_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad16x8_avg neon msa sse2 vsx mmi/;
+specialize qw/vpx_sad16x8_avg neon neon_dotprod msa sse2 vsx mmi/;
 
 add_proto qw/unsigned int vpx_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
 specialize qw/vpx_sad8x16_avg neon msa sse2 mmi/;
@@ -874,84 +928,85 @@ ()
 specialize qw/vpx_sad4x4_avg neon msa sse2 mmi/;
 
 #
-# Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
+# Multi-block SAD, comparing a reference to N independent blocks
 #
-# Blocks of 3
-add_proto qw/void vpx_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x16x3 sse3 ssse3 msa mmi/;
+add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad64x64x4d avx512 avx2 neon neon_dotprod msa sse2 vsx mmi lsx/;
 
-add_proto qw/void vpx_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x8x3 sse3 ssse3 msa mmi/;
+add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad64x32x4d neon neon_dotprod msa sse2 vsx mmi lsx/;
 
-add_proto qw/void vpx_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x16x3 sse3 msa mmi/;
+add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad32x64x4d neon neon_dotprod msa sse2 vsx mmi lsx/;
 
-add_proto qw/void vpx_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x8x3 sse3 msa mmi/;
+add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad32x32x4d avx2 neon neon_dotprod msa sse2 vsx mmi lsx/;
 
-add_proto qw/void vpx_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x4x3 sse3 msa mmi/;
+add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad32x16x4d neon neon_dotprod msa sse2 vsx mmi/;
 
-# Blocks of 8
-add_proto qw/void vpx_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad32x32x8 avx2/;
+add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad16x32x4d neon neon_dotprod msa sse2 vsx mmi/;
 
-add_proto qw/void vpx_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x16x8 sse4_1 msa mmi/;
+add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad16x16x4d neon neon_dotprod msa sse2 vsx mmi lsx/;
 
-add_proto qw/void vpx_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x8x8 sse4_1 msa mmi/;
+add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad16x8x4d neon neon_dotprod msa sse2 vsx mmi/;
 
-add_proto qw/void vpx_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x16x8 sse4_1 msa mmi/;
+add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad8x16x4d neon msa sse2 mmi/;
 
-add_proto qw/void vpx_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x8x8 sse4_1 msa mmi/;
+add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad8x8x4d neon msa sse2 mmi lsx/;
 
-add_proto qw/void vpx_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x4x8 sse4_1 msa mmi/;
+add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad8x4x4d neon msa sse2 mmi/;
 
-#
-# Multi-block SAD, comparing a reference to N independent blocks
-#
-add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad64x64x4d avx512 avx2 neon msa sse2 vsx mmi/;
+add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad4x8x4d neon msa sse2 mmi/;
 
-add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad64x32x4d neon msa sse2 vsx mmi/;
+add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad4x4x4d neon msa sse2 mmi/;
 
-add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad32x64x4d neon msa sse2 vsx mmi/;
+add_proto qw/void vpx_sad_skip_64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_64x64x4d neon neon_dotprod avx2 sse2/;
 
-add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad32x32x4d avx2 neon msa sse2 vsx mmi/;
+add_proto qw/void vpx_sad_skip_64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_64x32x4d neon neon_dotprod avx2 sse2/;
 
-add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad32x16x4d neon msa sse2 vsx mmi/;
+add_proto qw/void vpx_sad_skip_32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_32x64x4d neon neon_dotprod avx2 sse2/;
 
-add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x32x4d neon msa sse2 vsx mmi/;
+add_proto qw/void vpx_sad_skip_32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_32x32x4d neon neon_dotprod avx2 sse2/;
 
-add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x16x4d neon msa sse2 vsx mmi/;
+add_proto qw/void vpx_sad_skip_32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_32x16x4d neon neon_dotprod avx2 sse2/;
 
-add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x8x4d neon msa sse2 vsx mmi/;
+add_proto qw/void vpx_sad_skip_16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_16x32x4d neon neon_dotprod sse2/;
 
-add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x16x4d neon msa sse2 mmi/;
+add_proto qw/void vpx_sad_skip_16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_16x16x4d neon neon_dotprod sse2/;
 
-add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x8x4d neon msa sse2 mmi/;
+add_proto qw/void vpx_sad_skip_16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_16x8x4d neon neon_dotprod sse2/;
 
-add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x4x4d neon msa sse2 mmi/;
+add_proto qw/void vpx_sad_skip_8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_8x16x4d neon sse2/;
 
-add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x8x4d neon msa sse2 mmi/;
+add_proto qw/void vpx_sad_skip_8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_8x8x4d neon sse2/;
 
-add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x4x4d neon msa sse2 mmi/;
+add_proto qw/void vpx_sad_skip_8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_8x4x4d neon/;
+
+add_proto qw/void vpx_sad_skip_4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_4x8x4d neon sse2/;
+
+add_proto qw/void vpx_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_4x4x4d neon/;
 
 add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size";
 specialize qw/vpx_sum_squares_2d_i16 neon sse2 msa/;
@@ -972,136 +1027,223 @@ ()
   # Block subtraction
   #
   add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd";
+  specialize qw/vpx_highbd_subtract_block neon avx2/;
+
+  add_proto qw/int64_t/, "vpx_highbd_sse", "const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int height";
+  specialize qw/vpx_highbd_sse sse4_1 avx2 neon/;
 
   #
   # Single block SAD
   #
   add_proto qw/unsigned int vpx_highbd_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad64x64 sse2/;
+  specialize qw/vpx_highbd_sad64x64 sse2 neon avx2/;
 
   add_proto qw/unsigned int vpx_highbd_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad64x32 sse2/;
+  specialize qw/vpx_highbd_sad64x32 sse2 neon avx2/;
 
   add_proto qw/unsigned int vpx_highbd_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad32x64 sse2/;
+  specialize qw/vpx_highbd_sad32x64 sse2 neon avx2/;
 
   add_proto qw/unsigned int vpx_highbd_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad32x32 sse2/;
+  specialize qw/vpx_highbd_sad32x32 sse2 neon avx2/;
 
   add_proto qw/unsigned int vpx_highbd_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad32x16 sse2/;
+  specialize qw/vpx_highbd_sad32x16 sse2 neon avx2/;
 
   add_proto qw/unsigned int vpx_highbd_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad16x32 sse2/;
+  specialize qw/vpx_highbd_sad16x32 sse2 neon avx2/;
 
   add_proto qw/unsigned int vpx_highbd_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad16x16 sse2/;
+  specialize qw/vpx_highbd_sad16x16 sse2 neon avx2/;
 
   add_proto qw/unsigned int vpx_highbd_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad16x8 sse2/;
+  specialize qw/vpx_highbd_sad16x8 sse2 neon avx2/;
 
   add_proto qw/unsigned int vpx_highbd_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad8x16 sse2/;
+  specialize qw/vpx_highbd_sad8x16 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad8x8 sse2/;
+  specialize qw/vpx_highbd_sad8x8 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad8x4 sse2/;
+  specialize qw/vpx_highbd_sad8x4 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad4x8 neon/;
 
   add_proto qw/unsigned int vpx_highbd_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad4x4 neon/;
+
+  add_proto qw/unsigned int vpx_highbd_sad_skip_64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad_skip_64x64 neon sse2 avx2/;
+
+  add_proto qw/unsigned int vpx_highbd_sad_skip_64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad_skip_64x32 neon sse2 avx2/;
+
+  add_proto qw/unsigned int vpx_highbd_sad_skip_32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad_skip_32x64 neon sse2 avx2/;
+
+  add_proto qw/unsigned int vpx_highbd_sad_skip_32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad_skip_32x32 neon sse2 avx2/;
+
+  add_proto qw/unsigned int vpx_highbd_sad_skip_32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad_skip_32x16 neon sse2 avx2/;
+
+  add_proto qw/unsigned int vpx_highbd_sad_skip_16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad_skip_16x32 neon sse2 avx2/;
+
+  add_proto qw/unsigned int vpx_highbd_sad_skip_16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad_skip_16x16 neon sse2 avx2/;
+
+  add_proto qw/unsigned int vpx_highbd_sad_skip_16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad_skip_16x8 neon sse2 avx2/;
+
+  add_proto qw/unsigned int vpx_highbd_sad_skip_8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad_skip_8x16 neon sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_sad_skip_8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad_skip_8x8 neon sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_sad_skip_8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad_skip_8x4 neon/;
+
+  add_proto qw/unsigned int vpx_highbd_sad_skip_4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad_skip_4x8 neon/;
+
+  add_proto qw/unsigned int vpx_highbd_sad_skip_4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad_skip_4x4 neon/;
 
   #
   # Avg
   #
   add_proto qw/unsigned int vpx_highbd_avg_8x8/, "const uint8_t *s8, int p";
-  specialize qw/vpx_highbd_avg_8x8 sse2/;
+  specialize qw/vpx_highbd_avg_8x8 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *s8, int p";
-  specialize qw/vpx_highbd_avg_4x4 sse2/;
+  specialize qw/vpx_highbd_avg_4x4 sse2 neon/;
 
   add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max";
+  specialize qw/vpx_highbd_minmax_8x8 neon/;
 
   add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad64x64_avg sse2/;
+  specialize qw/vpx_highbd_sad64x64_avg sse2 neon avx2/;
 
   add_proto qw/unsigned int vpx_highbd_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad64x32_avg sse2/;
+  specialize qw/vpx_highbd_sad64x32_avg sse2 neon avx2/;
 
   add_proto qw/unsigned int vpx_highbd_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad32x64_avg sse2/;
+  specialize qw/vpx_highbd_sad32x64_avg sse2 neon avx2/;
 
   add_proto qw/unsigned int vpx_highbd_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad32x32_avg sse2/;
+  specialize qw/vpx_highbd_sad32x32_avg sse2 neon avx2/;
 
   add_proto qw/unsigned int vpx_highbd_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad32x16_avg sse2/;
+  specialize qw/vpx_highbd_sad32x16_avg sse2 neon avx2/;
 
   add_proto qw/unsigned int vpx_highbd_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad16x32_avg sse2/;
+  specialize qw/vpx_highbd_sad16x32_avg sse2 neon avx2/;
 
   add_proto qw/unsigned int vpx_highbd_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad16x16_avg sse2/;
+  specialize qw/vpx_highbd_sad16x16_avg sse2 neon avx2/;
 
   add_proto qw/unsigned int vpx_highbd_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad16x8_avg sse2/;
+  specialize qw/vpx_highbd_sad16x8_avg sse2 neon avx2/;
 
   add_proto qw/unsigned int vpx_highbd_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad8x16_avg sse2/;
+  specialize qw/vpx_highbd_sad8x16_avg sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad8x8_avg sse2/;
+  specialize qw/vpx_highbd_sad8x8_avg sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad8x4_avg sse2/;
+  specialize qw/vpx_highbd_sad8x4_avg sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_sad4x8_avg neon/;
 
   add_proto qw/unsigned int vpx_highbd_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_sad4x4_avg neon/;
 
   #
   # Multi-block SAD, comparing a reference to N independent blocks
   #
-  add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad64x64x4d sse2/;
+  add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad64x64x4d sse2 neon avx2/;
+
+  add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad64x32x4d sse2 neon avx2/;
+
+  add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad32x64x4d sse2 neon avx2/;
+
+  add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad32x32x4d sse2 neon avx2/;
+
+  add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad32x16x4d sse2 neon avx2/;
+
+  add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad16x32x4d sse2 neon avx2/;
+
+  add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad16x16x4d sse2 neon avx2/;
+
+  add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad16x8x4d sse2 neon avx2/;
+
+  add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad8x16x4d sse2 neon/;
+
+  add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad8x8x4d sse2 neon/;
+
+  add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad8x4x4d sse2 neon/;
+
+  add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad4x8x4d sse2 neon/;
+
+  add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad4x4x4d sse2 neon/;
+
+  add_proto qw/void vpx_highbd_sad_skip_64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad_skip_64x64x4d neon sse2 avx2/;
 
-  add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad64x32x4d sse2/;
+  add_proto qw/void vpx_highbd_sad_skip_64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad_skip_64x32x4d neon sse2 avx2/;
 
-  add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad32x64x4d sse2/;
+  add_proto qw/void vpx_highbd_sad_skip_32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad_skip_32x64x4d neon sse2 avx2/;
 
-  add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad32x32x4d sse2/;
+  add_proto qw/void vpx_highbd_sad_skip_32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad_skip_32x32x4d neon sse2 avx2/;
 
-  add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad32x16x4d sse2/;
+  add_proto qw/void vpx_highbd_sad_skip_32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad_skip_32x16x4d neon sse2 avx2/;
 
-  add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad16x32x4d sse2/;
+  add_proto qw/void vpx_highbd_sad_skip_16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad_skip_16x32x4d neon sse2 avx2/;
 
-  add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad16x16x4d sse2/;
+  add_proto qw/void vpx_highbd_sad_skip_16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad_skip_16x16x4d neon sse2 avx2/;
 
-  add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad16x8x4d sse2/;
+  add_proto qw/void vpx_highbd_sad_skip_16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad_skip_16x8x4d neon sse2 avx2/;
 
-  add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad8x16x4d sse2/;
+  add_proto qw/void vpx_highbd_sad_skip_8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad_skip_8x16x4d neon sse2/;
 
-  add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad8x8x4d sse2/;
+  add_proto qw/void vpx_highbd_sad_skip_8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad_skip_8x8x4d neon sse2/;
 
-  add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad8x4x4d sse2/;
+  add_proto qw/void vpx_highbd_sad_skip_8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad_skip_8x4x4d neon/;
 
-  add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad4x8x4d sse2/;
+  add_proto qw/void vpx_highbd_sad_skip_4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad_skip_4x8x4d neon sse2/;
 
-  add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad4x4x4d sse2/;
+  add_proto qw/void vpx_highbd_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad_skip_4x4x4d neon/;
 
   #
   # Structured Similarity (SSIM)
@@ -1118,73 +1260,73 @@ ()
 # Variance
 #
 add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance64x64 sse2 avx2 neon msa mmi vsx/;
+  specialize qw/vpx_variance64x64 sse2 avx2 neon neon_dotprod msa mmi vsx lsx/;
 
 add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance64x32 sse2 avx2 neon msa mmi vsx/;
+  specialize qw/vpx_variance64x32 sse2 avx2 neon neon_dotprod msa mmi vsx/;
 
 add_proto qw/unsigned int vpx_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance32x64 sse2 avx2 neon msa mmi vsx/;
+  specialize qw/vpx_variance32x64 sse2 avx2 neon neon_dotprod msa mmi vsx/;
 
 add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance32x32 sse2 avx2 neon msa mmi vsx/;
+  specialize qw/vpx_variance32x32 sse2 avx2 neon neon_dotprod msa mmi vsx lsx/;
 
 add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance32x16 sse2 avx2 neon msa mmi vsx/;
+  specialize qw/vpx_variance32x16 sse2 avx2 neon neon_dotprod msa mmi vsx/;
 
 add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance16x32 sse2 avx2 neon msa mmi vsx/;
+  specialize qw/vpx_variance16x32 sse2 avx2 neon neon_dotprod msa mmi vsx/;
 
 add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance16x16 sse2 avx2 neon msa mmi vsx/;
+  specialize qw/vpx_variance16x16 sse2 avx2 neon neon_dotprod msa mmi vsx lsx/;
 
 add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance16x8 sse2 avx2 neon msa mmi vsx/;
+  specialize qw/vpx_variance16x8 sse2 avx2 neon neon_dotprod msa mmi vsx/;
 
 add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance8x16 sse2 neon msa mmi vsx/;
+  specialize qw/vpx_variance8x16 sse2 avx2 neon neon_dotprod msa mmi vsx/;
 
 add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance8x8 sse2 neon msa mmi vsx/;
+  specialize qw/vpx_variance8x8 sse2 avx2 neon neon_dotprod msa mmi vsx lsx/;
 
 add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance8x4 sse2 neon msa mmi vsx/;
+  specialize qw/vpx_variance8x4 sse2 avx2 neon neon_dotprod msa mmi vsx/;
 
 add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance4x8 sse2 neon msa mmi vsx/;
+  specialize qw/vpx_variance4x8 sse2 neon neon_dotprod msa mmi vsx/;
 
 add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance4x4 sse2 neon msa mmi vsx/;
+  specialize qw/vpx_variance4x4 sse2 neon neon_dotprod msa mmi vsx/;
 
 #
 # Specialty Variance
 #
 add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vpx_get16x16var sse2 avx2 neon msa vsx/;
+  specialize qw/vpx_get16x16var sse2 avx2 neon neon_dotprod msa vsx lsx/;
 
 add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vpx_get8x8var sse2 neon msa vsx/;
+  specialize qw/vpx_get8x8var sse2 neon neon_dotprod msa vsx/;
 
 add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_mse16x16 sse2 avx2 neon msa mmi vsx/;
+  specialize qw/vpx_mse16x16 sse2 avx2 neon neon_dotprod msa mmi vsx lsx/;
 
 add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_mse16x8 sse2 avx2 msa mmi vsx/;
+  specialize qw/vpx_mse16x8 sse2 avx2 neon neon_dotprod msa mmi vsx/;
 
 add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_mse8x16 sse2 msa mmi vsx/;
+  specialize qw/vpx_mse8x16 sse2 neon neon_dotprod msa mmi vsx/;
 
 add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_mse8x8 sse2 msa mmi vsx/;
+  specialize qw/vpx_mse8x8 sse2 neon neon_dotprod msa mmi vsx/;
 
 add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *";
   specialize qw/vpx_get_mb_ss sse2 msa vsx/;
 
 add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride";
-  specialize qw/vpx_get4x4sse_cs neon msa vsx/;
+  specialize qw/vpx_get4x4sse_cs neon neon_dotprod msa vsx/;
 
 add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
-  specialize qw/vpx_comp_avg_pred neon sse2 vsx/;
+  specialize qw/vpx_comp_avg_pred neon sse2 avx2 vsx lsx/;
 
 #
 # Subpixel Variance
@@ -1199,7 +1341,7 @@ ()
   specialize qw/vpx_sub_pixel_variance32x64 neon msa mmi sse2 ssse3/;
 
 add_proto qw/uint32_t vpx_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa mmi sse2 ssse3/;
+  specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa mmi sse2 ssse3 lsx/;
 
 add_proto qw/uint32_t vpx_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
   specialize qw/vpx_sub_pixel_variance32x16 neon msa mmi sse2 ssse3/;
@@ -1208,7 +1350,7 @@ ()
   specialize qw/vpx_sub_pixel_variance16x32 neon msa mmi sse2 ssse3/;
 
 add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance16x16 neon msa mmi sse2 ssse3/;
+  specialize qw/vpx_sub_pixel_variance16x16 neon msa mmi sse2 ssse3 lsx/;
 
 add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
   specialize qw/vpx_sub_pixel_variance16x8 neon msa mmi sse2 ssse3/;
@@ -1217,7 +1359,7 @@ ()
   specialize qw/vpx_sub_pixel_variance8x16 neon msa mmi sse2 ssse3/;
 
 add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance8x8 neon msa mmi sse2 ssse3/;
+  specialize qw/vpx_sub_pixel_variance8x8 neon msa mmi sse2 ssse3 lsx/;
 
 add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
   specialize qw/vpx_sub_pixel_variance8x4 neon msa mmi sse2 ssse3/;
@@ -1229,7 +1371,7 @@ ()
   specialize qw/vpx_sub_pixel_variance4x4 neon msa mmi sse2 ssse3/;
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance64x64 neon avx2 msa mmi sse2 ssse3/;
+  specialize qw/vpx_sub_pixel_avg_variance64x64 neon avx2 msa mmi sse2 ssse3 lsx/;
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
   specialize qw/vpx_sub_pixel_avg_variance64x32 neon msa mmi sse2 ssse3/;
@@ -1269,369 +1411,397 @@ ()
 
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance64x64 sse2/;
+  specialize qw/vpx_highbd_12_variance64x64 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance64x32 sse2/;
+  specialize qw/vpx_highbd_12_variance64x32 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance32x64 sse2/;
+  specialize qw/vpx_highbd_12_variance32x64 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance32x32 sse2/;
+  specialize qw/vpx_highbd_12_variance32x32 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance32x16 sse2/;
+  specialize qw/vpx_highbd_12_variance32x16 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance16x32 sse2/;
+  specialize qw/vpx_highbd_12_variance16x32 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance16x16 sse2/;
+  specialize qw/vpx_highbd_12_variance16x16 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance16x8 sse2/;
+  specialize qw/vpx_highbd_12_variance16x8 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance8x16 sse2/;
+  specialize qw/vpx_highbd_12_variance8x16 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance8x8 sse2/;
+  specialize qw/vpx_highbd_12_variance8x8 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance8x4 neon/;
   add_proto qw/unsigned int vpx_highbd_12_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance4x8 neon/;
   add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance4x4 neon/;
 
   add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance64x64 sse2/;
+  specialize qw/vpx_highbd_10_variance64x64 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance64x32 sse2/;
+  specialize qw/vpx_highbd_10_variance64x32 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance32x64 sse2/;
+  specialize qw/vpx_highbd_10_variance32x64 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance32x32 sse2/;
+  specialize qw/vpx_highbd_10_variance32x32 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance32x16 sse2/;
+  specialize qw/vpx_highbd_10_variance32x16 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance16x32 sse2/;
+  specialize qw/vpx_highbd_10_variance16x32 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance16x16 sse2/;
+  specialize qw/vpx_highbd_10_variance16x16 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance16x8 sse2/;
+  specialize qw/vpx_highbd_10_variance16x8 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance8x16 sse2/;
+  specialize qw/vpx_highbd_10_variance8x16 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance8x8 sse2/;
+  specialize qw/vpx_highbd_10_variance8x8 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance8x4 neon/;
   add_proto qw/unsigned int vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance4x8 neon/;
   add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance4x4 neon/;
 
   add_proto qw/unsigned int vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance64x64 sse2/;
+  specialize qw/vpx_highbd_8_variance64x64 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance64x32 sse2/;
+  specialize qw/vpx_highbd_8_variance64x32 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance32x64 sse2/;
+  specialize qw/vpx_highbd_8_variance32x64 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance32x32 sse2/;
+  specialize qw/vpx_highbd_8_variance32x32 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance32x16 sse2/;
+  specialize qw/vpx_highbd_8_variance32x16 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance16x32 sse2/;
+  specialize qw/vpx_highbd_8_variance16x32 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance16x16 sse2/;
+  specialize qw/vpx_highbd_8_variance16x16 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance16x8 sse2/;
+  specialize qw/vpx_highbd_8_variance16x8 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance8x16 sse2/;
+  specialize qw/vpx_highbd_8_variance8x16 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance8x8 sse2/;
+  specialize qw/vpx_highbd_8_variance8x8 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance8x4 neon/;
   add_proto qw/unsigned int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance4x8 neon/;
   add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance4x4 neon/;
 
   add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vpx_highbd_8_get16x16var sse2/;
+  specialize qw/vpx_highbd_8_get16x16var sse2 neon/;
 
   add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vpx_highbd_8_get8x8var sse2/;
+  specialize qw/vpx_highbd_8_get8x8var sse2 neon/;
 
   add_proto qw/void vpx_highbd_10_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vpx_highbd_10_get16x16var sse2/;
+  specialize qw/vpx_highbd_10_get16x16var sse2 neon/;
 
   add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vpx_highbd_10_get8x8var sse2/;
+  specialize qw/vpx_highbd_10_get8x8var sse2 neon/;
 
   add_proto qw/void vpx_highbd_12_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vpx_highbd_12_get16x16var sse2/;
+  specialize qw/vpx_highbd_12_get16x16var sse2 neon/;
 
   add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vpx_highbd_12_get8x8var sse2/;
+  specialize qw/vpx_highbd_12_get8x8var sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_mse16x16 sse2/;
+  specialize qw/vpx_highbd_8_mse16x16 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_mse16x8 neon/;
   add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_mse8x16 neon/;
   add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_mse8x8 sse2/;
+  specialize qw/vpx_highbd_8_mse8x8 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_mse16x16 sse2/;
+  specialize qw/vpx_highbd_10_mse16x16 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_mse16x8 neon/;
   add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_mse8x16 neon/;
   add_proto qw/unsigned int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_mse8x8 sse2/;
+  specialize qw/vpx_highbd_10_mse8x8 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_mse16x16 sse2/;
+  specialize qw/vpx_highbd_12_mse16x16 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_mse16x8 neon/;
   add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_mse8x16 neon/;
   add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_mse8x8 sse2/;
+  specialize qw/vpx_highbd_12_mse8x8 sse2 neon/;
 
   add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride";
+  specialize qw/vpx_highbd_comp_avg_pred neon sse2/;
 
   #
   # Subpixel Variance
   #
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance64x64 sse2/;
+  specialize qw/vpx_highbd_12_sub_pixel_variance64x64 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance64x32 sse2/;
+  specialize qw/vpx_highbd_12_sub_pixel_variance64x32 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance32x64 sse2/;
+  specialize qw/vpx_highbd_12_sub_pixel_variance32x64 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance32x32 sse2/;
+  specialize qw/vpx_highbd_12_sub_pixel_variance32x32 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance32x16 sse2/;
+  specialize qw/vpx_highbd_12_sub_pixel_variance32x16 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance16x32 sse2/;
+  specialize qw/vpx_highbd_12_sub_pixel_variance16x32 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance16x16 sse2/;
+  specialize qw/vpx_highbd_12_sub_pixel_variance16x16 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance16x8 sse2/;
+  specialize qw/vpx_highbd_12_sub_pixel_variance16x8 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance8x16 sse2/;
+  specialize qw/vpx_highbd_12_sub_pixel_variance8x16 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance8x8 sse2/;
+  specialize qw/vpx_highbd_12_sub_pixel_variance8x8 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance8x4 sse2/;
+  specialize qw/vpx_highbd_12_sub_pixel_variance8x4 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance4x8 neon/;
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance4x4 neon/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance64x64 sse2/;
+  specialize qw/vpx_highbd_10_sub_pixel_variance64x64 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance64x32 sse2/;
+  specialize qw/vpx_highbd_10_sub_pixel_variance64x32 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance32x64 sse2/;
+  specialize qw/vpx_highbd_10_sub_pixel_variance32x64 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance32x32 sse2/;
+  specialize qw/vpx_highbd_10_sub_pixel_variance32x32 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance32x16 sse2/;
+  specialize qw/vpx_highbd_10_sub_pixel_variance32x16 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance16x32 sse2/;
+  specialize qw/vpx_highbd_10_sub_pixel_variance16x32 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance16x16 sse2/;
+  specialize qw/vpx_highbd_10_sub_pixel_variance16x16 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance16x8 sse2/;
+  specialize qw/vpx_highbd_10_sub_pixel_variance16x8 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance8x16 sse2/;
+  specialize qw/vpx_highbd_10_sub_pixel_variance8x16 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance8x8 sse2/;
+  specialize qw/vpx_highbd_10_sub_pixel_variance8x8 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance8x4 sse2/;
+  specialize qw/vpx_highbd_10_sub_pixel_variance8x4 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance4x8 neon/;
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance4x4 neon/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance64x64 sse2/;
+  specialize qw/vpx_highbd_8_sub_pixel_variance64x64 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance64x32 sse2/;
+  specialize qw/vpx_highbd_8_sub_pixel_variance64x32 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance32x64 sse2/;
+  specialize qw/vpx_highbd_8_sub_pixel_variance32x64 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance32x32 sse2/;
+  specialize qw/vpx_highbd_8_sub_pixel_variance32x32 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance32x16 sse2/;
+  specialize qw/vpx_highbd_8_sub_pixel_variance32x16 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance16x32 sse2/;
+  specialize qw/vpx_highbd_8_sub_pixel_variance16x32 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance16x16 sse2/;
+  specialize qw/vpx_highbd_8_sub_pixel_variance16x16 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance16x8 sse2/;
+  specialize qw/vpx_highbd_8_sub_pixel_variance16x8 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance8x16 sse2/;
+  specialize qw/vpx_highbd_8_sub_pixel_variance8x16 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance8x8 sse2/;
+  specialize qw/vpx_highbd_8_sub_pixel_variance8x8 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance8x4 sse2/;
+  specialize qw/vpx_highbd_8_sub_pixel_variance8x4 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance4x8 neon/;
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance4x4 neon/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64 sse2/;
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x32 sse2/;
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x32 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x64 sse2/;
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x64 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x32 sse2/;
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x32 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x16 sse2/;
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x16 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x32 sse2/;
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x32 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x16 sse2/;
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x16 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x8 sse2/;
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x8 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x16 sse2/;
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x16 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x8 sse2/;
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x8 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4 sse2/;
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance4x8 neon/;
   add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance4x4 neon/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64 sse2/;
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x32 sse2/;
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x32 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x64 sse2/;
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x64 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x32 sse2/;
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x32 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x16 sse2/;
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x16 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x32 sse2/;
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x32 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x16 sse2/;
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x16 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x8 sse2/;
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x8 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x16 sse2/;
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x16 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x8 sse2/;
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x8 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4 sse2/;
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance4x8 neon/;
   add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance4x4 neon/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64 sse2/;
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x32 sse2/;
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x32 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x64 sse2/;
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x64 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x32 sse2/;
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x32 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x16 sse2/;
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x16 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x32 sse2/;
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x32 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x16 sse2/;
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x16 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x8 sse2/;
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x8 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x16 sse2/;
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x16 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x8 sse2/;
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x8 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4 sse2/;
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4 sse2 neon/;
 
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance4x8 neon/;
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance4x4 neon/;
 
 }  # CONFIG_VP9_HIGHBITDEPTH
 
diff --git a/vpx_dsp/vpx_filter.h b/vpx_dsp/vpx_filter.h
index 54357ee6ca..0cddcb6991 100644
--- a/vpx_dsp/vpx_filter.h
+++ b/vpx_dsp/vpx_filter.h
@@ -29,10 +29,16 @@ typedef int16_t InterpKernel[SUBPEL_TAPS];
 
 static INLINE int vpx_get_filter_taps(const int16_t *const filter) {
   assert(filter[3] != 128);
-  if (!filter[0] && !filter[1] && !filter[2])
-    return 2;
-  else
+  if (filter[0] | filter[7]) {
     return 8;
+  }
+  if (filter[1] | filter[6]) {
+    return 6;
+  }
+  if (filter[2] | filter[5]) {
+    return 4;
+  }
+  return 2;
 }
 
 #ifdef __cplusplus
diff --git a/vpx_dsp/x86/avg_intrin_avx2.c b/vpx_dsp/x86/avg_intrin_avx2.c
index 3f4f577a21..61e4e73c5b 100644
--- a/vpx_dsp/x86/avg_intrin_avx2.c
+++ b/vpx_dsp/x86/avg_intrin_avx2.c
@@ -104,7 +104,7 @@ void vpx_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
   src16[4] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
   src16[5] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
   src16[6] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
-  src16[7] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+  src16[7] = _mm_loadu_si128((const __m128i *)(src_diff + src_stride));
 
   src32[0] = _mm256_cvtepi16_epi32(src16[0]);
   src32[1] = _mm256_cvtepi16_epi32(src16[1]);
@@ -218,6 +218,14 @@ void vpx_highbd_hadamard_32x32_avx2(const int16_t *src_diff,
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
+static INLINE void sign_extend_16bit_to_32bit_avx2(__m256i in, __m256i zero,
+                                                   __m256i *out_lo,
+                                                   __m256i *out_hi) {
+  const __m256i sign_bits = _mm256_cmpgt_epi16(zero, in);
+  *out_lo = _mm256_unpacklo_epi16(in, sign_bits);
+  *out_hi = _mm256_unpackhi_epi16(in, sign_bits);
+}
+
 static void hadamard_col8x2_avx2(__m256i *in, int iter) {
   __m256i a0 = in[0];
   __m256i a1 = in[1];
@@ -304,7 +312,7 @@ static void hadamard_8x8x2_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
   src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
   src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
   src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
-  src[7] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+  src[7] = _mm256_loadu_si256((const __m256i *)(src_diff + src_stride));
 
   hadamard_col8x2_avx2(src, 0);
   hadamard_col8x2_avx2(src, 1);
@@ -400,6 +408,12 @@ void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
   int16_t *t_coeff = coeff;
 #endif
   int idx;
+  __m256i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo,
+      b3_lo;
+  __m256i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi,
+      b3_hi;
+  __m256i b0, b1, b2, b3;
+  const __m256i zero = _mm256_setzero_si256();
   for (idx = 0; idx < 4; ++idx) {
     // src_diff: 9 bit, dynamic range [-255, 255]
     const int16_t *src_ptr =
@@ -414,15 +428,38 @@ void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
     const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
     const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));
 
-    __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
-    __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
-    __m256i b2 = _mm256_add_epi16(coeff2, coeff3);
-    __m256i b3 = _mm256_sub_epi16(coeff2, coeff3);
+    // Sign extend 16 bit to 32 bit.
+    sign_extend_16bit_to_32bit_avx2(coeff0, zero, &coeff0_lo, &coeff0_hi);
+    sign_extend_16bit_to_32bit_avx2(coeff1, zero, &coeff1_lo, &coeff1_hi);
+    sign_extend_16bit_to_32bit_avx2(coeff2, zero, &coeff2_lo, &coeff2_hi);
+    sign_extend_16bit_to_32bit_avx2(coeff3, zero, &coeff3_lo, &coeff3_hi);
+
+    b0_lo = _mm256_add_epi32(coeff0_lo, coeff1_lo);
+    b0_hi = _mm256_add_epi32(coeff0_hi, coeff1_hi);
+
+    b1_lo = _mm256_sub_epi32(coeff0_lo, coeff1_lo);
+    b1_hi = _mm256_sub_epi32(coeff0_hi, coeff1_hi);
+
+    b2_lo = _mm256_add_epi32(coeff2_lo, coeff3_lo);
+    b2_hi = _mm256_add_epi32(coeff2_hi, coeff3_hi);
+
+    b3_lo = _mm256_sub_epi32(coeff2_lo, coeff3_lo);
+    b3_hi = _mm256_sub_epi32(coeff2_hi, coeff3_hi);
+
+    b0_lo = _mm256_srai_epi32(b0_lo, 2);
+    b1_lo = _mm256_srai_epi32(b1_lo, 2);
+    b2_lo = _mm256_srai_epi32(b2_lo, 2);
+    b3_lo = _mm256_srai_epi32(b3_lo, 2);
+
+    b0_hi = _mm256_srai_epi32(b0_hi, 2);
+    b1_hi = _mm256_srai_epi32(b1_hi, 2);
+    b2_hi = _mm256_srai_epi32(b2_hi, 2);
+    b3_hi = _mm256_srai_epi32(b3_hi, 2);
 
-    b0 = _mm256_srai_epi16(b0, 2);
-    b1 = _mm256_srai_epi16(b1, 2);
-    b2 = _mm256_srai_epi16(b2, 2);
-    b3 = _mm256_srai_epi16(b3, 2);
+    b0 = _mm256_packs_epi32(b0_lo, b0_hi);
+    b1 = _mm256_packs_epi32(b1_lo, b1_hi);
+    b2 = _mm256_packs_epi32(b2_lo, b2_hi);
+    b3 = _mm256_packs_epi32(b3_lo, b3_hi);
 
     store_tran_low(_mm256_add_epi16(b0, b2), coeff);
     store_tran_low(_mm256_add_epi16(b1, b3), coeff + 256);
diff --git a/vpx_dsp/x86/avg_intrin_sse2.c b/vpx_dsp/x86/avg_intrin_sse2.c
index 3cba258f61..4447dfab7c 100644
--- a/vpx_dsp/x86/avg_intrin_sse2.c
+++ b/vpx_dsp/x86/avg_intrin_sse2.c
@@ -15,6 +15,14 @@
 #include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
 #include "vpx_ports/mem.h"
 
+static INLINE void sign_extend_16bit_to_32bit_sse2(__m128i in, __m128i zero,
+                                                   __m128i *out_lo,
+                                                   __m128i *out_hi) {
+  const __m128i sign_bits = _mm_cmplt_epi16(in, zero);
+  *out_lo = _mm_unpacklo_epi16(in, sign_bits);
+  *out_hi = _mm_unpackhi_epi16(in, sign_bits);
+}
+
 void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
                          int *min, int *max) {
   __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
@@ -164,7 +172,7 @@ unsigned int vpx_highbd_avg_8x8_sse2(const uint8_t *s8, int p) {
   s0 = _mm_add_epi32(s0, s1);
   s0 = _mm_add_epi32(s0, _mm_srli_si128(s0, 8));
   s0 = _mm_add_epi32(s0, _mm_srli_si128(s0, 4));
-  avg = _mm_cvtsi128_si32(s0);
+  avg = (unsigned int)_mm_cvtsi128_si32(s0);
 
   return (avg + 32) >> 6;
 }
@@ -275,7 +283,7 @@ static INLINE void hadamard_8x8_sse2(const int16_t *src_diff,
   src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
   src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
   src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+  src[7] = _mm_load_si128((const __m128i *)(src_diff + src_stride));
 
   hadamard_col8_sse2(src, 0);
   hadamard_col8_sse2(src, 1);
@@ -400,6 +408,12 @@ void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
   int16_t *t_coeff = coeff;
 #endif
   int idx;
+  __m128i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo,
+      b3_lo;
+  __m128i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi,
+      b3_hi;
+  __m128i b0, b1, b2, b3;
+  const __m128i zero = _mm_setzero_si128();
   for (idx = 0; idx < 4; ++idx) {
     const int16_t *src_ptr =
         src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
@@ -413,15 +427,38 @@ void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
     __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512));
     __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768));
 
-    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
-    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
-    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
-    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);
+    // Sign extend 16 bit to 32 bit.
+    sign_extend_16bit_to_32bit_sse2(coeff0, zero, &coeff0_lo, &coeff0_hi);
+    sign_extend_16bit_to_32bit_sse2(coeff1, zero, &coeff1_lo, &coeff1_hi);
+    sign_extend_16bit_to_32bit_sse2(coeff2, zero, &coeff2_lo, &coeff2_hi);
+    sign_extend_16bit_to_32bit_sse2(coeff3, zero, &coeff3_lo, &coeff3_hi);
+
+    b0_lo = _mm_add_epi32(coeff0_lo, coeff1_lo);
+    b0_hi = _mm_add_epi32(coeff0_hi, coeff1_hi);
+
+    b1_lo = _mm_sub_epi32(coeff0_lo, coeff1_lo);
+    b1_hi = _mm_sub_epi32(coeff0_hi, coeff1_hi);
+
+    b2_lo = _mm_add_epi32(coeff2_lo, coeff3_lo);
+    b2_hi = _mm_add_epi32(coeff2_hi, coeff3_hi);
+
+    b3_lo = _mm_sub_epi32(coeff2_lo, coeff3_lo);
+    b3_hi = _mm_sub_epi32(coeff2_hi, coeff3_hi);
+
+    b0_lo = _mm_srai_epi32(b0_lo, 2);
+    b1_lo = _mm_srai_epi32(b1_lo, 2);
+    b2_lo = _mm_srai_epi32(b2_lo, 2);
+    b3_lo = _mm_srai_epi32(b3_lo, 2);
+
+    b0_hi = _mm_srai_epi32(b0_hi, 2);
+    b1_hi = _mm_srai_epi32(b1_hi, 2);
+    b2_hi = _mm_srai_epi32(b2_hi, 2);
+    b3_hi = _mm_srai_epi32(b3_hi, 2);
 
-    b0 = _mm_srai_epi16(b0, 2);
-    b1 = _mm_srai_epi16(b1, 2);
-    b2 = _mm_srai_epi16(b2, 2);
-    b3 = _mm_srai_epi16(b3, 2);
+    b0 = _mm_packs_epi32(b0_lo, b0_hi);
+    b1 = _mm_packs_epi32(b1_lo, b1_hi);
+    b2 = _mm_packs_epi32(b2_lo, b2_hi);
+    b3 = _mm_packs_epi32(b3_lo, b3_hi);
 
     coeff0 = _mm_add_epi16(b0, b2);
     coeff1 = _mm_add_epi16(b1, b3);
@@ -464,7 +501,7 @@ int vpx_satd_sse2(const tran_low_t *coeff, int length) {
   return _mm_cvtsi128_si32(accum);
 }
 
-void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref,
+void vpx_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref,
                           const int ref_stride, const int height) {
   int idx;
   __m128i zero = _mm_setzero_si128();
diff --git a/vpx_dsp/x86/avg_pred_avx2.c b/vpx_dsp/x86/avg_pred_avx2.c
new file mode 100644
index 0000000000..f4357998c9
--- /dev/null
+++ b/vpx_dsp/x86/avg_pred_avx2.c
@@ -0,0 +1,111 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+
+void vpx_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width,
+                            int height, const uint8_t *ref, int ref_stride) {
+  int row = 0;
+  // comp_pred and pred must be 32 byte aligned.
+  assert(((intptr_t)comp_pred % 32) == 0);
+  assert(((intptr_t)pred % 32) == 0);
+
+  if (width == 8) {
+    assert(height % 4 == 0);
+    do {
+      const __m256i p = _mm256_load_si256((const __m256i *)pred);
+      const __m128i r_0 = _mm_loadl_epi64((const __m128i *)ref);
+      const __m128i r_1 =
+          _mm_loadl_epi64((const __m128i *)(ref + 2 * ref_stride));
+
+      const __m128i r1 = _mm_castps_si128(_mm_loadh_pi(
+          _mm_castsi128_ps(r_0), (const __m64 *)(ref + ref_stride)));
+      const __m128i r2 = _mm_castps_si128(_mm_loadh_pi(
+          _mm_castsi128_ps(r_1), (const __m64 *)(ref + 3 * ref_stride)));
+
+      const __m256i ref_0123 =
+          _mm256_inserti128_si256(_mm256_castsi128_si256(r1), r2, 1);
+      const __m256i avg = _mm256_avg_epu8(p, ref_0123);
+
+      _mm256_store_si256((__m256i *)comp_pred, avg);
+
+      row += 4;
+      pred += 32;
+      comp_pred += 32;
+      ref += 4 * ref_stride;
+    } while (row < height);
+  } else if (width == 16) {
+    assert(height % 4 == 0);
+    do {
+      const __m256i pred_0 = _mm256_load_si256((const __m256i *)pred);
+      const __m256i pred_1 = _mm256_load_si256((const __m256i *)(pred + 32));
+      const __m256i tmp0 =
+          _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)ref));
+      const __m256i ref_0 = _mm256_inserti128_si256(
+          tmp0, _mm_loadu_si128((const __m128i *)(ref + ref_stride)), 1);
+      const __m256i tmp1 = _mm256_castsi128_si256(
+          _mm_loadu_si128((const __m128i *)(ref + 2 * ref_stride)));
+      const __m256i ref_1 = _mm256_inserti128_si256(
+          tmp1, _mm_loadu_si128((const __m128i *)(ref + 3 * ref_stride)), 1);
+      const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0);
+      const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1);
+      _mm256_store_si256((__m256i *)comp_pred, average_0);
+      _mm256_store_si256((__m256i *)(comp_pred + 32), average_1);
+
+      row += 4;
+      pred += 64;
+      comp_pred += 64;
+      ref += 4 * ref_stride;
+    } while (row < height);
+  } else if (width == 32) {
+    assert(height % 2 == 0);
+    do {
+      const __m256i pred_0 = _mm256_load_si256((const __m256i *)pred);
+      const __m256i pred_1 = _mm256_load_si256((const __m256i *)(pred + 32));
+      const __m256i ref_0 = _mm256_loadu_si256((const __m256i *)ref);
+      const __m256i ref_1 =
+          _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
+      const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0);
+      const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1);
+      _mm256_store_si256((__m256i *)comp_pred, average_0);
+      _mm256_store_si256((__m256i *)(comp_pred + 32), average_1);
+
+      row += 2;
+      pred += 64;
+      comp_pred += 64;
+      ref += 2 * ref_stride;
+    } while (row < height);
+  } else if (width % 64 == 0) {
+    do {
+      int x;
+      for (x = 0; x < width; x += 64) {
+        const __m256i pred_0 = _mm256_load_si256((const __m256i *)(pred + x));
+        const __m256i pred_1 =
+            _mm256_load_si256((const __m256i *)(pred + x + 32));
+        const __m256i ref_0 = _mm256_loadu_si256((const __m256i *)(ref + x));
+        const __m256i ref_1 =
+            _mm256_loadu_si256((const __m256i *)(ref + x + 32));
+        const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0);
+        const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1);
+        _mm256_store_si256((__m256i *)(comp_pred + x), average_0);
+        _mm256_store_si256((__m256i *)(comp_pred + x + 32), average_1);
+      }
+      row++;
+      pred += width;
+      comp_pred += width;
+      ref += ref_stride;
+    } while (row < height);
+  } else {
+    vpx_comp_avg_pred_sse2(comp_pred, pred, width, height, ref, ref_stride);
+  }
+}
diff --git a/vpx_dsp/x86/avg_pred_sse2.c b/vpx_dsp/x86/avg_pred_sse2.c
index e4e1e0e7a2..c6e70f744e 100644
--- a/vpx_dsp/x86/avg_pred_sse2.c
+++ b/vpx_dsp/x86/avg_pred_sse2.c
@@ -46,9 +46,9 @@ void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width,
         r = _mm_loadu_si128((const __m128i *)ref);
         ref += 16;
       } else if (width == 4) {
-        r = _mm_set_epi32(loadu_uint32(ref + 3 * ref_stride),
-                          loadu_uint32(ref + 2 * ref_stride),
-                          loadu_uint32(ref + ref_stride), loadu_uint32(ref));
+        r = _mm_set_epi32(loadu_int32(ref + 3 * ref_stride),
+                          loadu_int32(ref + 2 * ref_stride),
+                          loadu_int32(ref + ref_stride), loadu_int32(ref));
 
         ref += 4 * ref_stride;
       } else {
diff --git a/vpx_dsp/x86/convolve_avx2.h b/vpx_dsp/x86/convolve_avx2.h
index 99bc9637fc..ebee964b18 100644
--- a/vpx_dsp/x86/convolve_avx2.h
+++ b/vpx_dsp/x86/convolve_avx2.h
@@ -129,9 +129,8 @@ static INLINE void mm256_storeu2_epi64(__m128i *const dst_ptr_1,
 static INLINE void mm256_storeu2_epi32(__m128i *const dst_ptr_1,
                                        __m128i *const dst_ptr_2,
                                        const __m256i *const src) {
-  *((uint32_t *)(dst_ptr_1)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*src));
-  *((uint32_t *)(dst_ptr_2)) =
-      _mm_cvtsi128_si32(_mm256_extractf128_si256(*src, 1));
+  *((int *)(dst_ptr_1)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*src));
+  *((int *)(dst_ptr_2)) = _mm_cvtsi128_si32(_mm256_extractf128_si256(*src, 1));
 }
 
 static INLINE __m256i mm256_round_epi32(const __m256i *const src,
diff --git a/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h b/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
index 3f158b5e4e..f3a8020292 100644
--- a/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
+++ b/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
@@ -89,7 +89,7 @@ void FDCT32x32_2D_AVX2(const int16_t *input, int16_t *output_org, int stride) {
   const __m256i k__cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64);
   const __m256i k__cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64);
   const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING);
-  const __m256i kZero = _mm256_set1_epi16(0);
+  const __m256i kZero = _mm256_setzero_si256();
   const __m256i kOne = _mm256_set1_epi16(1);
   // Do the two transform/transpose passes
   int pass;
diff --git a/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h b/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h
index ac1246faa5..bf350b6da0 100644
--- a/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h
+++ b/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h
@@ -100,7 +100,7 @@ void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) {
   const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64);
   const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64);
   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i kZero = _mm_set1_epi16(0);
+  const __m128i kZero = _mm_setzero_si128();
   const __m128i kOne = _mm_set1_epi16(1);
 
   // Do the two transform/transpose passes
diff --git a/vpx_dsp/x86/fwd_txfm_avx2.c b/vpx_dsp/x86/fwd_txfm_avx2.c
index a2ed420e37..c8f54a49cb 100644
--- a/vpx_dsp/x86/fwd_txfm_avx2.c
+++ b/vpx_dsp/x86/fwd_txfm_avx2.c
@@ -8,9 +8,382 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <immintrin.h>  // AVX2
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 
+#include "vpx_dsp/txfm_common.h"
+#define ADD256_EPI16 _mm256_add_epi16
+#define SUB256_EPI16 _mm256_sub_epi16
+
+static INLINE void load_buffer_16bit_to_16bit_avx2(const int16_t *in,
+                                                   int stride, __m256i *out,
+                                                   int out_size, int pass) {
+  int i;
+  const __m256i kOne = _mm256_set1_epi16(1);
+  if (pass == 0) {
+    for (i = 0; i < out_size; i++) {
+      out[i] = _mm256_loadu_si256((const __m256i *)(in + i * stride));
+      // x = x << 2
+      out[i] = _mm256_slli_epi16(out[i], 2);
+    }
+  } else {
+    for (i = 0; i < out_size; i++) {
+      out[i] = _mm256_loadu_si256((const __m256i *)(in + i * 16));
+      // x = (x + 1) >> 2
+      out[i] = _mm256_add_epi16(out[i], kOne);
+      out[i] = _mm256_srai_epi16(out[i], 2);
+    }
+  }
+}
+
+static INLINE void transpose2_8x8_avx2(const __m256i *const in,
+                                       __m256i *const out) {
+  int i;
+  __m256i t[16], u[16];
+  // (1st, 2nd) ==> (lo, hi)
+  //   (0, 1)   ==>  (0, 1)
+  //   (2, 3)   ==>  (2, 3)
+  //   (4, 5)   ==>  (4, 5)
+  //   (6, 7)   ==>  (6, 7)
+  for (i = 0; i < 4; i++) {
+    t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]);
+    t[2 * i + 1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]);
+  }
+
+  // (1st, 2nd) ==> (lo, hi)
+  //   (0, 2)   ==>  (0, 2)
+  //   (1, 3)   ==>  (1, 3)
+  //   (4, 6)   ==>  (4, 6)
+  //   (5, 7)   ==>  (5, 7)
+  for (i = 0; i < 2; i++) {
+    u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]);
+    u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]);
+
+    u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]);
+    u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]);
+  }
+
+  // (1st, 2nd) ==> (lo, hi)
+  //   (0, 4)   ==>  (0, 1)
+  //   (1, 5)   ==>  (4, 5)
+  //   (2, 6)   ==>  (2, 3)
+  //   (3, 7)   ==>  (6, 7)
+  for (i = 0; i < 2; i++) {
+    out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]);
+    out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]);
+
+    out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]);
+    out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]);
+  }
+}
+
+static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in,
+                                              __m256i *const out) {
+  __m256i t[16];
+
+#define LOADL(idx)                                                            \
+  t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \
+  t[idx] = _mm256_inserti128_si256(                                           \
+      t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1);
+
+#define LOADR(idx)                                                           \
+  t[8 + idx] =                                                               \
+      _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \
+  t[8 + idx] = _mm256_inserti128_si256(                                      \
+      t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1);
+
+  // load left 8x16
+  LOADL(0)
+  LOADL(1)
+  LOADL(2)
+  LOADL(3)
+  LOADL(4)
+  LOADL(5)
+  LOADL(6)
+  LOADL(7)
+
+  // load right 8x16
+  LOADR(0)
+  LOADR(1)
+  LOADR(2)
+  LOADR(3)
+  LOADR(4)
+  LOADR(5)
+  LOADR(6)
+  LOADR(7)
+
+  // get the top 16x8 result
+  transpose2_8x8_avx2(t, out);
+  // get the bottom 16x8 result
+  transpose2_8x8_avx2(&t[8], &out[8]);
+}
+
+// Store 8 16-bit values. Sign extend the values.
+static INLINE void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in,
+                                                        tran_low_t *out,
+                                                        const int stride,
+                                                        const int out_size) {
+  int i;
+  for (i = 0; i < out_size; ++i) {
+    _mm256_storeu_si256((__m256i *)(out), in[i]);
+    out += stride;
+  }
+}
+
+#define PAIR256_SET_EPI16(a, b)                                            \
+  _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+                   (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+                   (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+                   (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
+
+static INLINE __m256i mult256_round_shift(const __m256i *pin0,
+                                          const __m256i *pin1,
+                                          const __m256i *pmultiplier,
+                                          const __m256i *prounding,
+                                          const int shift) {
+  const __m256i u0 = _mm256_madd_epi16(*pin0, *pmultiplier);
+  const __m256i u1 = _mm256_madd_epi16(*pin1, *pmultiplier);
+  const __m256i v0 = _mm256_add_epi32(u0, *prounding);
+  const __m256i v1 = _mm256_add_epi32(u1, *prounding);
+  const __m256i w0 = _mm256_srai_epi32(v0, shift);
+  const __m256i w1 = _mm256_srai_epi32(v1, shift);
+  return _mm256_packs_epi32(w0, w1);
+}
+
+static INLINE void fdct16x16_1D_avx2(__m256i *input, __m256i *output) {
+  int i;
+  __m256i step2[4];
+  __m256i in[8];
+  __m256i step1[8];
+  __m256i step3[8];
+
+  const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(cospi_16_64);
+  const __m256i k__cospi_p16_m16 = PAIR256_SET_EPI16(cospi_16_64, -cospi_16_64);
+  const __m256i k__cospi_p24_p08 = PAIR256_SET_EPI16(cospi_24_64, cospi_8_64);
+  const __m256i k__cospi_p08_m24 = PAIR256_SET_EPI16(cospi_8_64, -cospi_24_64);
+  const __m256i k__cospi_m08_p24 = PAIR256_SET_EPI16(-cospi_8_64, cospi_24_64);
+  const __m256i k__cospi_p28_p04 = PAIR256_SET_EPI16(cospi_28_64, cospi_4_64);
+  const __m256i k__cospi_m04_p28 = PAIR256_SET_EPI16(-cospi_4_64, cospi_28_64);
+  const __m256i k__cospi_p12_p20 = PAIR256_SET_EPI16(cospi_12_64, cospi_20_64);
+  const __m256i k__cospi_m20_p12 = PAIR256_SET_EPI16(-cospi_20_64, cospi_12_64);
+  const __m256i k__cospi_p30_p02 = PAIR256_SET_EPI16(cospi_30_64, cospi_2_64);
+  const __m256i k__cospi_p14_p18 = PAIR256_SET_EPI16(cospi_14_64, cospi_18_64);
+  const __m256i k__cospi_m02_p30 = PAIR256_SET_EPI16(-cospi_2_64, cospi_30_64);
+  const __m256i k__cospi_m18_p14 = PAIR256_SET_EPI16(-cospi_18_64, cospi_14_64);
+  const __m256i k__cospi_p22_p10 = PAIR256_SET_EPI16(cospi_22_64, cospi_10_64);
+  const __m256i k__cospi_p06_p26 = PAIR256_SET_EPI16(cospi_6_64, cospi_26_64);
+  const __m256i k__cospi_m10_p22 = PAIR256_SET_EPI16(-cospi_10_64, cospi_22_64);
+  const __m256i k__cospi_m26_p06 = PAIR256_SET_EPI16(-cospi_26_64, cospi_6_64);
+  const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+
+  // Calculate input for the first 8 results.
+  for (i = 0; i < 8; i++) {
+    in[i] = ADD256_EPI16(input[i], input[15 - i]);
+  }
+
+  // Calculate input for the next 8 results.
+  for (i = 0; i < 8; i++) {
+    step1[i] = SUB256_EPI16(input[7 - i], input[8 + i]);
+  }
+
+  // Work on the first eight values; fdct8(input, even_results);
+  {
+    // Add/subtract
+    const __m256i q0 = ADD256_EPI16(in[0], in[7]);
+    const __m256i q1 = ADD256_EPI16(in[1], in[6]);
+    const __m256i q2 = ADD256_EPI16(in[2], in[5]);
+    const __m256i q3 = ADD256_EPI16(in[3], in[4]);
+    const __m256i q4 = SUB256_EPI16(in[3], in[4]);
+    const __m256i q5 = SUB256_EPI16(in[2], in[5]);
+    const __m256i q6 = SUB256_EPI16(in[1], in[6]);
+    const __m256i q7 = SUB256_EPI16(in[0], in[7]);
+
+    // Work on first four results
+    {
+      // Add/subtract
+      const __m256i r0 = ADD256_EPI16(q0, q3);
+      const __m256i r1 = ADD256_EPI16(q1, q2);
+      const __m256i r2 = SUB256_EPI16(q1, q2);
+      const __m256i r3 = SUB256_EPI16(q0, q3);
+
+      // Interleave to do the multiply by constants which gets us
+      // into 32 bits.
+      {
+        const __m256i t0 = _mm256_unpacklo_epi16(r0, r1);
+        const __m256i t1 = _mm256_unpackhi_epi16(r0, r1);
+        const __m256i t2 = _mm256_unpacklo_epi16(r2, r3);
+        const __m256i t3 = _mm256_unpackhi_epi16(r2, r3);
+
+        output[0] = mult256_round_shift(&t0, &t1, &k__cospi_p16_p16,
+                                        &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+        output[8] = mult256_round_shift(&t0, &t1, &k__cospi_p16_m16,
+                                        &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+        output[4] = mult256_round_shift(&t2, &t3, &k__cospi_p24_p08,
+                                        &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+        output[12] =
+            mult256_round_shift(&t2, &t3, &k__cospi_m08_p24,
+                                &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+      }
+    }
+
+    // Work on next four results
+    {
+      // Interleave to do the multiply by constants which gets us
+      // into 32 bits.
+      const __m256i d0 = _mm256_unpacklo_epi16(q6, q5);
+      const __m256i d1 = _mm256_unpackhi_epi16(q6, q5);
+      const __m256i r0 = mult256_round_shift(
+          &d0, &d1, &k__cospi_p16_m16, &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+      const __m256i r1 = mult256_round_shift(
+          &d0, &d1, &k__cospi_p16_p16, &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+
+      {
+        // Add/subtract
+        const __m256i x0 = ADD256_EPI16(q4, r0);
+        const __m256i x1 = SUB256_EPI16(q4, r0);
+        const __m256i x2 = SUB256_EPI16(q7, r1);
+        const __m256i x3 = ADD256_EPI16(q7, r1);
+
+        // Interleave to do the multiply by constants which gets us
+        // into 32 bits.
+        {
+          const __m256i t0 = _mm256_unpacklo_epi16(x0, x3);
+          const __m256i t1 = _mm256_unpackhi_epi16(x0, x3);
+          const __m256i t2 = _mm256_unpacklo_epi16(x1, x2);
+          const __m256i t3 = _mm256_unpackhi_epi16(x1, x2);
+          output[2] =
+              mult256_round_shift(&t0, &t1, &k__cospi_p28_p04,
+                                  &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          output[14] =
+              mult256_round_shift(&t0, &t1, &k__cospi_m04_p28,
+                                  &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          output[10] =
+              mult256_round_shift(&t2, &t3, &k__cospi_p12_p20,
+                                  &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          output[6] =
+              mult256_round_shift(&t2, &t3, &k__cospi_m20_p12,
+                                  &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+        }
+      }
+    }
+  }
+  // Work on the next eight values; step1 -> odd_results
+  {  // step 2
+    {
+      const __m256i t0 = _mm256_unpacklo_epi16(step1[5], step1[2]);
+      const __m256i t1 = _mm256_unpackhi_epi16(step1[5], step1[2]);
+      const __m256i t2 = _mm256_unpacklo_epi16(step1[4], step1[3]);
+      const __m256i t3 = _mm256_unpackhi_epi16(step1[4], step1[3]);
+      step2[0] = mult256_round_shift(&t0, &t1, &k__cospi_p16_m16,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+      step2[1] = mult256_round_shift(&t2, &t3, &k__cospi_p16_m16,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+      step2[2] = mult256_round_shift(&t0, &t1, &k__cospi_p16_p16,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+      step2[3] = mult256_round_shift(&t2, &t3, &k__cospi_p16_p16,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+    }
+    // step 3
+    {
+      step3[0] = ADD256_EPI16(step1[0], step2[1]);
+      step3[1] = ADD256_EPI16(step1[1], step2[0]);
+      step3[2] = SUB256_EPI16(step1[1], step2[0]);
+      step3[3] = SUB256_EPI16(step1[0], step2[1]);
+      step3[4] = SUB256_EPI16(step1[7], step2[3]);
+      step3[5] = SUB256_EPI16(step1[6], step2[2]);
+      step3[6] = ADD256_EPI16(step1[6], step2[2]);
+      step3[7] = ADD256_EPI16(step1[7], step2[3]);
+    }
+    // step 4
+    {
+      const __m256i t0 = _mm256_unpacklo_epi16(step3[1], step3[6]);
+      const __m256i t1 = _mm256_unpackhi_epi16(step3[1], step3[6]);
+      const __m256i t2 = _mm256_unpacklo_epi16(step3[2], step3[5]);
+      const __m256i t3 = _mm256_unpackhi_epi16(step3[2], step3[5]);
+      step2[0] = mult256_round_shift(&t0, &t1, &k__cospi_m08_p24,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+      step2[1] = mult256_round_shift(&t2, &t3, &k__cospi_p24_p08,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+      step2[2] = mult256_round_shift(&t0, &t1, &k__cospi_p24_p08,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+      step2[3] = mult256_round_shift(&t2, &t3, &k__cospi_p08_m24,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+    }
+    // step 5
+    {
+      step1[0] = ADD256_EPI16(step3[0], step2[0]);
+      step1[1] = SUB256_EPI16(step3[0], step2[0]);
+      step1[2] = ADD256_EPI16(step3[3], step2[1]);
+      step1[3] = SUB256_EPI16(step3[3], step2[1]);
+      step1[4] = SUB256_EPI16(step3[4], step2[3]);
+      step1[5] = ADD256_EPI16(step3[4], step2[3]);
+      step1[6] = SUB256_EPI16(step3[7], step2[2]);
+      step1[7] = ADD256_EPI16(step3[7], step2[2]);
+    }
+    // step 6
+    {
+      const __m256i t0 = _mm256_unpacklo_epi16(step1[0], step1[7]);
+      const __m256i t1 = _mm256_unpackhi_epi16(step1[0], step1[7]);
+      const __m256i t2 = _mm256_unpacklo_epi16(step1[1], step1[6]);
+      const __m256i t3 = _mm256_unpackhi_epi16(step1[1], step1[6]);
+      output[1] = mult256_round_shift(&t0, &t1, &k__cospi_p30_p02,
+                                      &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+      output[9] = mult256_round_shift(&t2, &t3, &k__cospi_p14_p18,
+                                      &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+      output[15] = mult256_round_shift(&t0, &t1, &k__cospi_m02_p30,
+                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+      output[7] = mult256_round_shift(&t2, &t3, &k__cospi_m18_p14,
+                                      &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+    }
+    {
+      const __m256i t0 = _mm256_unpacklo_epi16(step1[2], step1[5]);
+      const __m256i t1 = _mm256_unpackhi_epi16(step1[2], step1[5]);
+      const __m256i t2 = _mm256_unpacklo_epi16(step1[3], step1[4]);
+      const __m256i t3 = _mm256_unpackhi_epi16(step1[3], step1[4]);
+      output[5] = mult256_round_shift(&t0, &t1, &k__cospi_p22_p10,
+                                      &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+      output[13] = mult256_round_shift(&t2, &t3, &k__cospi_p06_p26,
+                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+      output[11] = mult256_round_shift(&t0, &t1, &k__cospi_m10_p22,
+                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+      output[3] = mult256_round_shift(&t2, &t3, &k__cospi_m26_p06,
+                                      &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+    }
+  }
+}
+
+void vpx_fdct16x16_avx2(const int16_t *input, tran_low_t *output, int stride) {
+  int pass;
+  DECLARE_ALIGNED(32, int16_t, intermediate[256]);
+  int16_t *out0 = intermediate;
+  tran_low_t *out1 = output;
+  const int width = 16;
+  const int height = 16;
+  __m256i buf0[16], buf1[16];
+
+  // Two transform and transpose passes
+  // Process 16 columns (transposed rows in second pass) at a time.
+  for (pass = 0; pass < 2; ++pass) {
+    // Load and pre-condition input.
+    load_buffer_16bit_to_16bit_avx2(input, stride, buf1, height, pass);
+
+    // Calculate dct for 16x16 values
+    fdct16x16_1D_avx2(buf1, buf0);
+
+    // Transpose the results.
+    transpose_16bit_16x16_avx2(buf0, buf1);
+
+    if (pass == 0) {
+      store_buffer_16bit_to_32bit_w16_avx2(buf1, out0, width, height);
+    } else {
+      store_buffer_16bit_to_32bit_w16_avx2(buf1, out1, width, height);
+    }
+    // Setup in/out for next pass.
+    input = intermediate;
+  }
+}
+
 #if !CONFIG_VP9_HIGHBITDEPTH
 #define FDCT32x32_2D_AVX2 vpx_fdct32x32_rd_avx2
 #define FDCT32x32_HIGH_PRECISION 0
diff --git a/vpx_dsp/x86/highbd_convolve_avx2.c b/vpx_dsp/x86/highbd_convolve_avx2.c
index 3209625617..01a52ec8bf 100644
--- a/vpx_dsp/x86/highbd_convolve_avx2.c
+++ b/vpx_dsp/x86/highbd_convolve_avx2.c
@@ -1465,10 +1465,10 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2;
 #define vpx_highbd_filter_block1d4_h4_avg_avx2 \
   vpx_highbd_filter_block1d4_h8_avg_avx2
 
-HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0);
+HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0)
 HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v,
-                 src - src_stride * (num_taps / 2 - 1), , avx2, 0);
-HIGH_FUN_CONV_2D(, avx2, 0);
+                 src - src_stride * (num_taps / 2 - 1), , avx2, 0)
+HIGH_FUN_CONV_2D(, avx2, 0)
 
 // From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
 highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;
@@ -1487,9 +1487,9 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2;
 #define vpx_highbd_filter_block1d4_v2_avg_avx2 \
   vpx_highbd_filter_block1d4_v2_avg_sse2
 
-HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1);
+HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1)
 HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
-                 src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1);
-HIGH_FUN_CONV_2D(avg_, avx2, 1);
+                 src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1)
+HIGH_FUN_CONV_2D(avg_, avx2, 1)
 
 #undef HIGHBD_FUNC
diff --git a/vpx_dsp/x86/highbd_inv_txfm_sse2.h b/vpx_dsp/x86/highbd_inv_txfm_sse2.h
index 78cf9111d9..1d07391b02 100644
--- a/vpx_dsp/x86/highbd_inv_txfm_sse2.h
+++ b/vpx_dsp/x86/highbd_inv_txfm_sse2.h
@@ -249,7 +249,7 @@ static INLINE void highbd_idct16_4col_stage7(const __m128i *const in,
 
 static INLINE __m128i add_clamp(const __m128i in0, const __m128i in1,
                                 const int bd) {
-  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i zero = _mm_setzero_si128();
   // Faster than _mm_set1_epi16((1 << bd) - 1).
   const __m128i one = _mm_set1_epi16(1);
   const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
diff --git a/vpx_dsp/x86/highbd_loopfilter_sse2.c b/vpx_dsp/x86/highbd_loopfilter_sse2.c
index d265fc1a92..9f45623dee 100644
--- a/vpx_dsp/x86/highbd_loopfilter_sse2.c
+++ b/vpx_dsp/x86/highbd_loopfilter_sse2.c
@@ -18,7 +18,7 @@ static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
   __m128i lbounded;
   __m128i retval;
 
-  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i zero = _mm_setzero_si128();
   const __m128i one = _mm_set1_epi16(1);
   __m128i t80, max, min;
 
@@ -51,7 +51,7 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int pitch,
                                        const uint8_t *blimit,
                                        const uint8_t *limit,
                                        const uint8_t *thresh, int bd) {
-  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i zero = _mm_setzero_si128();
   const __m128i one = _mm_set1_epi16(1);
   __m128i blimit_v, limit_v, thresh_v;
   __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0;
@@ -492,7 +492,7 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int pitch,
   DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]);
   DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]);
   DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]);
-  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i zero = _mm_setzero_si128();
   __m128i blimit_v, limit_v, thresh_v;
   __m128i mask, hev, flat;
   __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * pitch));
@@ -720,7 +720,7 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int pitch,
                                       const uint8_t *blimit,
                                       const uint8_t *limit,
                                       const uint8_t *thresh, int bd) {
-  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i zero = _mm_setzero_si128();
   __m128i blimit_v, limit_v, thresh_v;
   __m128i mask, hev, flat;
   __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));
diff --git a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c
new file mode 100644
index 0000000000..35ca554049
--- /dev/null
+++ b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c
@@ -0,0 +1,254 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
+
+static VPX_FORCE_INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
+  const __m128i sign = _mm_srai_epi16(*p, 15);
+  const __m128i dc = _mm_unpacklo_epi16(*p, sign);
+  const __m128i ac = _mm_unpackhi_epi16(*p, sign);
+  *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1);
+}
+
+static VPX_FORCE_INLINE void update_qp(__m256i *qp) {
+  int i;
+  for (i = 0; i < 5; ++i) {
+    qp[i] = _mm256_permute2x128_si256(qp[i], qp[i], 0x11);
+  }
+}
+
+static VPX_FORCE_INLINE void init_qp(
+    const struct macroblock_plane *const mb_plane, const int16_t *dequant_ptr,
+    __m256i *qp, int log_scale) {
+  const __m128i zbin = _mm_loadu_si128((const __m128i *)mb_plane->zbin);
+  const __m128i round = _mm_loadu_si128((const __m128i *)mb_plane->round);
+  const __m128i quant = _mm_loadu_si128((const __m128i *)mb_plane->quant);
+  const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
+  const __m128i quant_shift =
+      _mm_loadu_si128((const __m128i *)mb_plane->quant_shift);
+  init_one_qp(&zbin, &qp[0]);
+  init_one_qp(&round, &qp[1]);
+  init_one_qp(&quant, &qp[2]);
+  init_one_qp(&dequant, &qp[3]);
+  init_one_qp(&quant_shift, &qp[4]);
+  if (log_scale > 0) {
+    const __m256i rnd = _mm256_set1_epi32((int16_t)(1 << (log_scale - 1)));
+    qp[0] = _mm256_add_epi32(qp[0], rnd);
+    qp[0] = _mm256_srai_epi32(qp[0], log_scale);
+
+    qp[1] = _mm256_add_epi32(qp[1], rnd);
+    qp[1] = _mm256_srai_epi32(qp[1], log_scale);
+  }
+  // Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when
+  // calculating the zbin mask.
+  qp[0] = _mm256_sub_epi32(qp[0], _mm256_set1_epi32(1));
+}
+
+// Note:
+// *x is vector multiplied by *y which is 16 int32_t parallel multiplication
+// and right shift 16.  The output, 16 int32_t is save in *p.
+static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32(const __m256i *x,
+                                                      const __m256i *y) {
+  __m256i prod_lo = _mm256_mul_epi32(*x, *y);
+  __m256i prod_hi = _mm256_srli_epi64(*x, 32);
+  const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
+  const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
+  prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);
+  prod_lo = _mm256_srli_epi64(prod_lo, 16);
+  prod_lo = _mm256_and_si256(prod_lo, mask);
+  prod_hi = _mm256_srli_epi64(prod_hi, 16);
+  prod_hi = _mm256_slli_epi64(prod_hi, 32);
+  return _mm256_or_si256(prod_lo, prod_hi);
+}
+
+static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan_ptr,
+                                                 __m256i eobmax,
+                                                 __m256i nz_mask) {
+  const __m256i packed_nz_mask = _mm256_packs_epi32(nz_mask, nz_mask);
+  const __m256i packed_nz_mask_perm =
+      _mm256_permute4x64_epi64(packed_nz_mask, 0xD8);
+  const __m256i iscan =
+      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr));
+  const __m256i nz_iscan = _mm256_and_si256(iscan, packed_nz_mask_perm);
+  return _mm256_max_epi16(eobmax, nz_iscan);
+}
+
+// Get the max eob from the lower 128 bits.
+static VPX_FORCE_INLINE uint16_t get_max_eob(__m256i eob) {
+  __m256i eob_s;
+  eob_s = _mm256_shuffle_epi32(eob, 0xe);
+  eob = _mm256_max_epi16(eob, eob_s);
+  eob_s = _mm256_shufflelo_epi16(eob, 0xe);
+  eob = _mm256_max_epi16(eob, eob_s);
+  eob_s = _mm256_shufflelo_epi16(eob, 1);
+  eob = _mm256_max_epi16(eob, eob_s);
+#if defined(_MSC_VER) && (_MSC_VER < 1910)
+  return _mm_cvtsi128_si32(_mm256_extracti128_si256(eob, 0)) & 0xffff;
+#else
+  return (uint16_t)_mm256_extract_epi16(eob, 0);
+#endif
+}
+
+static VPX_FORCE_INLINE void quantize(const __m256i *qp,
+                                      const tran_low_t *coeff_ptr,
+                                      const int16_t *iscan_ptr,
+                                      tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                                      __m256i *eob) {
+  const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+  const __m256i abs_coeff = _mm256_abs_epi32(coeff);
+  const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]);
+
+  if (_mm256_movemask_epi8(zbin_mask) == 0) {
+    const __m256i zero = _mm256_setzero_si256();
+    _mm256_storeu_si256((__m256i *)qcoeff, zero);
+    _mm256_storeu_si256((__m256i *)dqcoeff, zero);
+    return;
+  }
+  {
+    const __m256i tmp_rnd =
+        _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask);
+    const __m256i tmp = mm256_mul_shift_epi32(&tmp_rnd, &qp[2]);
+    const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd);
+    const __m256i abs_q = mm256_mul_shift_epi32(&tmp2, &qp[4]);
+    const __m256i abs_dq = _mm256_mullo_epi32(abs_q, qp[3]);
+    const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
+    const __m256i q = _mm256_sign_epi32(abs_q, coeff);
+    const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);
+
+    _mm256_storeu_si256((__m256i *)qcoeff, q);
+    _mm256_storeu_si256((__m256i *)dqcoeff, dq);
+
+    *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+  }
+}
+
+void vpx_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                                const struct macroblock_plane *const mb_plane,
+                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                                const struct ScanOrder *const scan_order) {
+  const int step = 8;
+  __m256i eob = _mm256_setzero_si256();
+  __m256i qp[5];
+  const int16_t *iscan = scan_order->iscan;
+
+  init_qp(mb_plane, dequant_ptr, qp, 0);
+
+  quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+  coeff_ptr += step;
+  qcoeff_ptr += step;
+  dqcoeff_ptr += step;
+  iscan += step;
+  n_coeffs -= step;
+
+  update_qp(qp);
+
+  while (n_coeffs > 0) {
+    quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+    coeff_ptr += step;
+    qcoeff_ptr += step;
+    dqcoeff_ptr += step;
+    iscan += step;
+    n_coeffs -= step;
+  }
+
+  *eob_ptr = get_max_eob(eob);
+}
+
+static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32_logscale(const __m256i *x,
+                                                               const __m256i *y,
+                                                               int log_scale) {
+  __m256i prod_lo = _mm256_mul_epi32(*x, *y);
+  __m256i prod_hi = _mm256_srli_epi64(*x, 32);
+  const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
+  const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
+  prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);
+  prod_lo = _mm256_srli_epi64(prod_lo, 16 - log_scale);
+  prod_lo = _mm256_and_si256(prod_lo, mask);
+  prod_hi = _mm256_srli_epi64(prod_hi, 16 - log_scale);
+  prod_hi = _mm256_slli_epi64(prod_hi, 32);
+  return _mm256_or_si256(prod_lo, prod_hi);
+}
+
+static VPX_FORCE_INLINE void quantize_b_32x32(
+    const __m256i *qp, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+    tran_low_t *qcoeff, tran_low_t *dqcoeff, __m256i *eob) {
+  const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+  const __m256i abs_coeff = _mm256_abs_epi32(coeff);
+  const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]);
+
+  if (_mm256_movemask_epi8(zbin_mask) == 0) {
+    const __m256i zero = _mm256_setzero_si256();
+    _mm256_storeu_si256((__m256i *)qcoeff, zero);
+    _mm256_storeu_si256((__m256i *)dqcoeff, zero);
+    return;
+  }
+
+  {
+    const __m256i tmp_rnd =
+        _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask);
+    // const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
+    const __m256i tmp = mm256_mul_shift_epi32_logscale(&tmp_rnd, &qp[2], 0);
+    const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd);
+    // const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
+    const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp2, &qp[4], 1);
+    const __m256i abs_dq =
+        _mm256_srli_epi32(_mm256_mullo_epi32(abs_q, qp[3]), 1);
+    const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
+    const __m256i q = _mm256_sign_epi32(abs_q, coeff);
+    const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);
+
+    _mm256_storeu_si256((__m256i *)qcoeff, q);
+    _mm256_storeu_si256((__m256i *)dqcoeff, dq);
+
+    *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+  }
+}
+
+void vpx_highbd_quantize_b_32x32_avx2(
+    const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+    uint16_t *eob_ptr, const struct ScanOrder *const scan_order) {
+  const unsigned int step = 8;
+  intptr_t n_coeffs = 32 * 32;
+  const int16_t *iscan = scan_order->iscan;
+  __m256i eob = _mm256_setzero_si256();
+  __m256i qp[5];
+
+  init_qp(mb_plane, dequant_ptr, qp, 1);
+
+  quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+  coeff_ptr += step;
+  qcoeff_ptr += step;
+  dqcoeff_ptr += step;
+  iscan += step;
+  n_coeffs -= step;
+
+  update_qp(qp);
+
+  while (n_coeffs > 0) {
+    quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+    coeff_ptr += step;
+    qcoeff_ptr += step;
+    dqcoeff_ptr += step;
+    iscan += step;
+    n_coeffs -= step;
+  }
+
+  *eob_ptr = get_max_eob(eob);
+}
diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
index 7149e4fb74..adae60756d 100644
--- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
+++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -15,19 +15,22 @@
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
 
-#if CONFIG_VP9_HIGHBITDEPTH
 void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
-                                int skip_block, const int16_t *zbin_ptr,
-                                const int16_t *round_ptr,
-                                const int16_t *quant_ptr,
-                                const int16_t *quant_shift_ptr,
+                                const struct macroblock_plane *mb_plane,
                                 tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                 const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                                const int16_t *scan, const int16_t *iscan) {
-  int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
+                                const struct ScanOrder *const scan_order) {
+  int i, j, non_zero_regs = (int)count / 4, eob_i = 0;
   __m128i zbins[2];
   __m128i nzbins[2];
+  const int16_t *iscan = scan_order->iscan;
+  const int16_t *zbin_ptr = mb_plane->zbin;
+  const int16_t *round_ptr = mb_plane->round;
+  const int16_t *quant_ptr = mb_plane->quant;
+  const int16_t *quant_shift_ptr = mb_plane->quant_shift;
 
   zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1],
                            (int)zbin_ptr[0]);
@@ -38,10 +41,6 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
   nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
   nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
 
-  (void)scan;
-  (void)skip_block;
-  assert(!skip_block);
-
   memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
 
@@ -84,31 +83,29 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
         const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
         const uint32_t abs_qcoeff =
             (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
-        qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
+        qcoeff_ptr[k] =
+            (int)(abs_qcoeff ^ (uint32_t)coeff_sign[j]) - coeff_sign[j];
         dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
         if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
       }
     }
   }
-  *eob_ptr = eob_i + 1;
+  *eob_ptr = eob_i;
 }
 
 void vpx_highbd_quantize_b_32x32_sse2(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
-    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
-    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan, const int16_t *iscan) {
+    const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+    uint16_t *eob_ptr, const struct ScanOrder *const scan_order) {
   __m128i zbins[2];
   __m128i nzbins[2];
   int idx = 0;
   int idx_arr[1024];
-  int i, eob = -1;
-  const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
-  const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);
-  (void)scan;
-  (void)skip_block;
-  assert(!skip_block);
+  int i, eob = 0;
+  const intptr_t n_coeffs = 32 * 32;
+  const int16_t *iscan = scan_order->iscan;
+  const int zbin0_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1);
+  const int zbin1_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1);
 
   zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
   zbins[1] = _mm_set1_epi32(zbin1_tmp);
@@ -143,14 +140,14 @@ void vpx_highbd_quantize_b_32x32_sse2(
     const int coeff = coeff_ptr[rc];
     const int coeff_sign = (coeff >> 31);
     const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-    const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
-    const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+    const int64_t tmp1 =
+        abs_coeff + ROUND_POWER_OF_TWO(mb_plane->round[rc != 0], 1);
+    const int64_t tmp2 = ((tmp1 * mb_plane->quant[rc != 0]) >> 16) + tmp1;
     const uint32_t abs_qcoeff =
-        (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
-    qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
+        (uint32_t)((tmp2 * mb_plane->quant_shift[rc != 0]) >> 15);
+    qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign;
     dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
     if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
   }
-  *eob_ptr = eob + 1;
+  *eob_ptr = eob;
 }
-#endif
diff --git a/vpx_dsp/x86/highbd_sad4d_avx2.c b/vpx_dsp/x86/highbd_sad4d_avx2.c
new file mode 100644
index 0000000000..e483fdce73
--- /dev/null
+++ b/vpx_dsp/x86/highbd_sad4d_avx2.c
@@ -0,0 +1,462 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <immintrin.h>  // AVX2
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+static VPX_FORCE_INLINE void calc_final_4(const __m256i *const sums /*[4]*/,
+                                          uint32_t sad_array[4]) {
+  const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]);
+  const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]);
+  const __m256i t2 = _mm256_hadd_epi32(t0, t1);
+  const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2),
+                                    _mm256_extractf128_si256(t2, 1));
+  _mm_storeu_si128((__m128i *)sad_array, sum);
+}
+
+static VPX_FORCE_INLINE void highbd_sad64xHx4d(__m256i *sums_16 /*[4]*/,
+                                               const uint16_t *src,
+                                               int src_stride,
+                                               uint16_t *refs[4],
+                                               int ref_stride, int height) {
+  int i;
+  for (i = 0; i < height; ++i) {
+    // load src and all ref[]
+    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
+    const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
+    const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
+    int x;
+
+    for (x = 0; x < 4; ++x) {
+      __m256i r[4];
+      r[0] = _mm256_loadu_si256((const __m256i *)refs[x]);
+      r[1] = _mm256_loadu_si256((const __m256i *)(refs[x] + 16));
+      r[2] = _mm256_loadu_si256((const __m256i *)(refs[x] + 32));
+      r[3] = _mm256_loadu_si256((const __m256i *)(refs[x] + 48));
+
+      // absolute differences between every ref[] to src
+      r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s0));
+      r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s1));
+      r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s2));
+      r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s3));
+
+      // sum every abs diff
+      sums_16[x] = _mm256_add_epi16(sums_16[x], _mm256_add_epi16(r[0], r[1]));
+      sums_16[x] = _mm256_add_epi16(sums_16[x], _mm256_add_epi16(r[2], r[3]));
+    }
+
+    src += src_stride;
+    refs[0] += ref_stride;
+    refs[1] += ref_stride;
+    refs[2] += ref_stride;
+    refs[3] += ref_stride;
+  }
+}
+
+static VPX_FORCE_INLINE void highbd_sad64xNx4d_avx2(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4],
+    int ref_stride, uint32_t sad_array[4], int n) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+  uint16_t *refs[4];
+  __m256i sums_16[4];
+  __m256i sums_32[4];
+  int i;
+
+  refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
+  refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
+  refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
+  refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
+  sums_32[0] = _mm256_setzero_si256();
+  sums_32[1] = _mm256_setzero_si256();
+  sums_32[2] = _mm256_setzero_si256();
+  sums_32[3] = _mm256_setzero_si256();
+
+  for (i = 0; i < (n / 2); ++i) {
+    sums_16[0] = _mm256_setzero_si256();
+    sums_16[1] = _mm256_setzero_si256();
+    sums_16[2] = _mm256_setzero_si256();
+    sums_16[3] = _mm256_setzero_si256();
+
+    highbd_sad64xHx4d(sums_16, src, src_stride, refs, ref_stride, 2);
+
+    /* sums_16 will outrange after 2 rows, so add current sums_16 to
+     * sums_32*/
+    sums_32[0] = _mm256_add_epi32(
+        sums_32[0],
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))));
+    sums_32[1] = _mm256_add_epi32(
+        sums_32[1],
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))));
+    sums_32[2] = _mm256_add_epi32(
+        sums_32[2],
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))));
+    sums_32[3] = _mm256_add_epi32(
+        sums_32[3],
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))));
+
+    src += src_stride << 1;
+  }
+  calc_final_4(sums_32, sad_array);
+}
+
+#define HIGHBD_SAD64XNX4D(n)                                                   \
+  void vpx_highbd_sad64x##n##x4d_avx2(const uint8_t *src, int src_stride,      \
+                                      const uint8_t *const ref_array[4],       \
+                                      int ref_stride, uint32_t sad_array[4]) { \
+    highbd_sad64xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array,  \
+                           n);                                                 \
+  }
+
+#define HIGHBD_SADSKIP64XNx4D(n)                                             \
+  void vpx_highbd_sad_skip_64x##n##x4d_avx2(                                 \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+      int ref_stride, uint32_t sad_array[4]) {                               \
+    highbd_sad64xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride,   \
+                           sad_array, n / 2);                                \
+    sad_array[0] <<= 1;                                                      \
+    sad_array[1] <<= 1;                                                      \
+    sad_array[2] <<= 1;                                                      \
+    sad_array[3] <<= 1;                                                      \
+  }
+
+static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/,
+                                               const uint16_t *src,
+                                               int src_stride,
+                                               uint16_t *refs[4],
+                                               int ref_stride, int height) {
+  int i;
+  for (i = 0; i < height; i++) {
+    __m256i r[8];
+
+    // load src and all ref[]
+    const __m256i s = _mm256_load_si256((const __m256i *)src);
+    const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 16));
+    r[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
+    r[1] = _mm256_loadu_si256((const __m256i *)(refs[0] + 16));
+    r[2] = _mm256_loadu_si256((const __m256i *)refs[1]);
+    r[3] = _mm256_loadu_si256((const __m256i *)(refs[1] + 16));
+    r[4] = _mm256_loadu_si256((const __m256i *)refs[2]);
+    r[5] = _mm256_loadu_si256((const __m256i *)(refs[2] + 16));
+    r[6] = _mm256_loadu_si256((const __m256i *)refs[3]);
+    r[7] = _mm256_loadu_si256((const __m256i *)(refs[3] + 16));
+
+    // absolute differences between every ref[] to src
+    r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s));
+    r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s2));
+    r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s));
+    r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s2));
+    r[4] = _mm256_abs_epi16(_mm256_sub_epi16(r[4], s));
+    r[5] = _mm256_abs_epi16(_mm256_sub_epi16(r[5], s2));
+    r[6] = _mm256_abs_epi16(_mm256_sub_epi16(r[6], s));
+    r[7] = _mm256_abs_epi16(_mm256_sub_epi16(r[7], s2));
+
+    // sum every abs diff
+    sums_16[0] = _mm256_add_epi16(sums_16[0], _mm256_add_epi16(r[0], r[1]));
+    sums_16[1] = _mm256_add_epi16(sums_16[1], _mm256_add_epi16(r[2], r[3]));
+    sums_16[2] = _mm256_add_epi16(sums_16[2], _mm256_add_epi16(r[4], r[5]));
+    sums_16[3] = _mm256_add_epi16(sums_16[3], _mm256_add_epi16(r[6], r[7]));
+
+    src += src_stride;
+    refs[0] += ref_stride;
+    refs[1] += ref_stride;
+    refs[2] += ref_stride;
+    refs[3] += ref_stride;
+  }
+}
+
+static VPX_FORCE_INLINE void highbd_sad32xNx4d_avx2(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4],
+    int ref_stride, uint32_t sad_array[4], int n) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+  uint16_t *refs[4];
+  __m256i sums_16[4];
+  __m256i sums_32[4];
+  int i;
+
+  refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
+  refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
+  refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
+  refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
+  sums_32[0] = _mm256_setzero_si256();
+  sums_32[1] = _mm256_setzero_si256();
+  sums_32[2] = _mm256_setzero_si256();
+  sums_32[3] = _mm256_setzero_si256();
+
+  for (i = 0; i < (n / 8); ++i) {
+    sums_16[0] = _mm256_setzero_si256();
+    sums_16[1] = _mm256_setzero_si256();
+    sums_16[2] = _mm256_setzero_si256();
+    sums_16[3] = _mm256_setzero_si256();
+
+    highbd_sad32xHx4d(sums_16, src, src_stride, refs, ref_stride, 8);
+
+    /* sums_16 will outrange after 8 rows, so add current sums_16 to
+     * sums_32*/
+    sums_32[0] = _mm256_add_epi32(
+        sums_32[0],
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))));
+    sums_32[1] = _mm256_add_epi32(
+        sums_32[1],
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))));
+    sums_32[2] = _mm256_add_epi32(
+        sums_32[2],
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))));
+    sums_32[3] = _mm256_add_epi32(
+        sums_32[3],
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))));
+
+    src += src_stride << 3;
+  }
+  calc_final_4(sums_32, sad_array);
+}
+
+#define HIGHBD_SAD32XNX4D(n)                                                   \
+  void vpx_highbd_sad32x##n##x4d_avx2(const uint8_t *src, int src_stride,      \
+                                      const uint8_t *const ref_array[4],       \
+                                      int ref_stride, uint32_t sad_array[4]) { \
+    highbd_sad32xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array,  \
+                           n);                                                 \
+  }
+
+#define HIGHBD_SADSKIP32XNx4D(n)                                             \
+  void vpx_highbd_sad_skip_32x##n##x4d_avx2(                                 \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+      int ref_stride, uint32_t sad_array[4]) {                               \
+    highbd_sad32xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride,   \
+                           sad_array, n / 2);                                \
+    sad_array[0] <<= 1;                                                      \
+    sad_array[1] <<= 1;                                                      \
+    sad_array[2] <<= 1;                                                      \
+    sad_array[3] <<= 1;                                                      \
+  }
+
+static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 /*[4]*/,
+                                               const uint16_t *src,
+                                               int src_stride,
+                                               uint16_t *refs[4],
+                                               int ref_stride, int height) {
+  int i;
+  for (i = 0; i < height; i++) {
+    __m256i r[4];
+
+    // load src and all ref[]
+    const __m256i s = _mm256_load_si256((const __m256i *)src);
+    r[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
+    r[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
+    r[2] = _mm256_loadu_si256((const __m256i *)refs[2]);
+    r[3] = _mm256_loadu_si256((const __m256i *)refs[3]);
+
+    // absolute differences between every ref[] to src
+    r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s));
+    r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s));
+    r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s));
+    r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s));
+
+    // sum every abs diff
+    sums_16[0] = _mm256_add_epi16(sums_16[0], r[0]);
+    sums_16[1] = _mm256_add_epi16(sums_16[1], r[1]);
+    sums_16[2] = _mm256_add_epi16(sums_16[2], r[2]);
+    sums_16[3] = _mm256_add_epi16(sums_16[3], r[3]);
+
+    src += src_stride;
+    refs[0] += ref_stride;
+    refs[1] += ref_stride;
+    refs[2] += ref_stride;
+    refs[3] += ref_stride;
+  }
+}
+
+static VPX_FORCE_INLINE void highbd_sad16xNx4d_avx2(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4],
+    int ref_stride, uint32_t sad_array[4], int n) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+  uint16_t *refs[4];
+  __m256i sums_16[4];
+  __m256i sums_32[4];
+  const int height = VPXMIN(16, n);
+  const int num_iters = n / height;
+  int i;
+
+  refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
+  refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
+  refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
+  refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
+  sums_32[0] = _mm256_setzero_si256();
+  sums_32[1] = _mm256_setzero_si256();
+  sums_32[2] = _mm256_setzero_si256();
+  sums_32[3] = _mm256_setzero_si256();
+
+  for (i = 0; i < num_iters; ++i) {
+    sums_16[0] = _mm256_setzero_si256();
+    sums_16[1] = _mm256_setzero_si256();
+    sums_16[2] = _mm256_setzero_si256();
+    sums_16[3] = _mm256_setzero_si256();
+
+    highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, height);
+
+    // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32
+    sums_32[0] = _mm256_add_epi32(
+        sums_32[0],
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))));
+    sums_32[1] = _mm256_add_epi32(
+        sums_32[1],
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))));
+    sums_32[2] = _mm256_add_epi32(
+        sums_32[2],
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))));
+    sums_32[3] = _mm256_add_epi32(
+        sums_32[3],
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))));
+
+    src += src_stride << 4;
+  }
+  calc_final_4(sums_32, sad_array);
+}
+
+#define HIGHBD_SAD16XNX4D(n)                                                   \
+  void vpx_highbd_sad16x##n##x4d_avx2(const uint8_t *src, int src_stride,      \
+                                      const uint8_t *const ref_array[4],       \
+                                      int ref_stride, uint32_t sad_array[4]) { \
+    highbd_sad16xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array,  \
+                           n);                                                 \
+  }
+
+#define HIGHBD_SADSKIP16XNx4D(n)                                             \
+  void vpx_highbd_sad_skip_16x##n##x4d_avx2(                                 \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+      int ref_stride, uint32_t sad_array[4]) {                               \
+    highbd_sad16xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride,   \
+                           sad_array, n / 2);                                \
+    sad_array[0] <<= 1;                                                      \
+    sad_array[1] <<= 1;                                                      \
+    sad_array[2] <<= 1;                                                      \
+    sad_array[3] <<= 1;                                                      \
+  }
+
+void vpx_highbd_sad16x16x4d_avx2(const uint8_t *src_ptr, int src_stride,
+                                 const uint8_t *const ref_array[4],
+                                 int ref_stride, uint32_t sad_array[4]) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+  uint16_t *refs[4];
+  __m256i sums_16[4];
+
+  refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
+  refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
+  refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
+  refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
+  sums_16[0] = _mm256_setzero_si256();
+  sums_16[1] = _mm256_setzero_si256();
+  sums_16[2] = _mm256_setzero_si256();
+  sums_16[3] = _mm256_setzero_si256();
+
+  highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 16);
+
+  {
+    __m256i sums_32[4];
+    sums_32[0] = _mm256_add_epi32(
+        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
+        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)));
+    sums_32[1] = _mm256_add_epi32(
+        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
+        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)));
+    sums_32[2] = _mm256_add_epi32(
+        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
+        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)));
+    sums_32[3] = _mm256_add_epi32(
+        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
+        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)));
+    calc_final_4(sums_32, sad_array);
+  }
+}
+
+void vpx_highbd_sad16x8x4d_avx2(const uint8_t *src_ptr, int src_stride,
+                                const uint8_t *const ref_array[4],
+                                int ref_stride, uint32_t sad_array[4]) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+  uint16_t *refs[4];
+  __m256i sums_16[4];
+
+  refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
+  refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
+  refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
+  refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
+  sums_16[0] = _mm256_setzero_si256();
+  sums_16[1] = _mm256_setzero_si256();
+  sums_16[2] = _mm256_setzero_si256();
+  sums_16[3] = _mm256_setzero_si256();
+
+  highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 8);
+
+  {
+    __m256i sums_32[4];
+    sums_32[0] = _mm256_add_epi32(
+        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
+        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)));
+    sums_32[1] = _mm256_add_epi32(
+        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
+        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)));
+    sums_32[2] = _mm256_add_epi32(
+        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
+        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)));
+    sums_32[3] = _mm256_add_epi32(
+        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
+        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)));
+    calc_final_4(sums_32, sad_array);
+  }
+}
+
+// clang-format off
+HIGHBD_SAD64XNX4D(64)
+HIGHBD_SADSKIP64XNx4D(64)
+
+HIGHBD_SAD64XNX4D(32)
+HIGHBD_SADSKIP64XNx4D(32)
+
+HIGHBD_SAD32XNX4D(64)
+HIGHBD_SADSKIP32XNx4D(64)
+
+HIGHBD_SAD32XNX4D(32)
+HIGHBD_SADSKIP32XNx4D(32)
+
+HIGHBD_SAD32XNX4D(16)
+HIGHBD_SADSKIP32XNx4D(16)
+
+HIGHBD_SAD16XNX4D(32)
+HIGHBD_SADSKIP16XNx4D(32)
+
+HIGHBD_SADSKIP16XNx4D(16)
+
+HIGHBD_SADSKIP16XNx4D(8)
+    // clang-format on
diff --git a/vpx_dsp/x86/highbd_sad4d_sse2.asm b/vpx_dsp/x86/highbd_sad4d_sse2.asm
index 6c2a61e019..a07892d811 100644
--- a/vpx_dsp/x86/highbd_sad4d_sse2.asm
+++ b/vpx_dsp/x86/highbd_sad4d_sse2.asm
@@ -213,7 +213,12 @@ SECTION .text
 ;                         uint8_t *ref[4], int ref_stride,
 ;                         uint32_t res[4]);
 ; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8
-%macro HIGH_SADNXN4D 2
+; Macro Arguments:
+;   1: Width
+;   2: Height
+;   3: If 0, then normal sad, if 2, then skip every other row
+%macro HIGH_SADNXN4D 2-3 0
+%if %3 == 0  ; normal sad
 %if UNIX64
 cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
                               res, ref2, ref3, ref4
@@ -221,6 +226,15 @@ cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
 cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
                               ref2, ref3, ref4
 %endif
+%else  ; %3 == 2, downsample
+%if UNIX64
+cglobal highbd_sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+                              res, ref2, ref3, ref4
+%else
+cglobal highbd_sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+                              ref2, ref3, ref4
+%endif  ;
+%endif  ; sad/avg/skip
 
 ; set m1
   push                srcq
@@ -229,6 +243,10 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
   pshufd                m1, m1, 0x0
   pop                 srcq
 
+%if %3 == 2  ; skip rows
+  lea          src_strided, [2*src_strided]
+  lea          ref_strided, [2*ref_strided]
+%endif  ; skip rows
   movsxdifnidn src_strideq, src_strided
   movsxdifnidn ref_strideq, ref_strided
   mov                ref2q, [ref1q+gprsize*1]
@@ -244,9 +262,15 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
   shl                ref1q, 1
 
   HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
-%rep (%2-4)/2
+%if %3 == 2  ;  Downsampling by two
+%define num_rep (%2-8)/4
+%else
+%define num_rep (%2-4)/2
+%endif
+%rep num_rep
   HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
 %endrep
+%undef rep
   HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
   ; N.B. HIGH_PROCESS outputs dwords (32 bits)
   ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM
@@ -265,6 +289,9 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
   paddd                 m4, m0
   paddd                 m6, m1
   punpcklqdq            m4, m6
+%if %3 == 2  ; skip rows
+  pslld                 m4, 1
+%endif
   movifnidn             r4, r4mp
   movu                [r4], m4
   RET
@@ -285,3 +312,15 @@ HIGH_SADNXN4D  8,  8
 HIGH_SADNXN4D  8,  4
 HIGH_SADNXN4D  4,  8
 HIGH_SADNXN4D  4,  4
+
+HIGH_SADNXN4D 64, 64, 2
+HIGH_SADNXN4D 64, 32, 2
+HIGH_SADNXN4D 32, 64, 2
+HIGH_SADNXN4D 32, 32, 2
+HIGH_SADNXN4D 32, 16, 2
+HIGH_SADNXN4D 16, 32, 2
+HIGH_SADNXN4D 16, 16, 2
+HIGH_SADNXN4D 16,  8, 2
+HIGH_SADNXN4D  8, 16, 2
+HIGH_SADNXN4D  8,  8, 2
+HIGH_SADNXN4D  4,  8, 2
diff --git a/vpx_dsp/x86/highbd_sad_avx2.c b/vpx_dsp/x86/highbd_sad_avx2.c
new file mode 100644
index 0000000000..78f8eb8bfa
--- /dev/null
+++ b/vpx_dsp/x86/highbd_sad_avx2.c
@@ -0,0 +1,522 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <immintrin.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+static VPX_FORCE_INLINE unsigned int calc_final(const __m256i sums_32) {
+  const __m256i t0 = _mm256_add_epi32(sums_32, _mm256_srli_si256(sums_32, 8));
+  const __m256i t1 = _mm256_add_epi32(t0, _mm256_srli_si256(t0, 4));
+  const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t1),
+                                    _mm256_extractf128_si256(t1, 1));
+  return (unsigned int)_mm_cvtsi128_si32(sum);
+}
+
+static VPX_FORCE_INLINE void highbd_sad64xH(__m256i *sums_16,
+                                            const uint16_t *src, int src_stride,
+                                            uint16_t *ref, int ref_stride,
+                                            int height) {
+  int i;
+  for (i = 0; i < height; ++i) {
+    // load src and all ref[]
+    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
+    const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
+    const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
+    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
+    const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32));
+    const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48));
+    // absolute differences between every ref[] to src
+    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
+    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
+    const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(r2, s2));
+    const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(r3, s3));
+    // sum every abs diff
+    *sums_16 =
+        _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1));
+    *sums_16 =
+        _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3));
+
+    src += src_stride;
+    ref += ref_stride;
+  }
+}
+
+static VPX_FORCE_INLINE unsigned int highbd_sad64xN_avx2(const uint8_t *src_ptr,
+                                                         int src_stride,
+                                                         const uint8_t *ref_ptr,
+                                                         int ref_stride,
+                                                         int n) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+  __m256i sums_32 = _mm256_setzero_si256();
+  int i;
+
+  for (i = 0; i < (n / 2); ++i) {
+    __m256i sums_16 = _mm256_setzero_si256();
+
+    highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2);
+
+    /* sums_16 will outrange after 2 rows, so add current sums_16 to
+     * sums_32*/
+    sums_32 = _mm256_add_epi32(
+        sums_32,
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));
+
+    src += src_stride << 1;
+    ref += ref_stride << 1;
+  }
+  return calc_final(sums_32);
+}
+
+#define HIGHBD_SAD64XN(n)                                                      \
+  unsigned int vpx_highbd_sad64x##n##_avx2(const uint8_t *src, int src_stride, \
+                                           const uint8_t *ref,                 \
+                                           int ref_stride) {                   \
+    return highbd_sad64xN_avx2(src, src_stride, ref, ref_stride, n);           \
+  }
+
+#define HIGHBD_SADSKIP64xN(n)                                                \
+  unsigned int vpx_highbd_sad_skip_64x##n##_avx2(                            \
+      const uint8_t *src, int src_stride, const uint8_t *ref,                \
+      int ref_stride) {                                                      \
+    return 2 * highbd_sad64xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \
+                                   n / 2);                                   \
+  }
+
+static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16,
+                                            const uint16_t *src, int src_stride,
+                                            uint16_t *ref, int ref_stride,
+                                            int height) {
+  int i;
+  for (i = 0; i < height; ++i) {
+    // load src and all ref[]
+    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
+    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
+    // absolute differences between every ref[] to src
+    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
+    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
+    // sum every abs diff
+    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
+    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);
+
+    src += src_stride;
+    ref += ref_stride;
+  }
+}
+
+static VPX_FORCE_INLINE unsigned int highbd_sad32xN_avx2(const uint8_t *src_ptr,
+                                                         int src_stride,
+                                                         const uint8_t *ref_ptr,
+                                                         int ref_stride,
+                                                         int n) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+  __m256i sums_32 = _mm256_setzero_si256();
+  int i;
+
+  for (i = 0; i < (n / 8); ++i) {
+    __m256i sums_16 = _mm256_setzero_si256();
+
+    highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8);
+
+    /* sums_16 will outrange after 8 rows, so add current sums_16 to
+     * sums_32*/
+    sums_32 = _mm256_add_epi32(
+        sums_32,
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));
+
+    src += src_stride << 3;
+    ref += ref_stride << 3;
+  }
+  return calc_final(sums_32);
+}
+
+#define HIGHBD_SAD32XN(n)                                                      \
+  unsigned int vpx_highbd_sad32x##n##_avx2(const uint8_t *src, int src_stride, \
+                                           const uint8_t *ref,                 \
+                                           int ref_stride) {                   \
+    return highbd_sad32xN_avx2(src, src_stride, ref, ref_stride, n);           \
+  }
+
+#define HIGHBD_SADSKIP32xN(n)                                                \
+  unsigned int vpx_highbd_sad_skip_32x##n##_avx2(                            \
+      const uint8_t *src, int src_stride, const uint8_t *ref,                \
+      int ref_stride) {                                                      \
+    return 2 * highbd_sad32xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \
+                                   n / 2);                                   \
+  }
+
+static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16,
+                                            const uint16_t *src, int src_stride,
+                                            uint16_t *ref, int ref_stride,
+                                            int height) {
+  int i;
+  for (i = 0; i < height; i += 2) {
+    // load src and all ref[]
+    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride));
+    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
+    // absolute differences between every ref[] to src
+    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
+    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
+    // sum every abs diff
+    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
+    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);
+
+    src += src_stride << 1;
+    ref += ref_stride << 1;
+  }
+}
+
+static VPX_FORCE_INLINE unsigned int highbd_sad16xN_avx2(const uint8_t *src_ptr,
+                                                         int src_stride,
+                                                         const uint8_t *ref_ptr,
+                                                         int ref_stride,
+                                                         int n) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+  __m256i sums_32 = _mm256_setzero_si256();
+  const int height = VPXMIN(16, n);
+  const int num_iters = n / height;
+  int i;
+
+  for (i = 0; i < num_iters; ++i) {
+    __m256i sums_16 = _mm256_setzero_si256();
+
+    highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, height);
+
+    // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32
+    sums_32 = _mm256_add_epi32(
+        sums_32,
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));
+
+    src += src_stride << 4;
+    ref += ref_stride << 4;
+  }
+  return calc_final(sums_32);
+}
+
+#define HIGHBD_SAD16XN(n)                                                      \
+  unsigned int vpx_highbd_sad16x##n##_avx2(const uint8_t *src, int src_stride, \
+                                           const uint8_t *ref,                 \
+                                           int ref_stride) {                   \
+    return highbd_sad16xN_avx2(src, src_stride, ref, ref_stride, n);           \
+  }
+
+#define HIGHBD_SADSKIP16xN(n)                                                \
+  unsigned int vpx_highbd_sad_skip_16x##n##_avx2(                            \
+      const uint8_t *src, int src_stride, const uint8_t *ref,                \
+      int ref_stride) {                                                      \
+    return 2 * highbd_sad16xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \
+                                   n / 2);                                   \
+  }
+
+unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src_ptr, int src_stride,
+                                      const uint8_t *ref_ptr, int ref_stride) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+  __m256i sums_16 = _mm256_setzero_si256();
+
+  highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16);
+
+  {
+    const __m256i sums_32 = _mm256_add_epi32(
+        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
+    return calc_final(sums_32);
+  }
+}
+
+unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src_ptr, int src_stride,
+                                     const uint8_t *ref_ptr, int ref_stride) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+  __m256i sums_16 = _mm256_setzero_si256();
+
+  highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 8);
+
+  {
+    const __m256i sums_32 = _mm256_add_epi32(
+        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
+    return calc_final(sums_32);
+  }
+}
+
+// clang-format off
+HIGHBD_SAD64XN(64)
+HIGHBD_SADSKIP64xN(64)
+HIGHBD_SAD64XN(32)
+HIGHBD_SADSKIP64xN(32)
+HIGHBD_SAD32XN(64)
+HIGHBD_SADSKIP32xN(64)
+HIGHBD_SAD32XN(32)
+HIGHBD_SADSKIP32xN(32)
+HIGHBD_SAD32XN(16)
+HIGHBD_SADSKIP32xN(16)
+HIGHBD_SAD16XN(32)
+HIGHBD_SADSKIP16xN(32)
+HIGHBD_SADSKIP16xN(16)
+HIGHBD_SADSKIP16xN(8)
+//clang-format on
+
+// AVG -------------------------------------------------------------------------
+static VPX_FORCE_INLINE void highbd_sad64xH_avg(__m256i *sums_16,
+                                                const uint16_t *src,
+                                                int src_stride, uint16_t *ref,
+                                                int ref_stride, uint16_t *sec,
+                                                int height) {
+  int i;
+  for (i = 0; i < height; ++i) {
+    // load src and all ref[]
+    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
+    const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
+    const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
+    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
+    const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32));
+    const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48));
+    const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec);
+    const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16));
+    const __m256i x2 = _mm256_loadu_si256((const __m256i *)(sec + 32));
+    const __m256i x3 = _mm256_loadu_si256((const __m256i *)(sec + 48));
+    const __m256i avg0 = _mm256_avg_epu16(r0, x0);
+    const __m256i avg1 = _mm256_avg_epu16(r1, x1);
+    const __m256i avg2 = _mm256_avg_epu16(r2, x2);
+    const __m256i avg3 = _mm256_avg_epu16(r3, x3);
+    // absolute differences between every ref/pred avg to src
+    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0));
+    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1));
+    const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(avg2, s2));
+    const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(avg3, s3));
+    // sum every abs diff
+    *sums_16 =
+        _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1));
+    *sums_16 =
+        _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3));
+
+    src += src_stride;
+    ref += ref_stride;
+    sec += 64;
+  }
+}
+
+#define HIGHBD_SAD64XN_AVG(n)                                                 \
+  unsigned int vpx_highbd_sad64x##n##_avg_avx2(                               \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
+      int ref_stride, const uint8_t *second_pred) {                           \
+    const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                       \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                             \
+    uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);                         \
+    __m256i sums_32 = _mm256_setzero_si256();                                 \
+    int i;                                                                    \
+                                                                              \
+    for (i = 0; i < (n / 2); ++i) {                                           \
+      __m256i sums_16 = _mm256_setzero_si256();                               \
+                                                                              \
+      highbd_sad64xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 2); \
+                                                                              \
+      /* sums_16 will outrange after 2 rows, so add current sums_16 to        \
+       * sums_32*/                                                            \
+      sums_32 = _mm256_add_epi32(                                             \
+          sums_32,                                                            \
+          _mm256_add_epi32(                                                   \
+              _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),         \
+              _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));  \
+                                                                              \
+      src += src_stride << 1;                                                 \
+      ref += ref_stride << 1;                                                 \
+      sec += 64 << 1;                                                         \
+    }                                                                         \
+    return calc_final(sums_32);                                               \
+  }
+
+// 64x64
+HIGHBD_SAD64XN_AVG(64)
+
+// 64x32
+HIGHBD_SAD64XN_AVG(32)
+
+static VPX_FORCE_INLINE void highbd_sad32xH_avg(__m256i *sums_16,
+                                                const uint16_t *src,
+                                                int src_stride, uint16_t *ref,
+                                                int ref_stride, uint16_t *sec,
+                                                int height) {
+  int i;
+  for (i = 0; i < height; ++i) {
+    // load src and all ref[]
+    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
+    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
+    const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec);
+    const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16));
+    const __m256i avg0 = _mm256_avg_epu16(r0, x0);
+    const __m256i avg1 = _mm256_avg_epu16(r1, x1);
+    // absolute differences between every ref/pred avg to src
+    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0));
+    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1));
+    // sum every abs diff
+    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
+    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);
+
+    src += src_stride;
+    ref += ref_stride;
+    sec += 32;
+  }
+}
+
+#define HIGHBD_SAD32XN_AVG(n)                                                 \
+  unsigned int vpx_highbd_sad32x##n##_avg_avx2(                               \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
+      int ref_stride, const uint8_t *second_pred) {                           \
+    const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                       \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                             \
+    uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);                         \
+    __m256i sums_32 = _mm256_setzero_si256();                                 \
+    int i;                                                                    \
+                                                                              \
+    for (i = 0; i < (n / 8); ++i) {                                           \
+      __m256i sums_16 = _mm256_setzero_si256();                               \
+                                                                              \
+      highbd_sad32xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8); \
+                                                                              \
+      /* sums_16 will outrange after 8 rows, so add current sums_16 to        \
+       * sums_32*/                                                            \
+      sums_32 = _mm256_add_epi32(                                             \
+          sums_32,                                                            \
+          _mm256_add_epi32(                                                   \
+              _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),         \
+              _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));  \
+                                                                              \
+      src += src_stride << 3;                                                 \
+      ref += ref_stride << 3;                                                 \
+      sec += 32 << 3;                                                         \
+    }                                                                         \
+    return calc_final(sums_32);                                               \
+  }
+
+// 32x64
+HIGHBD_SAD32XN_AVG(64)
+
+// 32x32
+HIGHBD_SAD32XN_AVG(32)
+
+// 32x16
+HIGHBD_SAD32XN_AVG(16)
+
+static VPX_FORCE_INLINE void highbd_sad16xH_avg(__m256i *sums_16,
+                                                const uint16_t *src,
+                                                int src_stride, uint16_t *ref,
+                                                int ref_stride, uint16_t *sec,
+                                                int height) {
+  int i;
+  for (i = 0; i < height; i += 2) {
+    // load src and all ref[]
+    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride));
+    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
+    const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec);
+    const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16));
+    const __m256i avg0 = _mm256_avg_epu16(r0, x0);
+    const __m256i avg1 = _mm256_avg_epu16(r1, x1);
+    // absolute differences between every ref[] to src
+    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0));
+    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1));
+    // sum every abs diff
+    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
+    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);
+
+    src += src_stride << 1;
+    ref += ref_stride << 1;
+    sec += 32;
+  }
+}
+
+unsigned int vpx_highbd_sad16x32_avg_avx2(const uint8_t *src_ptr,
+                                          int src_stride,
+                                          const uint8_t *ref_ptr,
+                                          int ref_stride,
+                                          const uint8_t *second_pred) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+  uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
+  __m256i sums_32 = _mm256_setzero_si256();
+  int i;
+
+  for (i = 0; i < 2; ++i) {
+    __m256i sums_16 = _mm256_setzero_si256();
+
+    highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16);
+
+    // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32
+    sums_32 = _mm256_add_epi32(
+        sums_32,
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));
+
+    src += src_stride << 4;
+    ref += ref_stride << 4;
+    sec += 16 << 4;
+  }
+  return calc_final(sums_32);
+}
+
+unsigned int vpx_highbd_sad16x16_avg_avx2(const uint8_t *src_ptr,
+                                          int src_stride,
+                                          const uint8_t *ref_ptr,
+                                          int ref_stride,
+                                          const uint8_t *second_pred) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+  uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
+  __m256i sums_16 = _mm256_setzero_si256();
+
+  highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16);
+
+  {
+    const __m256i sums_32 = _mm256_add_epi32(
+        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
+    return calc_final(sums_32);
+  }
+}
+
+unsigned int vpx_highbd_sad16x8_avg_avx2(const uint8_t *src_ptr, int src_stride,
+                                         const uint8_t *ref_ptr, int ref_stride,
+                                         const uint8_t *second_pred) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+  uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
+  __m256i sums_16 = _mm256_setzero_si256();
+
+  highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8);
+
+  {
+    const __m256i sums_32 = _mm256_add_epi32(
+        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
+    return calc_final(sums_32);
+  }
+}
diff --git a/vpx_dsp/x86/highbd_sad_sse2.asm b/vpx_dsp/x86/highbd_sad_sse2.asm
index 6a1a6f3d62..62ad2237ff 100644
--- a/vpx_dsp/x86/highbd_sad_sse2.asm
+++ b/vpx_dsp/x86/highbd_sad_sse2.asm
@@ -12,6 +12,11 @@
 
 SECTION .text
 
+; Macro Arguments
+; Arg 1: Width
+; Arg 2: Height
+; Arg 3: Number of general purpose registers
+; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
 %macro HIGH_SAD_FN 4
 %if %4 == 0
 %if %3 == 5
@@ -20,7 +25,7 @@ cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
 cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
                             src_stride3, ref_stride3, n_rows
 %endif ; %3 == 5/7
-%else ; avg
+%elif %4 == 1 ; avg
 %if %3 == 5
 cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \
                                     second_pred, n_rows
@@ -35,7 +40,18 @@ cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \
 %define n_rowsd dword r0m
 %endif ; x86-32/64
 %endif ; %3 == 5/7
-%endif ; avg/sad
+%else  ; %4 == 2, skip rows
+%if %3 == 5
+cglobal highbd_sad_skip_%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal highbd_sad_skip_%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
+                            src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+%endif ; sad/avg/skip
+%if %4 == 2  ; double the stride if we are skipping rows
+  lea          src_strided, [src_strided*2]
+  lea          ref_strided, [ref_strided*2]
+%endif
   movsxdifnidn src_strideq, src_strided
   movsxdifnidn ref_strideq, ref_strided
 %if %3 == 7
@@ -54,7 +70,11 @@ cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \
 ;                                    uint8_t *ref, int ref_stride);
 %macro HIGH_SAD64XN 1-2 0
   HIGH_SAD_FN 64, %1, 5, %2
+%if %2 == 2  ; skip rows, so divide number of rows by 2
+  mov              n_rowsd, %1/2
+%else
   mov              n_rowsd, %1
+%endif
   pxor                  m0, m0
   pxor                  m6, m6
 
@@ -146,6 +166,9 @@ cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \
   punpckldq             m0, m6
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2  ; we skipped rows, so we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
@@ -155,13 +178,19 @@ HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
 HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
 HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
 HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
+HIGH_SAD64XN 64, 2 ; highbd_sad_skip_64x64_sse2
+HIGH_SAD64XN 32, 2 ; highbd_sad_skip_64x32_sse2
 
 
 ; unsigned int vpx_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
 %macro HIGH_SAD32XN 1-2 0
   HIGH_SAD_FN 32, %1, 5, %2
+%if %2 == 2  ; skip rows, so divide number of rows by 2
+  mov              n_rowsd, %1/2
+%else
   mov              n_rowsd, %1
+%endif
   pxor                  m0, m0
   pxor                  m6, m6
 
@@ -213,6 +242,9 @@ HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
   punpckldq             m0, m6
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2  ; we skipped rows, so we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
@@ -224,12 +256,19 @@ HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
 HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
 HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
 HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
+HIGH_SAD32XN 64, 2 ; highbd_sad_skip_32x64_sse2
+HIGH_SAD32XN 32, 2 ; highbd_sad_skip_32x32_sse2
+HIGH_SAD32XN 16, 2 ; highbd_sad_skip_32x16_sse2
 
 ; unsigned int vpx_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
 %macro HIGH_SAD16XN 1-2 0
   HIGH_SAD_FN 16, %1, 5, %2
+%if %2 == 2  ; skip rows, so divide number of rows by 2
+  mov              n_rowsd, %1/4
+%else
   mov              n_rowsd, %1/2
+%endif
   pxor                  m0, m0
   pxor                  m6, m6
 
@@ -281,6 +320,9 @@ HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
   punpckldq             m0, m6
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2  ; we skipped rows, so we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
@@ -292,13 +334,19 @@ HIGH_SAD16XN  8 ; highbd_sad16x8_sse2
 HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
 HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
 HIGH_SAD16XN  8, 1 ; highbd_sad16x8_avg_sse2
-
+HIGH_SAD16XN 32, 2 ; highbd_sad_skip_16x32_sse2
+HIGH_SAD16XN 16, 2 ; highbd_sad_skip_16x16_sse2
+HIGH_SAD16XN  8, 2 ; highbd_sad_skip_16x8_sse2
 
 ; unsigned int vpx_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
 %macro HIGH_SAD8XN 1-2 0
   HIGH_SAD_FN 8, %1, 7, %2
+%if %2 == 2  ; skip rows, so divide number of rows by 2
+  mov              n_rowsd, %1/8
+%else
   mov              n_rowsd, %1/4
+%endif
   pxor                  m0, m0
   pxor                  m6, m6
 
@@ -350,6 +398,9 @@ HIGH_SAD16XN  8, 1 ; highbd_sad16x8_avg_sse2
   punpckldq             m0, m6
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2  ; we skipped rows, so we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
@@ -361,3 +412,5 @@ HIGH_SAD8XN  4 ; highbd_sad8x4_sse2
 HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
 HIGH_SAD8XN  8, 1 ; highbd_sad8x8_avg_sse2
 HIGH_SAD8XN  4, 1 ; highbd_sad8x4_avg_sse2
+HIGH_SAD8XN 16, 2 ; highbd_sad_skip_8x16_sse2
+HIGH_SAD8XN  8, 2 ; highbd_sad_skip_8x8_sse2
diff --git a/vpx_dsp/x86/highbd_variance_sse2.c b/vpx_dsp/x86/highbd_variance_sse2.c
index dd6cfbb2c4..381e0ad193 100644
--- a/vpx_dsp/x86/highbd_variance_sse2.c
+++ b/vpx_dsp/x86/highbd_variance_sse2.c
@@ -7,6 +7,7 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include <emmintrin.h>  // SSE2
 
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
@@ -121,8 +122,8 @@ static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
     *sse = ROUND_POWER_OF_TWO(*sse, 8);                                       \
   }
 
-HIGH_GET_VAR(16);
-HIGH_GET_VAR(8);
+HIGH_GET_VAR(16)
+HIGH_GET_VAR(8)
 
 #undef HIGH_GET_VAR
 
@@ -167,16 +168,16 @@ HIGH_GET_VAR(8);
     return (var >= 0) ? (uint32_t)var : 0;                                 \
   }
 
-VAR_FN(64, 64, 16, 12);
-VAR_FN(64, 32, 16, 11);
-VAR_FN(32, 64, 16, 11);
-VAR_FN(32, 32, 16, 10);
-VAR_FN(32, 16, 16, 9);
-VAR_FN(16, 32, 16, 9);
-VAR_FN(16, 16, 16, 8);
-VAR_FN(16, 8, 8, 7);
-VAR_FN(8, 16, 8, 7);
-VAR_FN(8, 8, 8, 6);
+VAR_FN(64, 64, 16, 12)
+VAR_FN(64, 32, 16, 11)
+VAR_FN(32, 64, 16, 11)
+VAR_FN(32, 32, 16, 10)
+VAR_FN(32, 16, 16, 9)
+VAR_FN(16, 32, 16, 9)
+VAR_FN(16, 16, 16, 8)
+VAR_FN(16, 8, 8, 7)
+VAR_FN(8, 16, 8, 7)
+VAR_FN(8, 8, 8, 6)
 
 #undef VAR_FN
 
@@ -255,10 +256,10 @@ unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
       const uint16_t *ref, ptrdiff_t ref_stride, int height,                 \
       unsigned int *sse, void *unused0, void *unused);
 #define DECLS(opt) \
-  DECL(8, opt);    \
+  DECL(8, opt)     \
   DECL(16, opt)
 
-DECLS(sse2);
+DECLS(sse2)
 
 #undef DECLS
 #undef DECL
@@ -383,20 +384,20 @@ DECLS(sse2);
     return (var >= 0) ? (uint32_t)var : 0;                                     \
   }
 
-#define FNS(opt)                        \
-  FN(64, 64, 16, 6, 6, opt, (int64_t)); \
-  FN(64, 32, 16, 6, 5, opt, (int64_t)); \
-  FN(32, 64, 16, 5, 6, opt, (int64_t)); \
-  FN(32, 32, 16, 5, 5, opt, (int64_t)); \
-  FN(32, 16, 16, 5, 4, opt, (int64_t)); \
-  FN(16, 32, 16, 4, 5, opt, (int64_t)); \
-  FN(16, 16, 16, 4, 4, opt, (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt, (int64_t));  \
-  FN(8, 16, 8, 3, 4, opt, (int64_t));   \
-  FN(8, 8, 8, 3, 3, opt, (int64_t));    \
-  FN(8, 4, 8, 3, 2, opt, (int64_t));
-
-FNS(sse2);
+#define FNS(opt)                       \
+  FN(64, 64, 16, 6, 6, opt, (int64_t)) \
+  FN(64, 32, 16, 6, 5, opt, (int64_t)) \
+  FN(32, 64, 16, 5, 6, opt, (int64_t)) \
+  FN(32, 32, 16, 5, 5, opt, (int64_t)) \
+  FN(32, 16, 16, 5, 4, opt, (int64_t)) \
+  FN(16, 32, 16, 4, 5, opt, (int64_t)) \
+  FN(16, 16, 16, 4, 4, opt, (int64_t)) \
+  FN(16, 8, 16, 4, 3, opt, (int64_t))  \
+  FN(8, 16, 8, 3, 4, opt, (int64_t))   \
+  FN(8, 8, 8, 3, 3, opt, (int64_t))    \
+  FN(8, 4, 8, 3, 2, opt, (int64_t))
+
+FNS(sse2)
 
 #undef FNS
 #undef FN
@@ -412,7 +413,7 @@ FNS(sse2);
   DECL(16, opt1)    \
   DECL(8, opt1)
 
-DECLS(sse2);
+DECLS(sse2)
 #undef DECL
 #undef DECLS
 
@@ -542,20 +543,66 @@ DECLS(sse2);
     return (var >= 0) ? (uint32_t)var : 0;                                     \
   }
 
-#define FNS(opt1)                        \
-  FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
-  FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
-  FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
-  FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
-  FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
-  FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
-  FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt1, (int64_t));  \
-  FN(8, 16, 8, 4, 3, opt1, (int64_t));   \
-  FN(8, 8, 8, 3, 3, opt1, (int64_t));    \
-  FN(8, 4, 8, 3, 2, opt1, (int64_t));
-
-FNS(sse2);
+#define FNS(opt1)                       \
+  FN(64, 64, 16, 6, 6, opt1, (int64_t)) \
+  FN(64, 32, 16, 6, 5, opt1, (int64_t)) \
+  FN(32, 64, 16, 5, 6, opt1, (int64_t)) \
+  FN(32, 32, 16, 5, 5, opt1, (int64_t)) \
+  FN(32, 16, 16, 5, 4, opt1, (int64_t)) \
+  FN(16, 32, 16, 4, 5, opt1, (int64_t)) \
+  FN(16, 16, 16, 4, 4, opt1, (int64_t)) \
+  FN(16, 8, 16, 4, 3, opt1, (int64_t))  \
+  FN(8, 16, 8, 4, 3, opt1, (int64_t))   \
+  FN(8, 8, 8, 3, 3, opt1, (int64_t))    \
+  FN(8, 4, 8, 3, 2, opt1, (int64_t))
+
+FNS(sse2)
 
 #undef FNS
 #undef FN
+
+void vpx_highbd_comp_avg_pred_sse2(uint16_t *comp_pred, const uint16_t *pred,
+                                   int width, int height, const uint16_t *ref,
+                                   int ref_stride) {
+  int i, j;
+  if (width > 8) {
+    for (i = 0; i < height; ++i) {
+      for (j = 0; j < width; j += 16) {
+        const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[j]);
+        const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[j + 8]);
+        const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[j]);
+        const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[j + 8]);
+        _mm_storeu_si128((__m128i *)&comp_pred[j], _mm_avg_epu16(p0, r0));
+        _mm_storeu_si128((__m128i *)&comp_pred[j + 8], _mm_avg_epu16(p1, r1));
+      }
+      comp_pred += width;
+      pred += width;
+      ref += ref_stride;
+    }
+  } else if (width == 8) {
+    for (i = 0; i < height; i += 2) {
+      const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[0]);
+      const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[8]);
+      const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[0]);
+      const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[ref_stride]);
+      _mm_storeu_si128((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0));
+      _mm_storeu_si128((__m128i *)&comp_pred[8], _mm_avg_epu16(p1, r1));
+      comp_pred += 8 << 1;
+      pred += 8 << 1;
+      ref += ref_stride << 1;
+    }
+  } else {
+    assert(width == 4);
+    for (i = 0; i < height; i += 2) {
+      const __m128i p0 = _mm_loadl_epi64((const __m128i *)&pred[0]);
+      const __m128i p1 = _mm_loadl_epi64((const __m128i *)&pred[4]);
+      const __m128i r0 = _mm_loadl_epi64((const __m128i *)&ref[0]);
+      const __m128i r1 = _mm_loadl_epi64((const __m128i *)&ref[ref_stride]);
+      _mm_storel_epi64((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0));
+      _mm_storel_epi64((__m128i *)&comp_pred[4], _mm_avg_epu16(p1, r1));
+      comp_pred += 4 << 1;
+      pred += 4 << 1;
+      ref += ref_stride << 1;
+    }
+  }
+}
diff --git a/vpx_dsp/x86/inv_txfm_avx2.c b/vpx_dsp/x86/inv_txfm_avx2.c
new file mode 100644
index 0000000000..752435d240
--- /dev/null
+++ b/vpx_dsp/x86/inv_txfm_avx2.c
@@ -0,0 +1,626 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>  // AVX2
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+
+#define PAIR256_SET_EPI16(a, b)                                            \
+  _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+                   (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+                   (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+                   (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
+
+static INLINE void idct_load16x16(const tran_low_t *input, __m256i *in,
+                                  int stride) {
+  int i;
+  // Load 16x16 values
+  for (i = 0; i < 16; i++) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    const __m128i in0 = _mm_loadu_si128((const __m128i *)(input + i * stride));
+    const __m128i in1 =
+        _mm_loadu_si128((const __m128i *)((input + i * stride) + 4));
+    const __m128i in2 =
+        _mm_loadu_si128((const __m128i *)((input + i * stride) + 8));
+    const __m128i in3 =
+        _mm_loadu_si128((const __m128i *)((input + i * stride) + 12));
+    const __m128i ls = _mm_packs_epi32(in0, in1);
+    const __m128i rs = _mm_packs_epi32(in2, in3);
+    in[i] = _mm256_inserti128_si256(_mm256_castsi128_si256(ls), rs, 1);
+#else
+    in[i] = _mm256_load_si256((const __m256i *)(input + i * stride));
+#endif
+  }
+}
+
+static INLINE __m256i dct_round_shift_avx2(__m256i in) {
+  const __m256i t = _mm256_add_epi32(in, _mm256_set1_epi32(DCT_CONST_ROUNDING));
+  return _mm256_srai_epi32(t, DCT_CONST_BITS);
+}
+
+static INLINE __m256i idct_madd_round_shift_avx2(__m256i *in, __m256i *cospi) {
+  const __m256i t = _mm256_madd_epi16(*in, *cospi);
+  return dct_round_shift_avx2(t);
+}
+
+// Calculate the dot product between in0/1 and x and wrap to short.
+static INLINE __m256i idct_calc_wraplow_avx2(__m256i *in0, __m256i *in1,
+                                             __m256i *x) {
+  const __m256i t0 = idct_madd_round_shift_avx2(in0, x);
+  const __m256i t1 = idct_madd_round_shift_avx2(in1, x);
+  return _mm256_packs_epi32(t0, t1);
+}
+
+// Multiply elements by constants and add them together.
+static INLINE void butterfly16(__m256i in0, __m256i in1, int c0, int c1,
+                               __m256i *out0, __m256i *out1) {
+  __m256i cst0 = PAIR256_SET_EPI16(c0, -c1);
+  __m256i cst1 = PAIR256_SET_EPI16(c1, c0);
+  __m256i lo = _mm256_unpacklo_epi16(in0, in1);
+  __m256i hi = _mm256_unpackhi_epi16(in0, in1);
+  *out0 = idct_calc_wraplow_avx2(&lo, &hi, &cst0);
+  *out1 = idct_calc_wraplow_avx2(&lo, &hi, &cst1);
+}
+
+static INLINE void idct16_16col(__m256i *in, __m256i *out) {
+  __m256i step1[16], step2[16];
+
+  // stage 2
+  butterfly16(in[1], in[15], cospi_30_64, cospi_2_64, &step2[8], &step2[15]);
+  butterfly16(in[9], in[7], cospi_14_64, cospi_18_64, &step2[9], &step2[14]);
+  butterfly16(in[5], in[11], cospi_22_64, cospi_10_64, &step2[10], &step2[13]);
+  butterfly16(in[13], in[3], cospi_6_64, cospi_26_64, &step2[11], &step2[12]);
+
+  // stage 3
+  butterfly16(in[2], in[14], cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
+  butterfly16(in[10], in[6], cospi_12_64, cospi_20_64, &step1[5], &step1[6]);
+  step1[8] = _mm256_add_epi16(step2[8], step2[9]);
+  step1[9] = _mm256_sub_epi16(step2[8], step2[9]);
+  step1[10] = _mm256_sub_epi16(step2[11], step2[10]);
+  step1[11] = _mm256_add_epi16(step2[10], step2[11]);
+  step1[12] = _mm256_add_epi16(step2[12], step2[13]);
+  step1[13] = _mm256_sub_epi16(step2[12], step2[13]);
+  step1[14] = _mm256_sub_epi16(step2[15], step2[14]);
+  step1[15] = _mm256_add_epi16(step2[14], step2[15]);
+
+  // stage 4
+  butterfly16(in[0], in[8], cospi_16_64, cospi_16_64, &step2[1], &step2[0]);
+  butterfly16(in[4], in[12], cospi_24_64, cospi_8_64, &step2[2], &step2[3]);
+  butterfly16(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
+              &step2[14]);
+  butterfly16(step1[10], step1[13], -cospi_8_64, -cospi_24_64, &step2[13],
+              &step2[10]);
+  step2[5] = _mm256_sub_epi16(step1[4], step1[5]);
+  step1[4] = _mm256_add_epi16(step1[4], step1[5]);
+  step2[6] = _mm256_sub_epi16(step1[7], step1[6]);
+  step1[7] = _mm256_add_epi16(step1[6], step1[7]);
+  step2[8] = step1[8];
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[15] = step1[15];
+
+  // stage 5
+  step1[0] = _mm256_add_epi16(step2[0], step2[3]);
+  step1[1] = _mm256_add_epi16(step2[1], step2[2]);
+  step1[2] = _mm256_sub_epi16(step2[1], step2[2]);
+  step1[3] = _mm256_sub_epi16(step2[0], step2[3]);
+  butterfly16(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5],
+              &step1[6]);
+  step1[8] = _mm256_add_epi16(step2[8], step2[11]);
+  step1[9] = _mm256_add_epi16(step2[9], step2[10]);
+  step1[10] = _mm256_sub_epi16(step2[9], step2[10]);
+  step1[11] = _mm256_sub_epi16(step2[8], step2[11]);
+  step1[12] = _mm256_sub_epi16(step2[15], step2[12]);
+  step1[13] = _mm256_sub_epi16(step2[14], step2[13]);
+  step1[14] = _mm256_add_epi16(step2[14], step2[13]);
+  step1[15] = _mm256_add_epi16(step2[15], step2[12]);
+
+  // stage 6
+  step2[0] = _mm256_add_epi16(step1[0], step1[7]);
+  step2[1] = _mm256_add_epi16(step1[1], step1[6]);
+  step2[2] = _mm256_add_epi16(step1[2], step1[5]);
+  step2[3] = _mm256_add_epi16(step1[3], step1[4]);
+  step2[4] = _mm256_sub_epi16(step1[3], step1[4]);
+  step2[5] = _mm256_sub_epi16(step1[2], step1[5]);
+  step2[6] = _mm256_sub_epi16(step1[1], step1[6]);
+  step2[7] = _mm256_sub_epi16(step1[0], step1[7]);
+  butterfly16(step1[13], step1[10], cospi_16_64, cospi_16_64, &step2[10],
+              &step2[13]);
+  butterfly16(step1[12], step1[11], cospi_16_64, cospi_16_64, &step2[11],
+              &step2[12]);
+
+  // stage 7
+  out[0] = _mm256_add_epi16(step2[0], step1[15]);
+  out[1] = _mm256_add_epi16(step2[1], step1[14]);
+  out[2] = _mm256_add_epi16(step2[2], step2[13]);
+  out[3] = _mm256_add_epi16(step2[3], step2[12]);
+  out[4] = _mm256_add_epi16(step2[4], step2[11]);
+  out[5] = _mm256_add_epi16(step2[5], step2[10]);
+  out[6] = _mm256_add_epi16(step2[6], step1[9]);
+  out[7] = _mm256_add_epi16(step2[7], step1[8]);
+  out[8] = _mm256_sub_epi16(step2[7], step1[8]);
+  out[9] = _mm256_sub_epi16(step2[6], step1[9]);
+  out[10] = _mm256_sub_epi16(step2[5], step2[10]);
+  out[11] = _mm256_sub_epi16(step2[4], step2[11]);
+  out[12] = _mm256_sub_epi16(step2[3], step2[12]);
+  out[13] = _mm256_sub_epi16(step2[2], step2[13]);
+  out[14] = _mm256_sub_epi16(step2[1], step1[14]);
+  out[15] = _mm256_sub_epi16(step2[0], step1[15]);
+}
+
+static INLINE void recon_and_store16(uint8_t *dest, __m256i in_x) {
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i d0 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dest)));
+  d0 = _mm256_permute4x64_epi64(d0, 0xd8);
+  d0 = _mm256_unpacklo_epi8(d0, zero);
+  d0 = _mm256_add_epi16(in_x, d0);
+  d0 = _mm256_packus_epi16(
+      d0, _mm256_castsi128_si256(_mm256_extractf128_si256(d0, 1)));
+
+  _mm_storeu_si128((__m128i *)dest, _mm256_castsi256_si128(d0));
+}
+
+static INLINE void write_buffer_16x1(uint8_t *dest, __m256i in) {
+  const __m256i final_rounding = _mm256_set1_epi16(1 << 5);
+  __m256i out;
+  out = _mm256_adds_epi16(in, final_rounding);
+  out = _mm256_srai_epi16(out, 6);
+  recon_and_store16(dest, out);
+}
+
+static INLINE void store_buffer_16x32(__m256i *in, uint8_t *dst, int stride) {
+  const __m256i final_rounding = _mm256_set1_epi16(1 << 5);
+  int j = 0;
+  while (j < 32) {
+    in[j] = _mm256_adds_epi16(in[j], final_rounding);
+    in[j + 1] = _mm256_adds_epi16(in[j + 1], final_rounding);
+
+    in[j] = _mm256_srai_epi16(in[j], 6);
+    in[j + 1] = _mm256_srai_epi16(in[j + 1], 6);
+
+    recon_and_store16(dst, in[j]);
+    dst += stride;
+    recon_and_store16(dst, in[j + 1]);
+    dst += stride;
+    j += 2;
+  }
+}
+
+static INLINE void transpose2_8x8_avx2(__m256i *in, __m256i *out) {
+  int i;
+  __m256i t[16], u[16];
+  // (1st, 2nd) ==> (lo, hi)
+  //   (0, 1)   ==>  (0, 1)
+  //   (2, 3)   ==>  (2, 3)
+  //   (4, 5)   ==>  (4, 5)
+  //   (6, 7)   ==>  (6, 7)
+  for (i = 0; i < 4; i++) {
+    t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]);
+    t[2 * i + 1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]);
+  }
+
+  // (1st, 2nd) ==> (lo, hi)
+  //   (0, 2)   ==>  (0, 2)
+  //   (1, 3)   ==>  (1, 3)
+  //   (4, 6)   ==>  (4, 6)
+  //   (5, 7)   ==>  (5, 7)
+  for (i = 0; i < 2; i++) {
+    u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]);
+    u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]);
+
+    u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]);
+    u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]);
+  }
+
+  // (1st, 2nd) ==> (lo, hi)
+  //   (0, 4)   ==>  (0, 1)
+  //   (1, 5)   ==>  (4, 5)
+  //   (2, 6)   ==>  (2, 3)
+  //   (3, 7)   ==>  (6, 7)
+  for (i = 0; i < 2; i++) {
+    out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]);
+    out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]);
+
+    out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]);
+    out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]);
+  }
+}
+
+static INLINE void transpose_16bit_16x16_avx2(__m256i *in, __m256i *out) {
+  __m256i t[16];
+
+#define LOADL(idx)                                                            \
+  t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \
+  t[idx] = _mm256_inserti128_si256(                                           \
+      t[idx], _mm_load_si128((__m128i const *)&in[(idx) + 8]), 1);
+
+#define LOADR(idx)                                                           \
+  t[8 + (idx)] =                                                             \
+      _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \
+  t[8 + (idx)] = _mm256_inserti128_si256(                                    \
+      t[8 + (idx)], _mm_load_si128((__m128i const *)&in[(idx) + 8] + 1), 1);
+
+  // load left 8x16
+  LOADL(0)
+  LOADL(1)
+  LOADL(2)
+  LOADL(3)
+  LOADL(4)
+  LOADL(5)
+  LOADL(6)
+  LOADL(7)
+
+  // load right 8x16
+  LOADR(0)
+  LOADR(1)
+  LOADR(2)
+  LOADR(3)
+  LOADR(4)
+  LOADR(5)
+  LOADR(6)
+  LOADR(7)
+
+  // get the top 16x8 result
+  transpose2_8x8_avx2(t, out);
+  // get the bottom 16x8 result
+  transpose2_8x8_avx2(&t[8], &out[8]);
+}
+
+void vpx_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest,
+                                int stride) {
+  int i;
+  __m256i in[16];
+
+  // Load 16x16 values
+  idct_load16x16(input, in, 16);
+
+  transpose_16bit_16x16_avx2(in, in);
+  idct16_16col(in, in);
+
+  transpose_16bit_16x16_avx2(in, in);
+  idct16_16col(in, in);
+
+  for (i = 0; i < 16; ++i) {
+    write_buffer_16x1(dest + i * stride, in[i]);
+  }
+}
+
+// Only do addition and subtraction butterfly, size = 16, 32
+static INLINE void add_sub_butterfly_avx2(__m256i *in, __m256i *out, int size) {
+  int i = 0;
+  const int num = size >> 1;
+  const int bound = size - 1;
+  while (i < num) {
+    out[i] = _mm256_add_epi16(in[i], in[bound - i]);
+    out[bound - i] = _mm256_sub_epi16(in[i], in[bound - i]);
+    i++;
+  }
+}
+
+// For each 16x32 block __m256i in[32],
+// Input with index, 0, 4, 8, 12, 16, 20, 24, 28
+// output pixels: 0-7 in __m256i out[32]
+static INLINE void idct32_1024_16x32_quarter_1(__m256i *in, __m256i *out) {
+  __m256i step1[8], step2[8];
+
+  // stage 3
+  butterfly16(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
+  butterfly16(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5], &step1[6]);
+
+  // stage 4
+  butterfly16(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1], &step2[0]);
+  butterfly16(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2], &step2[3]);
+  step2[4] = _mm256_add_epi16(step1[4], step1[5]);
+  step2[5] = _mm256_sub_epi16(step1[4], step1[5]);
+  step2[6] = _mm256_sub_epi16(step1[7], step1[6]);
+  step2[7] = _mm256_add_epi16(step1[7], step1[6]);
+
+  // stage 5
+  step1[0] = _mm256_add_epi16(step2[0], step2[3]);
+  step1[1] = _mm256_add_epi16(step2[1], step2[2]);
+  step1[2] = _mm256_sub_epi16(step2[1], step2[2]);
+  step1[3] = _mm256_sub_epi16(step2[0], step2[3]);
+  step1[4] = step2[4];
+  butterfly16(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5],
+              &step1[6]);
+  step1[7] = step2[7];
+
+  // stage 6
+  out[0] = _mm256_add_epi16(step1[0], step1[7]);
+  out[1] = _mm256_add_epi16(step1[1], step1[6]);
+  out[2] = _mm256_add_epi16(step1[2], step1[5]);
+  out[3] = _mm256_add_epi16(step1[3], step1[4]);
+  out[4] = _mm256_sub_epi16(step1[3], step1[4]);
+  out[5] = _mm256_sub_epi16(step1[2], step1[5]);
+  out[6] = _mm256_sub_epi16(step1[1], step1[6]);
+  out[7] = _mm256_sub_epi16(step1[0], step1[7]);
+}
+
+static INLINE void idct32_16x32_quarter_2_stage_4_to_6(__m256i *step1,
+                                                       __m256i *out) {
+  __m256i step2[32];
+
+  // stage 4
+  step2[8] = step1[8];
+  step2[15] = step1[15];
+  butterfly16(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
+              &step2[14]);
+  butterfly16(step1[13], step1[10], -cospi_8_64, cospi_24_64, &step2[10],
+              &step2[13]);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+
+  // stage 5
+  step1[8] = _mm256_add_epi16(step2[8], step2[11]);
+  step1[9] = _mm256_add_epi16(step2[9], step2[10]);
+  step1[10] = _mm256_sub_epi16(step2[9], step2[10]);
+  step1[11] = _mm256_sub_epi16(step2[8], step2[11]);
+  step1[12] = _mm256_sub_epi16(step2[15], step2[12]);
+  step1[13] = _mm256_sub_epi16(step2[14], step2[13]);
+  step1[14] = _mm256_add_epi16(step2[14], step2[13]);
+  step1[15] = _mm256_add_epi16(step2[15], step2[12]);
+
+  // stage 6
+  out[8] = step1[8];
+  out[9] = step1[9];
+  butterfly16(step1[13], step1[10], cospi_16_64, cospi_16_64, &out[10],
+              &out[13]);
+  butterfly16(step1[12], step1[11], cospi_16_64, cospi_16_64, &out[11],
+              &out[12]);
+  out[14] = step1[14];
+  out[15] = step1[15];
+}
+
+// For each 16x32 block __m256i in[32],
+// Input with index, 2, 6, 10, 14, 18, 22, 26, 30
+// output pixels: 8-15 in __m256i out[32]
+static INLINE void idct32_1024_16x32_quarter_2(__m256i *in, __m256i *out) {
+  __m256i step1[16], step2[16];
+
+  // stage 2
+  butterfly16(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8], &step2[15]);
+  butterfly16(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9], &step2[14]);
+  butterfly16(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10], &step2[13]);
+  butterfly16(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11], &step2[12]);
+
+  // stage 3
+  step1[8] = _mm256_add_epi16(step2[8], step2[9]);
+  step1[9] = _mm256_sub_epi16(step2[8], step2[9]);
+  step1[10] = _mm256_sub_epi16(step2[11], step2[10]);
+  step1[11] = _mm256_add_epi16(step2[11], step2[10]);
+  step1[12] = _mm256_add_epi16(step2[12], step2[13]);
+  step1[13] = _mm256_sub_epi16(step2[12], step2[13]);
+  step1[14] = _mm256_sub_epi16(step2[15], step2[14]);
+  step1[15] = _mm256_add_epi16(step2[15], step2[14]);
+
+  idct32_16x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void idct32_16x32_quarter_3_4_stage_4_to_7(__m256i *step1,
+                                                         __m256i *out) {
+  __m256i step2[32];
+
+  // stage 4
+  step2[16] = _mm256_add_epi16(step1[16], step1[19]);
+  step2[17] = _mm256_add_epi16(step1[17], step1[18]);
+  step2[18] = _mm256_sub_epi16(step1[17], step1[18]);
+  step2[19] = _mm256_sub_epi16(step1[16], step1[19]);
+  step2[20] = _mm256_sub_epi16(step1[23], step1[20]);
+  step2[21] = _mm256_sub_epi16(step1[22], step1[21]);
+  step2[22] = _mm256_add_epi16(step1[22], step1[21]);
+  step2[23] = _mm256_add_epi16(step1[23], step1[20]);
+
+  step2[24] = _mm256_add_epi16(step1[24], step1[27]);
+  step2[25] = _mm256_add_epi16(step1[25], step1[26]);
+  step2[26] = _mm256_sub_epi16(step1[25], step1[26]);
+  step2[27] = _mm256_sub_epi16(step1[24], step1[27]);
+  step2[28] = _mm256_sub_epi16(step1[31], step1[28]);
+  step2[29] = _mm256_sub_epi16(step1[30], step1[29]);
+  step2[30] = _mm256_add_epi16(step1[29], step1[30]);
+  step2[31] = _mm256_add_epi16(step1[28], step1[31]);
+
+  // stage 5
+  step1[16] = step2[16];
+  step1[17] = step2[17];
+  butterfly16(step2[29], step2[18], cospi_24_64, cospi_8_64, &step1[18],
+              &step1[29]);
+  butterfly16(step2[28], step2[19], cospi_24_64, cospi_8_64, &step1[19],
+              &step1[28]);
+  butterfly16(step2[27], step2[20], -cospi_8_64, cospi_24_64, &step1[20],
+              &step1[27]);
+  butterfly16(step2[26], step2[21], -cospi_8_64, cospi_24_64, &step1[21],
+              &step1[26]);
+  step1[22] = step2[22];
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[25] = step2[25];
+  step1[30] = step2[30];
+  step1[31] = step2[31];
+
+  // stage 6
+  out[16] = _mm256_add_epi16(step1[16], step1[23]);
+  out[17] = _mm256_add_epi16(step1[17], step1[22]);
+  out[18] = _mm256_add_epi16(step1[18], step1[21]);
+  out[19] = _mm256_add_epi16(step1[19], step1[20]);
+  step2[20] = _mm256_sub_epi16(step1[19], step1[20]);
+  step2[21] = _mm256_sub_epi16(step1[18], step1[21]);
+  step2[22] = _mm256_sub_epi16(step1[17], step1[22]);
+  step2[23] = _mm256_sub_epi16(step1[16], step1[23]);
+
+  step2[24] = _mm256_sub_epi16(step1[31], step1[24]);
+  step2[25] = _mm256_sub_epi16(step1[30], step1[25]);
+  step2[26] = _mm256_sub_epi16(step1[29], step1[26]);
+  step2[27] = _mm256_sub_epi16(step1[28], step1[27]);
+  out[28] = _mm256_add_epi16(step1[27], step1[28]);
+  out[29] = _mm256_add_epi16(step1[26], step1[29]);
+  out[30] = _mm256_add_epi16(step1[25], step1[30]);
+  out[31] = _mm256_add_epi16(step1[24], step1[31]);
+
+  // stage 7
+  butterfly16(step2[27], step2[20], cospi_16_64, cospi_16_64, &out[20],
+              &out[27]);
+  butterfly16(step2[26], step2[21], cospi_16_64, cospi_16_64, &out[21],
+              &out[26]);
+  butterfly16(step2[25], step2[22], cospi_16_64, cospi_16_64, &out[22],
+              &out[25]);
+  butterfly16(step2[24], step2[23], cospi_16_64, cospi_16_64, &out[23],
+              &out[24]);
+}
+
+static INLINE void idct32_1024_16x32_quarter_1_2(__m256i *in, __m256i *out) {
+  __m256i temp[16];
+
+  // For each 16x32 block __m256i in[32],
+  // Input with index, 0, 4, 8, 12, 16, 20, 24, 28
+  // output pixels: 0-7 in __m256i out[32]
+  idct32_1024_16x32_quarter_1(in, temp);
+
+  // Input with index, 2, 6, 10, 14, 18, 22, 26, 30
+  // output pixels: 8-15 in __m256i out[32]
+  idct32_1024_16x32_quarter_2(in, temp);
+
+  // stage 7
+  add_sub_butterfly_avx2(temp, out, 16);
+}
+
+// For each 16x32 block __m256i in[32],
+// Input with odd index,
+// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+// output pixels: 16-23, 24-31 in __m256i out[32]
+static INLINE void idct32_1024_16x32_quarter_3_4(__m256i *in, __m256i *out) {
+  __m256i step1[32], step2[32];
+
+  // stage 1
+  butterfly16(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16], &step1[31]);
+  butterfly16(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17], &step1[30]);
+  butterfly16(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18], &step1[29]);
+  butterfly16(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19], &step1[28]);
+
+  butterfly16(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20], &step1[27]);
+  butterfly16(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21], &step1[26]);
+
+  butterfly16(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22], &step1[25]);
+  butterfly16(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23], &step1[24]);
+
+  // stage 2
+  step2[16] = _mm256_add_epi16(step1[16], step1[17]);
+  step2[17] = _mm256_sub_epi16(step1[16], step1[17]);
+  step2[18] = _mm256_sub_epi16(step1[19], step1[18]);
+  step2[19] = _mm256_add_epi16(step1[19], step1[18]);
+  step2[20] = _mm256_add_epi16(step1[20], step1[21]);
+  step2[21] = _mm256_sub_epi16(step1[20], step1[21]);
+  step2[22] = _mm256_sub_epi16(step1[23], step1[22]);
+  step2[23] = _mm256_add_epi16(step1[23], step1[22]);
+
+  step2[24] = _mm256_add_epi16(step1[24], step1[25]);
+  step2[25] = _mm256_sub_epi16(step1[24], step1[25]);
+  step2[26] = _mm256_sub_epi16(step1[27], step1[26]);
+  step2[27] = _mm256_add_epi16(step1[27], step1[26]);
+  step2[28] = _mm256_add_epi16(step1[28], step1[29]);
+  step2[29] = _mm256_sub_epi16(step1[28], step1[29]);
+  step2[30] = _mm256_sub_epi16(step1[31], step1[30]);
+  step2[31] = _mm256_add_epi16(step1[31], step1[30]);
+
+  // stage 3
+  step1[16] = step2[16];
+  step1[31] = step2[31];
+  butterfly16(step2[30], step2[17], cospi_28_64, cospi_4_64, &step1[17],
+              &step1[30]);
+  butterfly16(step2[29], step2[18], -cospi_4_64, cospi_28_64, &step1[18],
+              &step1[29]);
+  step1[19] = step2[19];
+  step1[20] = step2[20];
+  butterfly16(step2[26], step2[21], cospi_12_64, cospi_20_64, &step1[21],
+              &step1[26]);
+  butterfly16(step2[25], step2[22], -cospi_20_64, cospi_12_64, &step1[22],
+              &step1[25]);
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[27] = step2[27];
+  step1[28] = step2[28];
+
+  idct32_16x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+static INLINE void idct32_1024_16x32(__m256i *in, __m256i *out) {
+  __m256i temp[32];
+
+  // For each 16x32 block __m256i in[32],
+  // Input with index, 0, 4, 8, 12, 16, 20, 24, 28
+  // output pixels: 0-7 in __m256i out[32]
+  // AND
+  // Input with index, 2, 6, 10, 14, 18, 22, 26, 30
+  // output pixels: 8-15 in __m256i out[32]
+  idct32_1024_16x32_quarter_1_2(in, temp);
+
+  // For each 16x32 block __m256i in[32],
+  // Input with odd index,
+  // 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+  // output pixels: 16-23, 24-31 in __m256i out[32]
+  idct32_1024_16x32_quarter_3_4(in, temp);
+
+  // final stage
+  add_sub_butterfly_avx2(temp, out, 32);
+}
+
+void vpx_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest,
+                                 int stride) {
+  __m256i l[32], r[32], out[32], *in;
+  int i;
+
+  in = l;
+
+  for (i = 0; i < 2; i++) {
+    idct_load16x16(input, in, 32);
+    transpose_16bit_16x16_avx2(in, in);
+
+    idct_load16x16(input + 16, in + 16, 32);
+    transpose_16bit_16x16_avx2(in + 16, in + 16);
+    idct32_1024_16x32(in, in);
+
+    in = r;
+    input += 32 << 4;
+  }
+
+  for (i = 0; i < 32; i += 16) {
+    transpose_16bit_16x16_avx2(l + i, out);
+    transpose_16bit_16x16_avx2(r + i, out + 16);
+    idct32_1024_16x32(out, out);
+
+    store_buffer_16x32(out, dest, stride);
+    dest += 16;
+  }
+}
+
+// Case when only upper-left 16x16 has non-zero coeff
+void vpx_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest,
+                                int stride) {
+  __m256i in[32], io[32], out[32];
+  int i;
+
+  for (i = 16; i < 32; i++) {
+    in[i] = _mm256_setzero_si256();
+  }
+
+  // rows
+  idct_load16x16(input, in, 32);
+  transpose_16bit_16x16_avx2(in, in);
+  idct32_1024_16x32(in, io);
+
+  // columns
+  for (i = 0; i < 32; i += 16) {
+    transpose_16bit_16x16_avx2(io + i, in);
+    idct32_1024_16x32(in, out);
+
+    store_buffer_16x32(out, dest, stride);
+    dest += 16;
+  }
+}
diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c
index 4b02da9666..f42b3df849 100644
--- a/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/vpx_dsp/x86/inv_txfm_sse2.c
@@ -243,7 +243,7 @@ void iadst8_sse2(__m128i *const in) {
   const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
-  const __m128i kZero = _mm_set1_epi16(0);
+  const __m128i kZero = _mm_setzero_si128();
   __m128i s[8], u[16], v[8], w[16];
 
   // transpose
@@ -546,7 +546,7 @@ void vpx_iadst16_8col_sse2(__m128i *const in) {
   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-  const __m128i kZero = _mm_set1_epi16(0);
+  const __m128i kZero = _mm_setzero_si128();
 
   u[0] = _mm_unpacklo_epi16(in[15], in[0]);
   u[1] = _mm_unpackhi_epi16(in[15], in[0]);
diff --git a/vpx_dsp/x86/loopfilter_avx2.c b/vpx_dsp/x86/loopfilter_avx2.c
index be391992af..a58fb65539 100644
--- a/vpx_dsp/x86/loopfilter_avx2.c
+++ b/vpx_dsp/x86/loopfilter_avx2.c
@@ -18,7 +18,7 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int pitch,
                                 const unsigned char *limit,
                                 const unsigned char *thresh) {
   __m128i mask, hev, flat, flat2;
-  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i zero = _mm_setzero_si128();
   const __m128i one = _mm_set1_epi8(1);
   __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
   __m128i abs_p1p0;
@@ -372,7 +372,7 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int pitch,
                                      const unsigned char *limit,
                                      const unsigned char *thresh) {
   __m128i mask, hev, flat, flat2;
-  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i zero = _mm_setzero_si128();
   const __m128i one = _mm_set1_epi8(1);
   __m128i p7, p6, p5;
   __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
diff --git a/vpx_dsp/x86/loopfilter_sse2.c b/vpx_dsp/x86/loopfilter_sse2.c
index b6ff24834b..6ea34cdd16 100644
--- a/vpx_dsp/x86/loopfilter_sse2.c
+++ b/vpx_dsp/x86/loopfilter_sse2.c
@@ -106,7 +106,7 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) {
 
 void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit,
                                const uint8_t *limit, const uint8_t *thresh) {
-  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i zero = _mm_setzero_si128();
   const __m128i limit_v =
       _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit),
                          _mm_loadl_epi64((const __m128i *)limit));
@@ -140,7 +140,7 @@ void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit,
 
 void vpx_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit,
                              const uint8_t *limit, const uint8_t *thresh) {
-  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i zero = _mm_setzero_si128();
   const __m128i limit_v =
       _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit),
                          _mm_loadl_epi64((const __m128i *)limit));
@@ -211,28 +211,28 @@ void vpx_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit,
   // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
   ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0);
 
-  storeu_uint32(s + 0 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
+  storeu_int32(s + 0 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
   ps1ps0 = _mm_srli_si128(ps1ps0, 4);
-  storeu_uint32(s + 1 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
+  storeu_int32(s + 1 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
   ps1ps0 = _mm_srli_si128(ps1ps0, 4);
-  storeu_uint32(s + 2 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
+  storeu_int32(s + 2 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
   ps1ps0 = _mm_srli_si128(ps1ps0, 4);
-  storeu_uint32(s + 3 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
+  storeu_int32(s + 3 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
 
-  storeu_uint32(s + 4 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
+  storeu_int32(s + 4 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
   qs1qs0 = _mm_srli_si128(qs1qs0, 4);
-  storeu_uint32(s + 5 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
+  storeu_int32(s + 5 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
   qs1qs0 = _mm_srli_si128(qs1qs0, 4);
-  storeu_uint32(s + 6 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
+  storeu_int32(s + 6 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
   qs1qs0 = _mm_srli_si128(qs1qs0, 4);
-  storeu_uint32(s + 7 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
+  storeu_int32(s + 7 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
 }
 
 void vpx_lpf_horizontal_16_sse2(unsigned char *s, int pitch,
                                 const unsigned char *blimit,
                                 const unsigned char *limit,
                                 const unsigned char *thresh) {
-  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i zero = _mm_setzero_si128();
   const __m128i one = _mm_set1_epi8(1);
   const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);
   const __m128i limit_v = _mm_load_si128((const __m128i *)limit);
@@ -594,7 +594,7 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int pitch,
                                      const unsigned char *blimit,
                                      const unsigned char *limit,
                                      const unsigned char *thresh) {
-  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i zero = _mm_setzero_si128();
   const __m128i one = _mm_set1_epi8(1);
   const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);
   const __m128i limit_v = _mm_load_si128((const __m128i *)limit);
@@ -932,7 +932,7 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int pitch,
   DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
   DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
   DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
-  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i zero = _mm_setzero_si128();
   const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);
   const __m128i limit_v = _mm_load_si128((const __m128i *)limit);
   const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh);
@@ -1152,7 +1152,7 @@ void vpx_lpf_horizontal_8_dual_sse2(
   DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
   DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
   DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
-  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i zero = _mm_setzero_si128();
   const __m128i blimit =
       _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)blimit0),
                          _mm_load_si128((const __m128i *)blimit1));
@@ -1406,7 +1406,7 @@ void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int pitch,
   const __m128i thresh =
       _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)thresh0),
                          _mm_load_si128((const __m128i *)thresh1));
-  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i zero = _mm_setzero_si128();
   __m128i p3, p2, p1, p0, q0, q1, q2, q3;
   __m128i mask, hev, flat;
 
diff --git a/vpx_dsp/x86/mem_sse2.h b/vpx_dsp/x86/mem_sse2.h
index 258ab38e60..031f361a41 100644
--- a/vpx_dsp/x86/mem_sse2.h
+++ b/vpx_dsp/x86/mem_sse2.h
@@ -16,24 +16,24 @@
 
 #include "./vpx_config.h"
 
-static INLINE void storeu_uint32(void *dst, uint32_t v) {
+static INLINE void storeu_int32(void *dst, int32_t v) {
   memcpy(dst, &v, sizeof(v));
 }
 
-static INLINE uint32_t loadu_uint32(const void *src) {
-  uint32_t v;
+static INLINE int32_t loadu_int32(const void *src) {
+  int32_t v;
   memcpy(&v, src, sizeof(v));
   return v;
 }
 
 static INLINE __m128i load_unaligned_u32(const void *a) {
-  uint32_t val;
+  int val;
   memcpy(&val, a, sizeof(val));
   return _mm_cvtsi32_si128(val);
 }
 
 static INLINE void store_unaligned_u32(void *const a, const __m128i v) {
-  const uint32_t val = _mm_cvtsi128_si32(v);
+  const int val = _mm_cvtsi128_si32(v);
   memcpy(a, &val, sizeof(val));
 }
 
diff --git a/vpx_dsp/x86/post_proc_sse2.c b/vpx_dsp/x86/post_proc_sse2.c
index d1029afc4f..119fa7cd1a 100644
--- a/vpx_dsp/x86/post_proc_sse2.c
+++ b/vpx_dsp/x86/post_proc_sse2.c
@@ -36,7 +36,7 @@ void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows,
     __m128i s = _mm_loadl_epi64((__m128i *)dst);
     __m128i sum, sumsq_0, sumsq_1;
     __m128i tmp_0, tmp_1;
-    __m128i below_context;
+    __m128i below_context = _mm_setzero_si128();
 
     s = _mm_unpacklo_epi8(s, zero);
 
diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c
index 0a91d36eaf..5ff5abc110 100644
--- a/vpx_dsp/x86/quantize_avx.c
+++ b/vpx_dsp/x86/quantize_avx.c
@@ -19,17 +19,18 @@
 #include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
 #include "vpx_dsp/x86/quantize_sse2.h"
 #include "vpx_dsp/x86/quantize_ssse3.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
 
 void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                        int skip_block, const int16_t *zbin_ptr,
-                        const int16_t *round_ptr, const int16_t *quant_ptr,
-                        const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-                        tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                        uint16_t *eob_ptr, const int16_t *scan,
-                        const int16_t *iscan) {
+                        const struct macroblock_plane *const mb_plane,
+                        tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                        const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                        const struct ScanOrder *const scan_order) {
   const __m128i zero = _mm_setzero_si128();
   const __m256i big_zero = _mm256_setzero_si256();
   int index;
+  const int16_t *iscan = scan_order->iscan;
 
   __m128i zbin, round, quant, dequant, shift;
   __m128i coeff0, coeff1;
@@ -38,14 +39,9 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   __m128i all_zero;
   __m128i eob = zero, eob0;
 
-  (void)scan;
-  (void)skip_block;
-  assert(!skip_block);
-
   *eob_ptr = 0;
 
-  load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
-                dequant_ptr, &dequant, quant_shift_ptr, &shift);
+  load_b_values(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift);
 
   // Do DC and first 15 AC.
   coeff0 = load_tran_low(coeff_ptr);
@@ -95,8 +91,7 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     dequant = _mm_unpackhi_epi64(dequant, dequant);
     calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);
 
-    eob =
-        scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+    eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
   }
 
   // AC only loop.
@@ -136,26 +131,22 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
     calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);
 
-    eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
-                        zero);
+    eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
     eob = _mm_max_epi16(eob, eob0);
   }
 
   *eob_ptr = accumulate_eob(eob);
 }
 
-void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                              int skip_block, const int16_t *zbin_ptr,
-                              const int16_t *round_ptr,
-                              const int16_t *quant_ptr,
-                              const int16_t *quant_shift_ptr,
+void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr,
+                              const struct macroblock_plane *const mb_plane,
                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                              const int16_t *scan, const int16_t *iscan) {
+                              const struct ScanOrder *const scan_order) {
   const __m128i zero = _mm_setzero_si128();
-  const __m128i one = _mm_set1_epi16(1);
   const __m256i big_zero = _mm256_setzero_si256();
   int index;
+  const int16_t *iscan = scan_order->iscan;
 
   __m128i zbin, round, quant, dequant, shift;
   __m128i coeff0, coeff1;
@@ -164,29 +155,8 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   __m128i all_zero;
   __m128i eob = zero, eob0;
 
-  (void)scan;
-  (void)n_coeffs;
-  (void)skip_block;
-  assert(!skip_block);
-
-  // Setup global values.
-  // The 32x32 halves zbin and round.
-  zbin = _mm_load_si128((const __m128i *)zbin_ptr);
-  // Shift with rounding.
-  zbin = _mm_add_epi16(zbin, one);
-  zbin = _mm_srli_epi16(zbin, 1);
-  // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so
-  // it is a strict "greater" comparison.
-  zbin = _mm_sub_epi16(zbin, one);
-
-  round = _mm_load_si128((const __m128i *)round_ptr);
-  round = _mm_add_epi16(round, one);
-  round = _mm_srli_epi16(round, 1);
-
-  quant = _mm_load_si128((const __m128i *)quant_ptr);
-  dequant = _mm_load_si128((const __m128i *)dequant_ptr);
-  shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
-  shift = _mm_slli_epi16(shift, 1);
+  load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant,
+                     &shift);
 
   // Do DC and first 15 AC.
   coeff0 = load_tran_low(coeff_ptr);
@@ -234,8 +204,7 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     dequant = _mm_unpackhi_epi64(dequant, dequant);
     calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8);
 
-    eob =
-        scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+    eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
   }
 
   // AC only loop.
@@ -277,8 +246,7 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero,
                                       dqcoeff_ptr + index + 8);
 
-    eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
-                        zero);
+    eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
     eob = _mm_max_epi16(eob, eob0);
   }
 
diff --git a/vpx_dsp/x86/quantize_avx2.c b/vpx_dsp/x86/quantize_avx2.c
new file mode 100644
index 0000000000..d4872f6bca
--- /dev/null
+++ b/vpx_dsp/x86/quantize_avx2.c
@@ -0,0 +1,290 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
+
+static VPX_FORCE_INLINE void load_b_values_avx2(
+    const struct macroblock_plane *mb_plane, __m256i *zbin, __m256i *round,
+    __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant,
+    __m256i *shift, int log_scale) {
+  *zbin =
+      _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->zbin));
+  *zbin = _mm256_permute4x64_epi64(*zbin, 0x54);
+  if (log_scale > 0) {
+    const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1)));
+    *zbin = _mm256_add_epi16(*zbin, rnd);
+    *zbin = _mm256_srai_epi16(*zbin, log_scale);
+  }
+  // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when
+  // calculating the zbin mask. (See quantize_b_logscale{0,1,2}_16)
+  *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1));
+
+  *round =
+      _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->round));
+  *round = _mm256_permute4x64_epi64(*round, 0x54);
+  if (log_scale > 0) {
+    const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1)));
+    *round = _mm256_add_epi16(*round, rnd);
+    *round = _mm256_srai_epi16(*round, log_scale);
+  }
+
+  *quant =
+      _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->quant));
+  *quant = _mm256_permute4x64_epi64(*quant, 0x54);
+  *dequant =
+      _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr));
+  *dequant = _mm256_permute4x64_epi64(*dequant, 0x54);
+  *shift = _mm256_castsi128_si256(
+      _mm_load_si128((const __m128i *)mb_plane->quant_shift));
+  *shift = _mm256_permute4x64_epi64(*shift, 0x54);
+}
+
+static VPX_FORCE_INLINE __m256i
+load_coefficients_avx2(const tran_low_t *coeff_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  // typedef int32_t tran_low_t;
+  const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+  const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(coeff_ptr + 8));
+  return _mm256_packs_epi32(coeff1, coeff2);
+#else
+  // typedef int16_t tran_low_t;
+  return _mm256_loadu_si256((const __m256i *)coeff_ptr);
+#endif
+}
+
+static VPX_FORCE_INLINE void store_coefficients_avx2(__m256i coeff_vals,
+                                                     tran_low_t *coeff_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  // typedef int32_t tran_low_t;
+  __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15);
+  __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign);
+  __m256i coeff_vals_hi = _mm256_unpackhi_epi16(coeff_vals, coeff_sign);
+  _mm256_storeu_si256((__m256i *)coeff_ptr, coeff_vals_lo);
+  _mm256_storeu_si256((__m256i *)(coeff_ptr + 8), coeff_vals_hi);
+#else
+  // typedef int16_t tran_low_t;
+  _mm256_storeu_si256((__m256i *)coeff_ptr, coeff_vals);
+#endif
+}
+
+static VPX_FORCE_INLINE __m256i
+quantize_b_16(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+              tran_low_t *dqcoeff_ptr, __m256i *v_quant, __m256i *v_dequant,
+              __m256i *v_round, __m256i *v_zbin, __m256i *v_quant_shift) {
+  const __m256i v_coeff = load_coefficients_avx2(coeff_ptr);
+  const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff);
+  const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin);
+
+  if (_mm256_movemask_epi8(v_zbin_mask) == 0) {
+    _mm256_storeu_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256());
+    _mm256_storeu_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256());
+#if CONFIG_VP9_HIGHBITDEPTH
+    _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256());
+    _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256());
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    return _mm256_setzero_si256();
+  }
+  {
+    // tmp = v_zbin_mask ? (int64_t)abs_coeff + log_scaled_round : 0
+    const __m256i v_tmp_rnd =
+        _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask);
+
+    const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant);
+    const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd);
+    const __m256i v_tmp32 = _mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift);
+    const __m256i v_nz_mask =
+        _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256());
+    const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff);
+#if CONFIG_VP9_HIGHBITDEPTH
+    const __m256i low = _mm256_mullo_epi16(v_qcoeff, *v_dequant);
+    const __m256i high = _mm256_mulhi_epi16(v_qcoeff, *v_dequant);
+
+    const __m256i v_dqcoeff_lo = _mm256_unpacklo_epi16(low, high);
+    const __m256i v_dqcoeff_hi = _mm256_unpackhi_epi16(low, high);
+#else
+    const __m256i v_dqcoeff = _mm256_mullo_epi16(v_qcoeff, *v_dequant);
+#endif
+
+    store_coefficients_avx2(v_qcoeff, qcoeff_ptr);
+#if CONFIG_VP9_HIGHBITDEPTH
+    _mm256_storeu_si256((__m256i *)(dqcoeff_ptr), v_dqcoeff_lo);
+    _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + 8), v_dqcoeff_hi);
+#else
+    store_coefficients_avx2(v_dqcoeff, dqcoeff_ptr);
+#endif
+    return v_nz_mask;
+  }
+}
+
+static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan,
+                                                 __m256i v_eobmax,
+                                                 __m256i v_mask) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const __m256i v_iscan = _mm256_permute4x64_epi64(
+      _mm256_loadu_si256((const __m256i *)iscan), 0xD8);
+#else
+  const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan);
+#endif
+  const __m256i v_nz_iscan = _mm256_and_si256(v_iscan, v_mask);
+  return _mm256_max_epi16(v_eobmax, v_nz_iscan);
+}
+
+static VPX_FORCE_INLINE int16_t accumulate_eob256(__m256i eob256) {
+  const __m128i eob_lo = _mm256_castsi256_si128(eob256);
+  const __m128i eob_hi = _mm256_extractf128_si256(eob256, 1);
+  __m128i eob = _mm_max_epi16(eob_lo, eob_hi);
+  __m128i eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+  eob = _mm_max_epi16(eob, eob_shuffled);
+  eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+  eob = _mm_max_epi16(eob, eob_shuffled);
+  eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+  eob = _mm_max_epi16(eob, eob_shuffled);
+  return _mm_extract_epi16(eob, 1);
+}
+
+void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                         const struct macroblock_plane *const mb_plane,
+                         tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                         const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                         const struct ScanOrder *const scan_order) {
+  __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift, v_nz_mask;
+  __m256i v_eobmax = _mm256_setzero_si256();
+  intptr_t count;
+  const int16_t *iscan = scan_order->iscan;
+
+  load_b_values_avx2(mb_plane, &v_zbin, &v_round, &v_quant, dequant_ptr,
+                     &v_dequant, &v_quant_shift, 0);
+  // Do DC and first 15 AC.
+  v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant,
+                            &v_dequant, &v_round, &v_zbin, &v_quant_shift);
+
+  v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask);
+
+  v_round = _mm256_unpackhi_epi64(v_round, v_round);
+  v_quant = _mm256_unpackhi_epi64(v_quant, v_quant);
+  v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant);
+  v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift);
+  v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin);
+
+  for (count = n_coeffs - 16; count > 0; count -= 16) {
+    coeff_ptr += 16;
+    qcoeff_ptr += 16;
+    dqcoeff_ptr += 16;
+    iscan += 16;
+    v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant,
+                              &v_dequant, &v_round, &v_zbin, &v_quant_shift);
+
+    v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask);
+  }
+
+  *eob_ptr = accumulate_eob256(v_eobmax);
+}
+
+static VPX_FORCE_INLINE __m256i quantize_b_32x32_16(
+    const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *iscan, __m256i *v_quant,
+    __m256i *v_dequant, __m256i *v_round, __m256i *v_zbin,
+    __m256i *v_quant_shift, __m256i *v_eobmax) {
+  const __m256i v_coeff = load_coefficients_avx2(coeff_ptr);
+  const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff);
+  const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin);
+
+  if (_mm256_movemask_epi8(v_zbin_mask) == 0) {
+    _mm256_store_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256());
+    _mm256_store_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256());
+#if CONFIG_VP9_HIGHBITDEPTH
+    _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256());
+    _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256());
+#endif
+    return *v_eobmax;
+  }
+  {
+    // tmp = v_zbin_mask ? (int64_t)abs_coeff + round : 0
+    const __m256i v_tmp_rnd =
+        _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask);
+    //  tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+    //                 quant_shift_ptr[rc != 0]) >> 15);
+    const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant);
+    const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd);
+    const __m256i v_tmp32_hi =
+        _mm256_slli_epi16(_mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift), 1);
+    const __m256i v_tmp32_lo =
+        _mm256_srli_epi16(_mm256_mullo_epi16(v_tmp32_b, *v_quant_shift), 15);
+    const __m256i v_tmp32 = _mm256_or_si256(v_tmp32_hi, v_tmp32_lo);
+    const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff);
+    const __m256i v_sign_lo =
+        _mm256_unpacklo_epi16(_mm256_setzero_si256(), v_coeff);
+    const __m256i v_sign_hi =
+        _mm256_unpackhi_epi16(_mm256_setzero_si256(), v_coeff);
+    const __m256i low = _mm256_mullo_epi16(v_tmp32, *v_dequant);
+    const __m256i high = _mm256_mulhi_epi16(v_tmp32, *v_dequant);
+    const __m256i v_dqcoeff_lo = _mm256_sign_epi32(
+        _mm256_srli_epi32(_mm256_unpacklo_epi16(low, high), 1), v_sign_lo);
+    const __m256i v_dqcoeff_hi = _mm256_sign_epi32(
+        _mm256_srli_epi32(_mm256_unpackhi_epi16(low, high), 1), v_sign_hi);
+    const __m256i v_nz_mask =
+        _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256());
+
+    store_coefficients_avx2(v_qcoeff, qcoeff_ptr);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+    _mm256_storeu_si256((__m256i *)(dqcoeff_ptr), v_dqcoeff_lo);
+    _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + 8), v_dqcoeff_hi);
+#else
+    store_coefficients_avx2(_mm256_packs_epi32(v_dqcoeff_lo, v_dqcoeff_hi),
+                            dqcoeff_ptr);
+#endif
+
+    return get_max_lane_eob(iscan, *v_eobmax, v_nz_mask);
+  }
+}
+
+void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr,
+                               const struct macroblock_plane *const mb_plane,
+                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                               const struct ScanOrder *const scan_order) {
+  __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift;
+  __m256i v_eobmax = _mm256_setzero_si256();
+  intptr_t count;
+  const int16_t *iscan = scan_order->iscan;
+
+  load_b_values_avx2(mb_plane, &v_zbin, &v_round, &v_quant, dequant_ptr,
+                     &v_dequant, &v_quant_shift, 1);
+
+  // Do DC and first 15 AC.
+  v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan,
+                                 &v_quant, &v_dequant, &v_round, &v_zbin,
+                                 &v_quant_shift, &v_eobmax);
+
+  v_round = _mm256_unpackhi_epi64(v_round, v_round);
+  v_quant = _mm256_unpackhi_epi64(v_quant, v_quant);
+  v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant);
+  v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift);
+  v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin);
+
+  for (count = (32 * 32) - 16; count > 0; count -= 16) {
+    coeff_ptr += 16;
+    qcoeff_ptr += 16;
+    dqcoeff_ptr += 16;
+    iscan += 16;
+    v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan,
+                                   &v_quant, &v_dequant, &v_round, &v_zbin,
+                                   &v_quant_shift, &v_eobmax);
+  }
+
+  *eob_ptr = accumulate_eob256(v_eobmax);
+}
diff --git a/vpx_dsp/x86/quantize_sse2.c b/vpx_dsp/x86/quantize_sse2.c
index e38a4059ab..64838eaa7d 100644
--- a/vpx_dsp/x86/quantize_sse2.c
+++ b/vpx_dsp/x86/quantize_sse2.c
@@ -16,16 +16,16 @@
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
 #include "vpx_dsp/x86/quantize_sse2.h"
+#include "vp9/common/vp9_scan.h"
 
 void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                         int skip_block, const int16_t *zbin_ptr,
-                         const int16_t *round_ptr, const int16_t *quant_ptr,
-                         const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                         uint16_t *eob_ptr, const int16_t *scan,
-                         const int16_t *iscan) {
+                         const struct macroblock_plane *const mb_plane,
+                         tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                         const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                         const struct ScanOrder *const scan_order) {
   const __m128i zero = _mm_setzero_si128();
   int index = 16;
+  const int16_t *iscan = scan_order->iscan;
 
   __m128i zbin, round, quant, dequant, shift;
   __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
@@ -33,13 +33,8 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   __m128i cmp_mask0, cmp_mask1;
   __m128i eob, eob0;
 
-  (void)scan;
-  (void)skip_block;
-  assert(!skip_block);
-
   // Setup global values.
-  load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
-                dequant_ptr, &dequant, quant_shift_ptr, &shift);
+  load_b_values(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift);
 
   // Do DC and first 15 AC.
   coeff0 = load_tran_low(coeff_ptr);
@@ -78,7 +73,7 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   dequant = _mm_unpackhi_epi64(dequant, dequant);
   calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);
 
-  eob = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+  eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
 
   // AC only loop.
   while (index < n_coeffs) {
@@ -108,8 +103,7 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
     calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);
 
-    eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
-                        zero);
+    eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
     eob = _mm_max_epi16(eob, eob0);
 
     index += 16;
diff --git a/vpx_dsp/x86/quantize_sse2.h b/vpx_dsp/x86/quantize_sse2.h
index afe2f924b3..82c755a0cf 100644
--- a/vpx_dsp/x86/quantize_sse2.h
+++ b/vpx_dsp/x86/quantize_sse2.h
@@ -15,18 +15,54 @@
 
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
+#include "vp9/encoder/vp9_block.h"
 
-static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin,
-                                 const int16_t *round_ptr, __m128i *round,
-                                 const int16_t *quant_ptr, __m128i *quant,
+static INLINE void load_b_values(const struct macroblock_plane *const mb_plane,
+                                 __m128i *zbin, __m128i *round, __m128i *quant,
                                  const int16_t *dequant_ptr, __m128i *dequant,
-                                 const int16_t *shift_ptr, __m128i *shift) {
-  *zbin = _mm_load_si128((const __m128i *)zbin_ptr);
-  *round = _mm_load_si128((const __m128i *)round_ptr);
-  *quant = _mm_load_si128((const __m128i *)quant_ptr);
+                                 __m128i *shift) {
+  *zbin = _mm_load_si128((const __m128i *)mb_plane->zbin);
+  *round = _mm_load_si128((const __m128i *)mb_plane->round);
+  *quant = _mm_load_si128((const __m128i *)mb_plane->quant);
   *zbin = _mm_sub_epi16(*zbin, _mm_set1_epi16(1));
   *dequant = _mm_load_si128((const __m128i *)dequant_ptr);
-  *shift = _mm_load_si128((const __m128i *)shift_ptr);
+  *shift = _mm_load_si128((const __m128i *)mb_plane->quant_shift);
+}
+
+static INLINE void load_b_values32x32(
+    const struct macroblock_plane *const mb_plane, __m128i *zbin,
+    __m128i *round, __m128i *quant, const int16_t *dequant_ptr,
+    __m128i *dequant, __m128i *shift) {
+  const __m128i one = _mm_set1_epi16(1);
+  // The 32x32 halves zbin and round.
+  *zbin = _mm_load_si128((const __m128i *)mb_plane->zbin);
+  // Shift with rounding.
+  *zbin = _mm_add_epi16(*zbin, one);
+  *zbin = _mm_srli_epi16(*zbin, 1);
+  // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so
+  // it is a strict "greater" comparison.
+  *zbin = _mm_sub_epi16(*zbin, one);
+
+  *round = _mm_load_si128((const __m128i *)mb_plane->round);
+  *round = _mm_add_epi16(*round, one);
+  *round = _mm_srli_epi16(*round, 1);
+
+  *quant = _mm_load_si128((const __m128i *)mb_plane->quant);
+  *dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+  *shift = _mm_load_si128((const __m128i *)mb_plane->quant_shift);
+  // I suspect this is not technically OK because quant_shift can be up
+  // to 1 << 16 and shifting up again will outrange that, but the test is not
+  // comprehensive enough to catch that and "it's been that way forever"
+  *shift = _mm_slli_epi16(*shift, 1);
+}
+
+static INLINE void load_fp_values(const struct macroblock_plane *mb_plane,
+                                  __m128i *round, __m128i *quant,
+                                  const int16_t *dequant_ptr,
+                                  __m128i *dequant) {
+  *round = _mm_load_si128((const __m128i *)mb_plane->round_fp);
+  *quant = _mm_load_si128((const __m128i *)mb_plane->quant_fp);
+  *dequant = _mm_load_si128((const __m128i *)dequant_ptr);
 }
 
 // With ssse3 and later abs() and sign() are preferred.
@@ -62,11 +98,8 @@ static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant,
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 }
 
-// Scan 16 values for eob reference in scan. Use masks (-1) from comparing to
-// zbin to add 1 to the index in 'scan'.
+// Scan 16 values for eob reference in scan.
 static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1,
-                                   const __m128i zbin_mask0,
-                                   const __m128i zbin_mask1,
                                    const int16_t *scan, const int index,
                                    const __m128i zero) {
   const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero);
@@ -74,9 +107,6 @@ static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1,
   __m128i scan0 = _mm_load_si128((const __m128i *)(scan + index));
   __m128i scan1 = _mm_load_si128((const __m128i *)(scan + index + 8));
   __m128i eob0, eob1;
-  // Add one to convert from indices to counts
-  scan0 = _mm_sub_epi16(scan0, zbin_mask0);
-  scan1 = _mm_sub_epi16(scan1, zbin_mask1);
   eob0 = _mm_andnot_si128(zero_coeff0, scan0);
   eob1 = _mm_andnot_si128(zero_coeff1, scan1);
   return _mm_max_epi16(eob0, eob1);
diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c
index fc1d91959f..2c6d851a16 100644
--- a/vpx_dsp/x86/quantize_ssse3.c
+++ b/vpx_dsp/x86/quantize_ssse3.c
@@ -16,16 +16,17 @@
 #include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
 #include "vpx_dsp/x86/quantize_sse2.h"
 #include "vpx_dsp/x86/quantize_ssse3.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
 
 void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                          int skip_block, const int16_t *zbin_ptr,
-                          const int16_t *round_ptr, const int16_t *quant_ptr,
-                          const int16_t *quant_shift_ptr,
+                          const struct macroblock_plane *const mb_plane,
                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                           const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                          const int16_t *scan, const int16_t *iscan) {
+                          const struct ScanOrder *const scan_order) {
   const __m128i zero = _mm_setzero_si128();
   int index = 16;
+  const int16_t *iscan = scan_order->iscan;
 
   __m128i zbin, round, quant, dequant, shift;
   __m128i coeff0, coeff1;
@@ -33,12 +34,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   __m128i cmp_mask0, cmp_mask1;
   __m128i eob, eob0;
 
-  (void)scan;
-  (void)skip_block;
-  assert(!skip_block);
-
-  load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
-                dequant_ptr, &dequant, quant_shift_ptr, &shift);
+  load_b_values(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift);
 
   // Do DC and first 15 AC.
   coeff0 = load_tran_low(coeff_ptr);
@@ -72,7 +68,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   dequant = _mm_unpackhi_epi64(dequant, dequant);
   calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);
 
-  eob = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+  eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
 
   // AC only loop.
   while (index < n_coeffs) {
@@ -100,8 +96,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
     calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);
 
-    eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
-                        zero);
+    eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
     eob = _mm_max_epi16(eob, eob0);
 
     index += 16;
@@ -110,17 +105,14 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   *eob_ptr = accumulate_eob(eob);
 }
 
-void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                                int skip_block, const int16_t *zbin_ptr,
-                                const int16_t *round_ptr,
-                                const int16_t *quant_ptr,
-                                const int16_t *quant_shift_ptr,
+void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr,
+                                const struct macroblock_plane *const mb_plane,
                                 tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                 const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                                const int16_t *scan, const int16_t *iscan) {
+                                const struct ScanOrder *const scan_order) {
   const __m128i zero = _mm_setzero_si128();
-  const __m128i one = _mm_set1_epi16(1);
   int index;
+  const int16_t *iscan = scan_order->iscan;
 
   __m128i zbin, round, quant, dequant, shift;
   __m128i coeff0, coeff1;
@@ -129,32 +121,8 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   __m128i all_zero;
   __m128i eob = zero, eob0;
 
-  (void)scan;
-  (void)n_coeffs;
-  (void)skip_block;
-  assert(!skip_block);
-
-  // Setup global values.
-  // The 32x32 halves zbin and round.
-  zbin = _mm_load_si128((const __m128i *)zbin_ptr);
-  // Shift with rounding.
-  zbin = _mm_add_epi16(zbin, one);
-  zbin = _mm_srli_epi16(zbin, 1);
-  // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so
-  // it is a strict "greater" comparison.
-  zbin = _mm_sub_epi16(zbin, one);
-
-  round = _mm_load_si128((const __m128i *)round_ptr);
-  round = _mm_add_epi16(round, one);
-  round = _mm_srli_epi16(round, 1);
-
-  quant = _mm_load_si128((const __m128i *)quant_ptr);
-  dequant = _mm_load_si128((const __m128i *)dequant_ptr);
-  shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
-  // I suspect this is not technically OK because quant_shift can be up
-  // to 1 << 16 and shifting up again will outrange that, but the test is not
-  // comprehensive enough to catch that and "it's been that way forever"
-  shift = _mm_slli_epi16(shift, 1);
+  load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant,
+                     &shift);
 
   // Do DC and first 15 AC.
   coeff0 = load_tran_low(coeff_ptr);
@@ -206,8 +174,7 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     dequant = _mm_unpackhi_epi64(dequant, dequant);
     calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8);
 
-    eob =
-        scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+    eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
   }
 
   // AC only loop.
@@ -253,8 +220,7 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero,
                                       dqcoeff_ptr + 8 + index);
 
-    eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
-                        zero);
+    eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
     eob = _mm_max_epi16(eob, eob0);
   }
 
diff --git a/vpx_dsp/x86/sad4d_avx2.c b/vpx_dsp/x86/sad4d_avx2.c
index a5c4f8c537..cf7111983b 100644
--- a/vpx_dsp/x86/sad4d_avx2.c
+++ b/vpx_dsp/x86/sad4d_avx2.c
@@ -11,8 +11,12 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
 
+// Note with sums[4] some versions of Visual Studio may fail due to parameter
+// alignment, though the functions should be equivalent:
+// error C2719: 'sums': formal parameter with requested alignment of 32 won't be
+// aligned
 static INLINE void calc_final_4(const __m256i *const sums /*[4]*/,
-                                uint32_t *sad_array) {
+                                uint32_t sad_array[4]) {
   const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]);
   const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]);
   const __m256i t2 = _mm256_hadd_epi32(t0, t1);
@@ -21,9 +25,10 @@ static INLINE void calc_final_4(const __m256i *const sums /*[4]*/,
   _mm_storeu_si128((__m128i *)sad_array, sum);
 }
 
-void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
-                          const uint8_t *const ref_array[4], int ref_stride,
-                          uint32_t sad_array[4]) {
+static INLINE void sad32xhx4d_avx2(const uint8_t *src_ptr, int src_stride,
+                                   const uint8_t *const ref_array[4],
+                                   int ref_stride, int h,
+                                   uint32_t sad_array[4]) {
   int i;
   const uint8_t *refs[4];
   __m256i sums[4];
@@ -37,7 +42,7 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
   sums[2] = _mm256_setzero_si256();
   sums[3] = _mm256_setzero_si256();
 
-  for (i = 0; i < 32; i++) {
+  for (i = 0; i < h; i++) {
     __m256i r[4];
 
     // load src and all ref[]
@@ -69,66 +74,10 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
   calc_final_4(sums, sad_array);
 }
 
-void vpx_sad32x32x8_avx2(const uint8_t *src_ptr, int src_stride,
-                         const uint8_t *ref_ptr, int ref_stride,
-                         uint32_t *sad_array) {
-  int i;
-  __m256i sums[8];
-
-  sums[0] = _mm256_setzero_si256();
-  sums[1] = _mm256_setzero_si256();
-  sums[2] = _mm256_setzero_si256();
-  sums[3] = _mm256_setzero_si256();
-  sums[4] = _mm256_setzero_si256();
-  sums[5] = _mm256_setzero_si256();
-  sums[6] = _mm256_setzero_si256();
-  sums[7] = _mm256_setzero_si256();
-
-  for (i = 0; i < 32; i++) {
-    __m256i r[8];
-
-    // load src and all ref[]
-    const __m256i s = _mm256_load_si256((const __m256i *)src_ptr);
-    r[0] = _mm256_loadu_si256((const __m256i *)&ref_ptr[0]);
-    r[1] = _mm256_loadu_si256((const __m256i *)&ref_ptr[1]);
-    r[2] = _mm256_loadu_si256((const __m256i *)&ref_ptr[2]);
-    r[3] = _mm256_loadu_si256((const __m256i *)&ref_ptr[3]);
-    r[4] = _mm256_loadu_si256((const __m256i *)&ref_ptr[4]);
-    r[5] = _mm256_loadu_si256((const __m256i *)&ref_ptr[5]);
-    r[6] = _mm256_loadu_si256((const __m256i *)&ref_ptr[6]);
-    r[7] = _mm256_loadu_si256((const __m256i *)&ref_ptr[7]);
-
-    // sum of the absolute differences between every ref[] to src
-    r[0] = _mm256_sad_epu8(r[0], s);
-    r[1] = _mm256_sad_epu8(r[1], s);
-    r[2] = _mm256_sad_epu8(r[2], s);
-    r[3] = _mm256_sad_epu8(r[3], s);
-    r[4] = _mm256_sad_epu8(r[4], s);
-    r[5] = _mm256_sad_epu8(r[5], s);
-    r[6] = _mm256_sad_epu8(r[6], s);
-    r[7] = _mm256_sad_epu8(r[7], s);
-
-    // sum every ref[]
-    sums[0] = _mm256_add_epi32(sums[0], r[0]);
-    sums[1] = _mm256_add_epi32(sums[1], r[1]);
-    sums[2] = _mm256_add_epi32(sums[2], r[2]);
-    sums[3] = _mm256_add_epi32(sums[3], r[3]);
-    sums[4] = _mm256_add_epi32(sums[4], r[4]);
-    sums[5] = _mm256_add_epi32(sums[5], r[5]);
-    sums[6] = _mm256_add_epi32(sums[6], r[6]);
-    sums[7] = _mm256_add_epi32(sums[7], r[7]);
-
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-  }
-
-  calc_final_4(sums, sad_array);
-  calc_final_4(sums + 4, sad_array + 4);
-}
-
-void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride,
-                          const uint8_t *const ref_array[4], int ref_stride,
-                          uint32_t sad_array[4]) {
+static INLINE void sad64xhx4d_avx2(const uint8_t *src_ptr, int src_stride,
+                                   const uint8_t *const ref_array[4],
+                                   int ref_stride, int h,
+                                   uint32_t sad_array[4]) {
   __m256i sums[4];
   int i;
   const uint8_t *refs[4];
@@ -142,7 +91,7 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride,
   sums[2] = _mm256_setzero_si256();
   sums[3] = _mm256_setzero_si256();
 
-  for (i = 0; i < 64; i++) {
+  for (i = 0; i < h; i++) {
     __m256i r_lo[4], r_hi[4];
     // load 64 bytes from src and all ref[]
     const __m256i s_lo = _mm256_load_si256((const __m256i *)src_ptr);
@@ -185,3 +134,51 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride,
 
   calc_final_4(sums, sad_array);
 }
+
+#define SAD64_H(h)                                                         \
+  void vpx_sad64x##h##x4d_avx2(const uint8_t *src, int src_stride,         \
+                               const uint8_t *const ref_array[4],          \
+                               int ref_stride, uint32_t sad_array[4]) {    \
+    sad64xhx4d_avx2(src, src_stride, ref_array, ref_stride, h, sad_array); \
+  }
+
+#define SAD32_H(h)                                                         \
+  void vpx_sad32x##h##x4d_avx2(const uint8_t *src, int src_stride,         \
+                               const uint8_t *const ref_array[4],          \
+                               int ref_stride, uint32_t sad_array[4]) {    \
+    sad32xhx4d_avx2(src, src_stride, ref_array, ref_stride, h, sad_array); \
+  }
+
+SAD64_H(64)
+SAD32_H(32)
+
+#define SADS64_H(h)                                                           \
+  void vpx_sad_skip_64x##h##x4d_avx2(const uint8_t *src, int src_stride,      \
+                                     const uint8_t *const ref_array[4],       \
+                                     int ref_stride, uint32_t sad_array[4]) { \
+    sad64xhx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride,           \
+                    ((h) >> 1), sad_array);                                   \
+    sad_array[0] <<= 1;                                                       \
+    sad_array[1] <<= 1;                                                       \
+    sad_array[2] <<= 1;                                                       \
+    sad_array[3] <<= 1;                                                       \
+  }
+
+#define SADS32_H(h)                                                           \
+  void vpx_sad_skip_32x##h##x4d_avx2(const uint8_t *src, int src_stride,      \
+                                     const uint8_t *const ref_array[4],       \
+                                     int ref_stride, uint32_t sad_array[4]) { \
+    sad32xhx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride,           \
+                    ((h) >> 1), sad_array);                                   \
+    sad_array[0] <<= 1;                                                       \
+    sad_array[1] <<= 1;                                                       \
+    sad_array[2] <<= 1;                                                       \
+    sad_array[3] <<= 1;                                                       \
+  }
+
+SADS64_H(64)
+SADS64_H(32)
+
+SADS32_H(64)
+SADS32_H(32)
+SADS32_H(16)
diff --git a/vpx_dsp/x86/sad4d_avx512.c b/vpx_dsp/x86/sad4d_avx512.c
index 4c5d70464d..cfd23fedd9 100644
--- a/vpx_dsp/x86/sad4d_avx512.c
+++ b/vpx_dsp/x86/sad4d_avx512.c
@@ -13,7 +13,7 @@
 
 void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride,
                             const uint8_t *const ref_array[4], int ref_stride,
-                            uint32_t res[4]) {
+                            uint32_t sad_array[4]) {
   __m512i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
   __m512i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
   __m512i sum_mlow, sum_mhigh;
@@ -78,6 +78,6 @@ void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride,
     sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum256),
                            _mm256_extractf128_si256(sum256, 1));
 
-    _mm_storeu_si128((__m128i *)(res), sum128);
+    _mm_storeu_si128((__m128i *)(sad_array), sum128);
   }
 }
diff --git a/vpx_dsp/x86/sad4d_sse2.asm b/vpx_dsp/x86/sad4d_sse2.asm
index 3f6e55ce9a..ed4ea3ef9b 100644
--- a/vpx_dsp/x86/sad4d_sse2.asm
+++ b/vpx_dsp/x86/sad4d_sse2.asm
@@ -179,13 +179,27 @@ SECTION .text
 ;                         uint8_t *ref[4], int ref_stride,
 ;                         uint32_t res[4]);
 ; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16, 8x8, 8x4, 4x8 and 4x4
-%macro SADNXN4D 2
+%macro SADNXN4D 2-3 0
+%if %3 == 1  ; skip rows
+%if UNIX64
+cglobal sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+                              res, ref2, ref3, ref4
+%else
+cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+                              ref2, ref3, ref4
+%endif
+%else  ; normal sad
 %if UNIX64
 cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
                               res, ref2, ref3, ref4
 %else
 cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
                               ref2, ref3, ref4
+%endif
+%endif
+%if %3 == 1
+  lea          src_strided, [2*src_strided]
+  lea          ref_strided, [2*ref_strided]
 %endif
   movsxdifnidn src_strideq, src_strided
   movsxdifnidn ref_strideq, ref_strided
@@ -195,9 +209,15 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
   mov                ref1q, [ref1q+gprsize*0]
 
   PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
-%rep (%2-4)/2
+%if %3 == 1  ; downsample number of rows by 2
+%define num_rep (%2-8)/4
+%else
+%define num_rep (%2-4)/2
+%endif
+%rep num_rep
   PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
 %endrep
+%undef num_rep
   PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
 
 %if %1 > 4
@@ -211,12 +231,19 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
   punpckhqdq            m5, m7
   movifnidn             r4, r4mp
   paddd                 m4, m5
+%if %3 == 1
+  pslld                 m4, 1
+%endif
   movu                [r4], m4
   RET
 %else
   movifnidn             r4, r4mp
   pshufd            m6, m6, 0x08
   pshufd            m7, m7, 0x08
+%if %3 == 1
+  pslld                 m6, 1
+  pslld                 m7, 1
+%endif
   movq              [r4+0], m6
   movq              [r4+8], m7
   RET
@@ -237,3 +264,15 @@ SADNXN4D  8,  8
 SADNXN4D  8,  4
 SADNXN4D  4,  8
 SADNXN4D  4,  4
+
+SADNXN4D 64, 64, 1
+SADNXN4D 64, 32, 1
+SADNXN4D 32, 64, 1
+SADNXN4D 32, 32, 1
+SADNXN4D 32, 16, 1
+SADNXN4D 16, 32, 1
+SADNXN4D 16, 16, 1
+SADNXN4D 16,  8, 1
+SADNXN4D  8, 16, 1
+SADNXN4D  8,  8, 1
+SADNXN4D  4,  8, 1
diff --git a/vpx_dsp/x86/sad_avx2.c b/vpx_dsp/x86/sad_avx2.c
index d944134305..e00494d766 100644
--- a/vpx_dsp/x86/sad_avx2.c
+++ b/vpx_dsp/x86/sad_avx2.c
@@ -11,88 +11,120 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_ports/mem.h"
 
+static INLINE unsigned int sad64xh_avx2(const uint8_t *src_ptr, int src_stride,
+                                        const uint8_t *ref_ptr, int ref_stride,
+                                        int h) {
+  int i, res;
+  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
+  __m256i sum_sad = _mm256_setzero_si256();
+  __m256i sum_sad_h;
+  __m128i sum_sad128;
+  for (i = 0; i < h; i++) {
+    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
+    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));
+    sad1_reg =
+        _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
+    sad2_reg = _mm256_sad_epu8(
+        ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));
+    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
+    ref_ptr += ref_stride;
+    src_ptr += src_stride;
+  }
+  sum_sad_h = _mm256_srli_si256(sum_sad, 8);
+  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
+  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
+  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
+  res = _mm_cvtsi128_si32(sum_sad128);
+  return res;
+}
+
+static INLINE unsigned int sad32xh_avx2(const uint8_t *src_ptr, int src_stride,
+                                        const uint8_t *ref_ptr, int ref_stride,
+                                        int h) {
+  int i, res;
+  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
+  __m256i sum_sad = _mm256_setzero_si256();
+  __m256i sum_sad_h;
+  __m128i sum_sad128;
+  const int ref2_stride = ref_stride << 1;
+  const int src2_stride = src_stride << 1;
+  const int max = h >> 1;
+  for (i = 0; i < max; i++) {
+    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
+    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride));
+    sad1_reg =
+        _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
+    sad2_reg = _mm256_sad_epu8(
+        ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));
+    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
+    ref_ptr += ref2_stride;
+    src_ptr += src2_stride;
+  }
+  sum_sad_h = _mm256_srli_si256(sum_sad, 8);
+  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
+  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
+  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
+  res = _mm_cvtsi128_si32(sum_sad128);
+  return res;
+}
+
 #define FSAD64_H(h)                                                           \
   unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride,   \
                                     const uint8_t *ref_ptr, int ref_stride) { \
-    int i, res;                                                               \
-    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
-    __m256i sum_sad = _mm256_setzero_si256();                                 \
-    __m256i sum_sad_h;                                                        \
-    __m128i sum_sad128;                                                       \
-    for (i = 0; i < h; i++) {                                                 \
-      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
-      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));         \
-      sad1_reg = _mm256_sad_epu8(                                             \
-          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
-      sad2_reg = _mm256_sad_epu8(                                             \
-          ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));     \
-      sum_sad =                                                               \
-          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));    \
-      ref_ptr += ref_stride;                                                  \
-      src_ptr += src_stride;                                                  \
-    }                                                                         \
-    sum_sad_h = _mm256_srli_si256(sum_sad, 8);                                \
-    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
-    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
-    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
-    res = _mm_cvtsi128_si32(sum_sad128);                                      \
-    return res;                                                               \
+    return sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h);         \
+  }
+
+#define FSADS64_H(h)                                                          \
+  unsigned int vpx_sad_skip_64x##h##_avx2(                                    \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
+      int ref_stride) {                                                       \
+    return 2 * sad64xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+                            h / 2);                                           \
   }
 
 #define FSAD32_H(h)                                                           \
   unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride,   \
                                     const uint8_t *ref_ptr, int ref_stride) { \
-    int i, res;                                                               \
-    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
-    __m256i sum_sad = _mm256_setzero_si256();                                 \
-    __m256i sum_sad_h;                                                        \
-    __m128i sum_sad128;                                                       \
-    int ref2_stride = ref_stride << 1;                                        \
-    int src2_stride = src_stride << 1;                                        \
-    int max = h >> 1;                                                         \
-    for (i = 0; i < max; i++) {                                               \
-      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
-      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
-      sad1_reg = _mm256_sad_epu8(                                             \
-          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
-      sad2_reg = _mm256_sad_epu8(                                             \
-          ref2_reg,                                                           \
-          _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));       \
-      sum_sad =                                                               \
-          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));    \
-      ref_ptr += ref2_stride;                                                 \
-      src_ptr += src2_stride;                                                 \
-    }                                                                         \
-    sum_sad_h = _mm256_srli_si256(sum_sad, 8);                                \
-    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
-    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
-    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
-    res = _mm_cvtsi128_si32(sum_sad128);                                      \
-    return res;                                                               \
+    return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h);         \
+  }
+
+#define FSADS32_H(h)                                                          \
+  unsigned int vpx_sad_skip_32x##h##_avx2(                                    \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
+      int ref_stride) {                                                       \
+    return 2 * sad32xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+                            h / 2);                                           \
   }
 
 #define FSAD64  \
-  FSAD64_H(64); \
-  FSAD64_H(32);
+  FSAD64_H(64)  \
+  FSAD64_H(32)  \
+  FSADS64_H(64) \
+  FSADS64_H(32)
 
 #define FSAD32  \
-  FSAD32_H(64); \
-  FSAD32_H(32); \
-  FSAD32_H(16);
+  FSAD32_H(64)  \
+  FSAD32_H(32)  \
+  FSAD32_H(16)  \
+  FSADS32_H(64) \
+  FSADS32_H(32) \
+  FSADS32_H(16)
 
-FSAD64;
-FSAD32;
+FSAD64
+FSAD32
 
 #undef FSAD64
 #undef FSAD32
 #undef FSAD64_H
 #undef FSAD32_H
+#undef FSADS64_H
+#undef FSADS32_H
 
 #define FSADAVG64_H(h)                                                        \
   unsigned int vpx_sad64x##h##_avg_avx2(                                      \
       const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
       int ref_stride, const uint8_t *second_pred) {                           \
-    int i, res;                                                               \
+    int i;                                                                    \
     __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
     __m256i sum_sad = _mm256_setzero_si256();                                 \
     __m256i sum_sad_h;                                                        \
@@ -118,15 +150,14 @@ FSAD32;
     sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
     sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
     sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
-    res = _mm_cvtsi128_si32(sum_sad128);                                      \
-    return res;                                                               \
+    return (unsigned int)_mm_cvtsi128_si32(sum_sad128);                       \
   }
 
 #define FSADAVG32_H(h)                                                        \
   unsigned int vpx_sad32x##h##_avg_avx2(                                      \
       const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
       int ref_stride, const uint8_t *second_pred) {                           \
-    int i, res;                                                               \
+    int i;                                                                    \
     __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
     __m256i sum_sad = _mm256_setzero_si256();                                 \
     __m256i sum_sad_h;                                                        \
@@ -156,21 +187,20 @@ FSAD32;
     sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
     sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
     sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
-    res = _mm_cvtsi128_si32(sum_sad128);                                      \
-    return res;                                                               \
+    return (unsigned int)_mm_cvtsi128_si32(sum_sad128);                       \
   }
 
-#define FSADAVG64  \
-  FSADAVG64_H(64); \
-  FSADAVG64_H(32);
+#define FSADAVG64 \
+  FSADAVG64_H(64) \
+  FSADAVG64_H(32)
 
-#define FSADAVG32  \
-  FSADAVG32_H(64); \
-  FSADAVG32_H(32); \
-  FSADAVG32_H(16);
+#define FSADAVG32 \
+  FSADAVG32_H(64) \
+  FSADAVG32_H(32) \
+  FSADAVG32_H(16)
 
-FSADAVG64;
-FSADAVG32;
+FSADAVG64
+FSADAVG32
 
 #undef FSADAVG64
 #undef FSADAVG32
diff --git a/vpx_dsp/x86/sad_sse2.asm b/vpx_dsp/x86/sad_sse2.asm
index e4e1bc3e98..627e463bf8 100644
--- a/vpx_dsp/x86/sad_sse2.asm
+++ b/vpx_dsp/x86/sad_sse2.asm
@@ -12,15 +12,29 @@
 
 SECTION .text
 
+; Macro Arguments
+; Arg 1: Width
+; Arg 2: Height
+; Arg 3: Number of general purpose registers
+; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
 %macro SAD_FN 4
-%if %4 == 0
+%if %4 == 0 ; normal sad
 %if %3 == 5
 cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
 %else ; %3 == 7
 cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                             src_stride3, ref_stride3, n_rows
 %endif ; %3 == 5/7
-%else ; avg
+
+%elif %4 == 2 ; skip
+%if %3 == 5
+cglobal sad_skip_%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal sad_skip_%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
+                            src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+
+%else
 %if %3 == 5
 cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
                                     second_pred, n_rows
@@ -35,7 +49,11 @@ cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \
 %define n_rowsd dword r0m
 %endif ; x86-32/64
 %endif ; %3 == 5/7
-%endif ; avg/sad
+%endif ; sad/avg/skip
+%if %4 == 2; skip rows so double the stride
+lea           src_strided, [src_strided*2]
+lea           ref_strided, [ref_strided*2]
+%endif ; %4 skip
   movsxdifnidn src_strideq, src_strided
   movsxdifnidn ref_strideq, ref_strided
 %if %3 == 7
@@ -48,7 +66,11 @@ cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \
 ;                                uint8_t *ref, int ref_stride);
 %macro SAD64XN 1-2 0
   SAD_FN 64, %1, 5, %2
+%if %2 == 2
+  mov              n_rowsd, %1/2
+%else
   mov              n_rowsd, %1
+%endif
   pxor                  m0, m0
 .loop:
   movu                  m1, [refq]
@@ -77,6 +99,9 @@ cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \
 
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
@@ -86,12 +111,18 @@ SAD64XN 64 ; sad64x64_sse2
 SAD64XN 32 ; sad64x32_sse2
 SAD64XN 64, 1 ; sad64x64_avg_sse2
 SAD64XN 32, 1 ; sad64x32_avg_sse2
+SAD64XN  64, 2  ; sad64x64_skip_sse2
+SAD64XN  32, 2  ; sad64x32_skip_sse2
 
 ; unsigned int vpx_sad32x32_sse2(uint8_t *src, int src_stride,
 ;                                uint8_t *ref, int ref_stride);
 %macro SAD32XN 1-2 0
   SAD_FN 32, %1, 5, %2
+%if %2 == 2
+  mov              n_rowsd, %1/4
+%else
   mov              n_rowsd, %1/2
+%endif
   pxor                  m0, m0
 .loop:
   movu                  m1, [refq]
@@ -120,6 +151,9 @@ SAD64XN 32, 1 ; sad64x32_avg_sse2
 
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
@@ -131,12 +165,19 @@ SAD32XN 16 ; sad32x16_sse2
 SAD32XN 64, 1 ; sad32x64_avg_sse2
 SAD32XN 32, 1 ; sad32x32_avg_sse2
 SAD32XN 16, 1 ; sad32x16_avg_sse2
+SAD32XN 64, 2 ; sad32x64_skip_sse2
+SAD32XN 32, 2 ; sad32x32_skip_sse2
+SAD32XN 16, 2 ; sad32x16_skip_sse2
 
 ; unsigned int vpx_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
 %macro SAD16XN 1-2 0
   SAD_FN 16, %1, 7, %2
+%if %2 == 2
+  mov              n_rowsd, %1/8
+%else
   mov              n_rowsd, %1/4
+%endif
   pxor                  m0, m0
 
 .loop:
@@ -166,6 +207,9 @@ SAD32XN 16, 1 ; sad32x16_avg_sse2
 
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
@@ -177,12 +221,19 @@ SAD16XN  8 ; sad16x8_sse2
 SAD16XN 32, 1 ; sad16x32_avg_sse2
 SAD16XN 16, 1 ; sad16x16_avg_sse2
 SAD16XN  8, 1 ; sad16x8_avg_sse2
+SAD16XN 32, 2 ; sad16x32_skip_sse2
+SAD16XN 16, 2 ; sad16x16_skip_sse2
+SAD16XN  8, 2 ; sad16x8_skip_sse2
 
 ; unsigned int vpx_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
 ;                                   uint8_t *ref, int ref_stride);
 %macro SAD8XN 1-2 0
   SAD_FN 8, %1, 7, %2
+%if %2 == 2
+  mov              n_rowsd, %1/8
+%else
   mov              n_rowsd, %1/4
+%endif
   pxor                  m0, m0
 
 .loop:
@@ -210,6 +261,9 @@ SAD16XN  8, 1 ; sad16x8_avg_sse2
 
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
@@ -221,12 +275,18 @@ SAD8XN  4 ; sad8x4_sse2
 SAD8XN 16, 1 ; sad8x16_avg_sse2
 SAD8XN  8, 1 ; sad8x8_avg_sse2
 SAD8XN  4, 1 ; sad8x4_avg_sse2
+SAD8XN 16, 2 ; sad8x16_skip_sse2
+SAD8XN  8, 2 ; sad8x8_skip_sse2
 
 ; unsigned int vpx_sad4x{4, 8}_sse2(uint8_t *src, int src_stride,
 ;                                   uint8_t *ref, int ref_stride);
 %macro SAD4XN 1-2 0
   SAD_FN 4, %1, 7, %2
+%if %2 == 2
+  mov              n_rowsd, %1/8
+%else
   mov              n_rowsd, %1/4
+%endif
   pxor                  m0, m0
 
 .loop:
@@ -257,6 +317,9 @@ SAD8XN  4, 1 ; sad8x4_avg_sse2
 
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
@@ -266,3 +329,4 @@ SAD4XN  8 ; sad4x8_sse
 SAD4XN  4 ; sad4x4_sse
 SAD4XN  8, 1 ; sad4x8_avg_sse
 SAD4XN  4, 1 ; sad4x4_avg_sse
+SAD4XN  8, 2 ; sad4x8_skip_sse
diff --git a/vpx_dsp/x86/sad_sse3.asm b/vpx_dsp/x86/sad_sse3.asm
deleted file mode 100644
index acbd2e4fa1..0000000000
--- a/vpx_dsp/x86/sad_sse3.asm
+++ /dev/null
@@ -1,376 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro STACK_FRAME_CREATE_X3 0
-%if ABI_IS_32BIT
-  %define     src_ptr       rsi
-  %define     src_stride    rax
-  %define     ref_ptr       rdi
-  %define     ref_stride    rdx
-  %define     end_ptr       rcx
-  %define     ret_var       rbx
-  %define     result_ptr    arg(4)
-  %define     height        dword ptr arg(4)
-    push        rbp
-    mov         rbp,        rsp
-    push        rsi
-    push        rdi
-    push        rbx
-
-    mov         rsi,        arg(0)              ; src_ptr
-    mov         rdi,        arg(2)              ; ref_ptr
-
-    movsxd      rax,        dword ptr arg(1)    ; src_stride
-    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
-%else
-  %if LIBVPX_YASM_WIN64
-    SAVE_XMM 7, u
-    %define     src_ptr     rcx
-    %define     src_stride  rdx
-    %define     ref_ptr     r8
-    %define     ref_stride  r9
-    %define     end_ptr     r10
-    %define     ret_var     r11
-    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
-    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]
-  %else
-    %define     src_ptr     rdi
-    %define     src_stride  rsi
-    %define     ref_ptr     rdx
-    %define     ref_stride  rcx
-    %define     end_ptr     r9
-    %define     ret_var     r10
-    %define     result_ptr  r8
-    %define     height      r8
-  %endif
-%endif
-
-%endmacro
-
-%macro STACK_FRAME_DESTROY_X3 0
-  %define     src_ptr
-  %define     src_stride
-  %define     ref_ptr
-  %define     ref_stride
-  %define     end_ptr
-  %define     ret_var
-  %define     result_ptr
-  %define     height
-
-%if ABI_IS_32BIT
-    pop         rbx
-    pop         rdi
-    pop         rsi
-    pop         rbp
-%else
-  %if LIBVPX_YASM_WIN64
-    RESTORE_XMM
-  %endif
-%endif
-    ret
-%endmacro
-
-%macro PROCESS_16X2X3 5
-%if %1==0
-        movdqa          xmm0,       XMMWORD PTR [%2]
-        lddqu           xmm5,       XMMWORD PTR [%3]
-        lddqu           xmm6,       XMMWORD PTR [%3+1]
-        lddqu           xmm7,       XMMWORD PTR [%3+2]
-
-        psadbw          xmm5,       xmm0
-        psadbw          xmm6,       xmm0
-        psadbw          xmm7,       xmm0
-%else
-        movdqa          xmm0,       XMMWORD PTR [%2]
-        lddqu           xmm1,       XMMWORD PTR [%3]
-        lddqu           xmm2,       XMMWORD PTR [%3+1]
-        lddqu           xmm3,       XMMWORD PTR [%3+2]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endif
-        movdqa          xmm0,       XMMWORD PTR [%2+%4]
-        lddqu           xmm1,       XMMWORD PTR [%3+%5]
-        lddqu           xmm2,       XMMWORD PTR [%3+%5+1]
-        lddqu           xmm3,       XMMWORD PTR [%3+%5+2]
-
-%if %1==0 || %1==1
-        lea             %2,         [%2+%4*2]
-        lea             %3,         [%3+%5*2]
-%endif
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endmacro
-
-%macro PROCESS_8X2X3 5
-%if %1==0
-        movq            mm0,       QWORD PTR [%2]
-        movq            mm5,       QWORD PTR [%3]
-        movq            mm6,       QWORD PTR [%3+1]
-        movq            mm7,       QWORD PTR [%3+2]
-
-        psadbw          mm5,       mm0
-        psadbw          mm6,       mm0
-        psadbw          mm7,       mm0
-%else
-        movq            mm0,       QWORD PTR [%2]
-        movq            mm1,       QWORD PTR [%3]
-        movq            mm2,       QWORD PTR [%3+1]
-        movq            mm3,       QWORD PTR [%3+2]
-
-        psadbw          mm1,       mm0
-        psadbw          mm2,       mm0
-        psadbw          mm3,       mm0
-
-        paddw           mm5,       mm1
-        paddw           mm6,       mm2
-        paddw           mm7,       mm3
-%endif
-        movq            mm0,       QWORD PTR [%2+%4]
-        movq            mm1,       QWORD PTR [%3+%5]
-        movq            mm2,       QWORD PTR [%3+%5+1]
-        movq            mm3,       QWORD PTR [%3+%5+2]
-
-%if %1==0 || %1==1
-        lea             %2,        [%2+%4*2]
-        lea             %3,        [%3+%5*2]
-%endif
-
-        psadbw          mm1,       mm0
-        psadbw          mm2,       mm0
-        psadbw          mm3,       mm0
-
-        paddw           mm5,       mm1
-        paddw           mm6,       mm2
-        paddw           mm7,       mm3
-%endmacro
-
-SECTION .text
-
-;void int vpx_sad16x16x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-globalsym(vpx_sad16x16x3_sse3)
-sym(vpx_sad16x16x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
-        mov             rcx,        result_ptr
-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rcx],      xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rcx+4],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rcx+8],    xmm0
-
-    STACK_FRAME_DESTROY_X3
-
-;void int vpx_sad16x8x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-globalsym(vpx_sad16x8x3_sse3)
-sym(vpx_sad16x8x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
-        mov             rcx,        result_ptr
-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rcx],      xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rcx+4],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rcx+8],    xmm0
-
-    STACK_FRAME_DESTROY_X3
-
-;void int vpx_sad8x16x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-globalsym(vpx_sad8x16x3_sse3)
-sym(vpx_sad8x16x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
-        mov             rcx,        result_ptr
-
-        punpckldq       mm5,        mm6
-
-        movq            [rcx],      mm5
-        movd            [rcx+8],    mm7
-
-    STACK_FRAME_DESTROY_X3
-
-;void int vpx_sad8x8x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-globalsym(vpx_sad8x8x3_sse3)
-sym(vpx_sad8x8x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
-        mov             rcx,        result_ptr
-
-        punpckldq       mm5,        mm6
-
-        movq            [rcx],      mm5
-        movd            [rcx+8],    mm7
-
-    STACK_FRAME_DESTROY_X3
-
-;void int vpx_sad4x4x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-globalsym(vpx_sad4x4x3_sse3)
-sym(vpx_sad4x4x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        movd            mm0,        DWORD PTR [src_ptr]
-        movd            mm1,        DWORD PTR [ref_ptr]
-
-        movd            mm2,        DWORD PTR [src_ptr+src_stride]
-        movd            mm3,        DWORD PTR [ref_ptr+ref_stride]
-
-        punpcklbw       mm0,        mm2
-        punpcklbw       mm1,        mm3
-
-        movd            mm4,        DWORD PTR [ref_ptr+1]
-        movd            mm5,        DWORD PTR [ref_ptr+2]
-
-        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
-        movd            mm3,        DWORD PTR [ref_ptr+ref_stride+2]
-
-        psadbw          mm1,        mm0
-
-        punpcklbw       mm4,        mm2
-        punpcklbw       mm5,        mm3
-
-        psadbw          mm4,        mm0
-        psadbw          mm5,        mm0
-
-        lea             src_ptr,    [src_ptr+src_stride*2]
-        lea             ref_ptr,    [ref_ptr+ref_stride*2]
-
-        movd            mm0,        DWORD PTR [src_ptr]
-        movd            mm2,        DWORD PTR [ref_ptr]
-
-        movd            mm3,        DWORD PTR [src_ptr+src_stride]
-        movd            mm6,        DWORD PTR [ref_ptr+ref_stride]
-
-        punpcklbw       mm0,        mm3
-        punpcklbw       mm2,        mm6
-
-        movd            mm3,        DWORD PTR [ref_ptr+1]
-        movd            mm7,        DWORD PTR [ref_ptr+2]
-
-        psadbw          mm2,        mm0
-
-        paddw           mm1,        mm2
-
-        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
-        movd            mm6,        DWORD PTR [ref_ptr+ref_stride+2]
-
-        punpcklbw       mm3,        mm2
-        punpcklbw       mm7,        mm6
-
-        psadbw          mm3,        mm0
-        psadbw          mm7,        mm0
-
-        paddw           mm3,        mm4
-        paddw           mm7,        mm5
-
-        mov             rcx,        result_ptr
-
-        punpckldq       mm1,        mm3
-
-        movq            [rcx],      mm1
-        movd            [rcx+8],    mm7
-
-    STACK_FRAME_DESTROY_X3
diff --git a/vpx_dsp/x86/sad_sse4.asm b/vpx_dsp/x86/sad_sse4.asm
deleted file mode 100644
index 0818ed5f0a..0000000000
--- a/vpx_dsp/x86/sad_sse4.asm
+++ /dev/null
@@ -1,361 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro PROCESS_16X2X8 1
-%if %1
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movq            xmm1,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        movq            xmm2,       MMWORD PTR [rdi+16]
-        punpcklqdq      xmm1,       xmm3
-        punpcklqdq      xmm3,       xmm2
-
-        movdqa          xmm2,       xmm1
-        mpsadbw         xmm1,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-
-        psrldq          xmm0,       8
-
-        movdqa          xmm4,       xmm3
-        mpsadbw         xmm3,       xmm0,  0x0
-        mpsadbw         xmm4,       xmm0,  0x5
-
-        paddw           xmm1,       xmm2
-        paddw           xmm1,       xmm3
-        paddw           xmm1,       xmm4
-%else
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movq            xmm5,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        movq            xmm2,       MMWORD PTR [rdi+16]
-        punpcklqdq      xmm5,       xmm3
-        punpcklqdq      xmm3,       xmm2
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-
-        psrldq          xmm0,       8
-
-        movdqa          xmm4,       xmm3
-        mpsadbw         xmm3,       xmm0,  0x0
-        mpsadbw         xmm4,       xmm0,  0x5
-
-        paddw           xmm5,       xmm2
-        paddw           xmm5,       xmm3
-        paddw           xmm5,       xmm4
-
-        paddw           xmm1,       xmm5
-%endif
-        movdqa          xmm0,       XMMWORD PTR [rsi + rax]
-        movq            xmm5,       MMWORD PTR [rdi+ rdx]
-        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
-        movq            xmm2,       MMWORD PTR [rdi+ rdx+16]
-        punpcklqdq      xmm5,       xmm3
-        punpcklqdq      xmm3,       xmm2
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-
-        psrldq          xmm0,       8
-        movdqa          xmm4,       xmm3
-        mpsadbw         xmm3,       xmm0,  0x0
-        mpsadbw         xmm4,       xmm0,  0x5
-
-        paddw           xmm5,       xmm2
-        paddw           xmm5,       xmm3
-        paddw           xmm5,       xmm4
-
-        paddw           xmm1,       xmm5
-%endmacro
-
-%macro PROCESS_8X2X8 1
-%if %1
-        movq            xmm0,       MMWORD PTR [rsi]
-        movq            xmm1,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm1,       xmm3
-
-        movdqa          xmm2,       xmm1
-        mpsadbw         xmm1,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-        paddw           xmm1,       xmm2
-%else
-        movq            xmm0,       MMWORD PTR [rsi]
-        movq            xmm5,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm5,       xmm3
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-        paddw           xmm5,       xmm2
-
-        paddw           xmm1,       xmm5
-%endif
-        movq            xmm0,       MMWORD PTR [rsi + rax]
-        movq            xmm5,       MMWORD PTR [rdi+ rdx]
-        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
-        punpcklqdq      xmm5,       xmm3
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-        paddw           xmm5,       xmm2
-
-        paddw           xmm1,       xmm5
-%endmacro
-
-%macro PROCESS_4X2X8 1
-%if %1
-        movd            xmm0,       [rsi]
-        movq            xmm1,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm1,       xmm3
-
-        mpsadbw         xmm1,       xmm0,  0x0
-%else
-        movd            xmm0,       [rsi]
-        movq            xmm5,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm5,       xmm3
-
-        mpsadbw         xmm5,       xmm0,  0x0
-
-        paddw           xmm1,       xmm5
-%endif
-        movd            xmm0,       [rsi + rax]
-        movq            xmm5,       MMWORD PTR [rdi+ rdx]
-        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
-        punpcklqdq      xmm5,       xmm3
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        mpsadbw         xmm5,       xmm0,  0x0
-
-        paddw           xmm1,       xmm5
-%endmacro
-
-%macro WRITE_AS_INTS 0
-    mov             rdi,        arg(4)           ;Results
-    pxor            xmm0, xmm0
-    movdqa          xmm2, xmm1
-    punpcklwd       xmm1, xmm0
-    punpckhwd       xmm2, xmm0
-
-    movdqa          [rdi],    xmm1
-    movdqa          [rdi + 16],    xmm2
-%endmacro
-
-SECTION .text
-
-;void vpx_sad16x16x8_sse4_1(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array);
-globalsym(vpx_sad16x16x8_sse4_1)
-sym(vpx_sad16x16x8_sse4_1):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0)           ;src_ptr
-    mov             rdi,        arg(2)           ;ref_ptr
-
-    movsxd          rax,        dword ptr arg(1) ;src_stride
-    movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-    PROCESS_16X2X8 1
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-
-    WRITE_AS_INTS
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vpx_sad16x8x8_sse4_1(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-globalsym(vpx_sad16x8x8_sse4_1)
-sym(vpx_sad16x8x8_sse4_1):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0)           ;src_ptr
-    mov             rdi,        arg(2)           ;ref_ptr
-
-    movsxd          rax,        dword ptr arg(1) ;src_stride
-    movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-    PROCESS_16X2X8 1
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-
-    WRITE_AS_INTS
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vpx_sad8x8x8_sse4_1(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-globalsym(vpx_sad8x8x8_sse4_1)
-sym(vpx_sad8x8x8_sse4_1):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0)           ;src_ptr
-    mov             rdi,        arg(2)           ;ref_ptr
-
-    movsxd          rax,        dword ptr arg(1) ;src_stride
-    movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-    PROCESS_8X2X8 1
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-
-    WRITE_AS_INTS
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vpx_sad8x16x8_sse4_1(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-globalsym(vpx_sad8x16x8_sse4_1)
-sym(vpx_sad8x16x8_sse4_1):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0)           ;src_ptr
-    mov             rdi,        arg(2)           ;ref_ptr
-
-    movsxd          rax,        dword ptr arg(1) ;src_stride
-    movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-    PROCESS_8X2X8 1
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-
-    WRITE_AS_INTS
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vpx_sad4x4x8_sse4_1(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-globalsym(vpx_sad4x4x8_sse4_1)
-sym(vpx_sad4x4x8_sse4_1):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0)           ;src_ptr
-    mov             rdi,        arg(2)           ;ref_ptr
-
-    movsxd          rax,        dword ptr arg(1) ;src_stride
-    movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-    PROCESS_4X2X8 1
-    PROCESS_4X2X8 0
-
-    WRITE_AS_INTS
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-
diff --git a/vpx_dsp/x86/sad_ssse3.asm b/vpx_dsp/x86/sad_ssse3.asm
deleted file mode 100644
index a5bc6d7306..0000000000
--- a/vpx_dsp/x86/sad_ssse3.asm
+++ /dev/null
@@ -1,372 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro PROCESS_16X2X3 1
-%if %1
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        lddqu           xmm5,       XMMWORD PTR [rdi]
-        lddqu           xmm6,       XMMWORD PTR [rdi+1]
-        lddqu           xmm7,       XMMWORD PTR [rdi+2]
-
-        psadbw          xmm5,       xmm0
-        psadbw          xmm6,       xmm0
-        psadbw          xmm7,       xmm0
-%else
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        lddqu           xmm1,       XMMWORD PTR [rdi]
-        lddqu           xmm2,       XMMWORD PTR [rdi+1]
-        lddqu           xmm3,       XMMWORD PTR [rdi+2]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endif
-        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
-        lddqu           xmm1,       XMMWORD PTR [rdi+rdx]
-        lddqu           xmm2,       XMMWORD PTR [rdi+rdx+1]
-        lddqu           xmm3,       XMMWORD PTR [rdi+rdx+2]
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endmacro
-
-%macro PROCESS_16X2X3_OFFSET 2
-%if %1
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movdqa          xmm4,       XMMWORD PTR [rdi]
-        movdqa          xmm7,       XMMWORD PTR [rdi+16]
-
-        movdqa          xmm5,       xmm7
-        palignr         xmm5,       xmm4,       %2
-
-        movdqa          xmm6,       xmm7
-        palignr         xmm6,       xmm4,       (%2+1)
-
-        palignr         xmm7,       xmm4,       (%2+2)
-
-        psadbw          xmm5,       xmm0
-        psadbw          xmm6,       xmm0
-        psadbw          xmm7,       xmm0
-%else
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movdqa          xmm4,       XMMWORD PTR [rdi]
-        movdqa          xmm3,       XMMWORD PTR [rdi+16]
-
-        movdqa          xmm1,       xmm3
-        palignr         xmm1,       xmm4,       %2
-
-        movdqa          xmm2,       xmm3
-        palignr         xmm2,       xmm4,       (%2+1)
-
-        palignr         xmm3,       xmm4,       (%2+2)
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endif
-        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
-        movdqa          xmm4,       XMMWORD PTR [rdi+rdx]
-        movdqa          xmm3,       XMMWORD PTR [rdi+rdx+16]
-
-        movdqa          xmm1,       xmm3
-        palignr         xmm1,       xmm4,       %2
-
-        movdqa          xmm2,       xmm3
-        palignr         xmm2,       xmm4,       (%2+1)
-
-        palignr         xmm3,       xmm4,       (%2+2)
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endmacro
-
-%macro PROCESS_16X16X3_OFFSET 2
-%2_aligned_by_%1:
-
-        sub             rdi,        %1
-
-        PROCESS_16X2X3_OFFSET 1, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-
-        jmp             %2_store_off
-
-%endmacro
-
-%macro PROCESS_16X8X3_OFFSET 2
-%2_aligned_by_%1:
-
-        sub             rdi,        %1
-
-        PROCESS_16X2X3_OFFSET 1, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-
-        jmp             %2_store_off
-
-%endmacro
-
-SECTION .text
-
-;void int vpx_sad16x16x3_ssse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-globalsym(vpx_sad16x16x3_ssse3)
-sym(vpx_sad16x16x3_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rcx
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        mov             rdx,        0xf
-        and             rdx,        rdi
-
-        jmp .vpx_sad16x16x3_ssse3_skiptable
-.vpx_sad16x16x3_ssse3_jumptable:
-        dd .vpx_sad16x16x3_ssse3_aligned_by_0  - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_1  - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_2  - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_3  - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_4  - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_5  - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_6  - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_7  - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_8  - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_9  - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_10 - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_11 - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_12 - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_13 - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_14 - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_15 - .vpx_sad16x16x3_ssse3_do_jump
-.vpx_sad16x16x3_ssse3_skiptable:
-
-        call .vpx_sad16x16x3_ssse3_do_jump
-.vpx_sad16x16x3_ssse3_do_jump:
-        pop             rcx                         ; get the address of do_jump
-        mov             rax,  .vpx_sad16x16x3_ssse3_jumptable - .vpx_sad16x16x3_ssse3_do_jump
-        add             rax,  rcx  ; get the absolute address of vpx_sad16x16x3_ssse3_jumptable
-
-        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
-        add             rcx,        rax
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        jmp             rcx
-
-        PROCESS_16X16X3_OFFSET 0,  .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 1,  .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 2,  .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 3,  .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 4,  .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 5,  .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 6,  .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 7,  .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 8,  .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 9,  .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 10, .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 11, .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 12, .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 13, .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 14, .vpx_sad16x16x3_ssse3
-
-.vpx_sad16x16x3_ssse3_aligned_by_15:
-        PROCESS_16X2X3 1
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-
-.vpx_sad16x16x3_ssse3_store_off:
-        mov             rdi,        arg(4) ;Results
-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rdi],      xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rdi+4],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rdi+8],    xmm0
-
-    ; begin epilog
-    pop         rcx
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void int vpx_sad16x8x3_ssse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-globalsym(vpx_sad16x8x3_ssse3)
-sym(vpx_sad16x8x3_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rcx
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        mov             rdx,        0xf
-        and             rdx,        rdi
-
-        jmp .vpx_sad16x8x3_ssse3_skiptable
-.vpx_sad16x8x3_ssse3_jumptable:
-        dd .vpx_sad16x8x3_ssse3_aligned_by_0  - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_1  - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_2  - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_3  - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_4  - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_5  - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_6  - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_7  - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_8  - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_9  - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_10 - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_11 - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_12 - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_13 - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_14 - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_15 - .vpx_sad16x8x3_ssse3_do_jump
-.vpx_sad16x8x3_ssse3_skiptable:
-
-        call .vpx_sad16x8x3_ssse3_do_jump
-.vpx_sad16x8x3_ssse3_do_jump:
-        pop             rcx                         ; get the address of do_jump
-        mov             rax,  .vpx_sad16x8x3_ssse3_jumptable - .vpx_sad16x8x3_ssse3_do_jump
-        add             rax,  rcx  ; get the absolute address of vpx_sad16x8x3_ssse3_jumptable
-
-        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
-        add             rcx,        rax
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        jmp             rcx
-
-        PROCESS_16X8X3_OFFSET 0,  .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 1,  .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 2,  .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 3,  .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 4,  .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 5,  .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 6,  .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 7,  .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 8,  .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 9,  .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 10, .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 11, .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 12, .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 13, .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 14, .vpx_sad16x8x3_ssse3
-
-.vpx_sad16x8x3_ssse3_aligned_by_15:
-
-        PROCESS_16X2X3 1
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-
-.vpx_sad16x8x3_ssse3_store_off:
-        mov             rdi,        arg(4) ;Results
-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rdi],      xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rdi+4],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rdi+8],    xmm0
-
-    ; begin epilog
-    pop         rcx
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/vpx_dsp/x86/sse_avx2.c b/vpx_dsp/x86/sse_avx2.c
new file mode 100644
index 0000000000..dfe45b6115
--- /dev/null
+++ b/vpx_dsp/x86/sse_avx2.c
@@ -0,0 +1,368 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>
+#include <smmintrin.h>
+#include <stdint.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/x86/mem_sse2.h"
+
+static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a,
+                                const uint8_t *b) {
+  const __m256i v_a0 = _mm256_loadu_si256((const __m256i *)a);
+  const __m256i v_b0 = _mm256_loadu_si256((const __m256i *)b);
+  const __m256i zero = _mm256_setzero_si256();
+  const __m256i v_a00_w = _mm256_unpacklo_epi8(v_a0, zero);
+  const __m256i v_a01_w = _mm256_unpackhi_epi8(v_a0, zero);
+  const __m256i v_b00_w = _mm256_unpacklo_epi8(v_b0, zero);
+  const __m256i v_b01_w = _mm256_unpackhi_epi8(v_b0, zero);
+  const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w);
+  const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w);
+  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w));
+  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d01_w, v_d01_w));
+}
+
+static INLINE int64_t summary_all_avx2(const __m256i *sum_all) {
+  int64_t sum;
+  __m256i zero = _mm256_setzero_si256();
+  const __m256i sum0_4x64 = _mm256_unpacklo_epi32(*sum_all, zero);
+  const __m256i sum1_4x64 = _mm256_unpackhi_epi32(*sum_all, zero);
+  const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64);
+  const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64),
+                                         _mm256_extracti128_si256(sum_4x64, 1));
+  const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+  _mm_storel_epi64((__m128i *)&sum, sum_1x64);
+  return sum;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void summary_32_avx2(const __m256i *sum32, __m256i *sum) {
+  const __m256i sum0_4x64 =
+      _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum32));
+  const __m256i sum1_4x64 =
+      _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum32, 1));
+  const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64);
+  *sum = _mm256_add_epi64(*sum, sum_4x64);
+}
+
+static INLINE int64_t summary_4x64_avx2(const __m256i sum_4x64) {
+  int64_t sum;
+  const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64),
+                                         _mm256_extracti128_si256(sum_4x64, 1));
+  const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+
+  _mm_storel_epi64((__m128i *)&sum, sum_1x64);
+  return sum;
+}
+#endif
+
+static INLINE void sse_w4x4_avx2(const uint8_t *a, int a_stride,
+                                 const uint8_t *b, int b_stride, __m256i *sum) {
+  const __m128i v_a0 = load_unaligned_u32(a);
+  const __m128i v_a1 = load_unaligned_u32(a + a_stride);
+  const __m128i v_a2 = load_unaligned_u32(a + a_stride * 2);
+  const __m128i v_a3 = load_unaligned_u32(a + a_stride * 3);
+  const __m128i v_b0 = load_unaligned_u32(b);
+  const __m128i v_b1 = load_unaligned_u32(b + b_stride);
+  const __m128i v_b2 = load_unaligned_u32(b + b_stride * 2);
+  const __m128i v_b3 = load_unaligned_u32(b + b_stride * 3);
+  const __m128i v_a0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_a0, v_a1),
+                                             _mm_unpacklo_epi32(v_a2, v_a3));
+  const __m128i v_b0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_b0, v_b1),
+                                             _mm_unpacklo_epi32(v_b2, v_b3));
+  const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123);
+  const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123);
+  const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
+}
+
+static INLINE void sse_w8x2_avx2(const uint8_t *a, int a_stride,
+                                 const uint8_t *b, int b_stride, __m256i *sum) {
+  const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a);
+  const __m128i v_a1 = _mm_loadl_epi64((const __m128i *)(a + a_stride));
+  const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b);
+  const __m128i v_b1 = _mm_loadl_epi64((const __m128i *)(b + b_stride));
+  const __m256i v_a_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1));
+  const __m256i v_b_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1));
+  const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
+}
+
+int64_t vpx_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b,
+                     int b_stride, int width, int height) {
+  int32_t y = 0;
+  int64_t sse = 0;
+  __m256i sum = _mm256_setzero_si256();
+  __m256i zero = _mm256_setzero_si256();
+  switch (width) {
+    case 4:
+      do {
+        sse_w4x4_avx2(a, a_stride, b, b_stride, &sum);
+        a += a_stride << 2;
+        b += b_stride << 2;
+        y += 4;
+      } while (y < height);
+      sse = summary_all_avx2(&sum);
+      break;
+    case 8:
+      do {
+        sse_w8x2_avx2(a, a_stride, b, b_stride, &sum);
+        a += a_stride << 1;
+        b += b_stride << 1;
+        y += 2;
+      } while (y < height);
+      sse = summary_all_avx2(&sum);
+      break;
+    case 16:
+      do {
+        const __m128i v_a0 = _mm_loadu_si128((const __m128i *)a);
+        const __m128i v_a1 = _mm_loadu_si128((const __m128i *)(a + a_stride));
+        const __m128i v_b0 = _mm_loadu_si128((const __m128i *)b);
+        const __m128i v_b1 = _mm_loadu_si128((const __m128i *)(b + b_stride));
+        const __m256i v_a =
+            _mm256_insertf128_si256(_mm256_castsi128_si256(v_a0), v_a1, 0x01);
+        const __m256i v_b =
+            _mm256_insertf128_si256(_mm256_castsi128_si256(v_b0), v_b1, 0x01);
+        const __m256i v_al = _mm256_unpacklo_epi8(v_a, zero);
+        const __m256i v_au = _mm256_unpackhi_epi8(v_a, zero);
+        const __m256i v_bl = _mm256_unpacklo_epi8(v_b, zero);
+        const __m256i v_bu = _mm256_unpackhi_epi8(v_b, zero);
+        const __m256i v_asub = _mm256_sub_epi16(v_al, v_bl);
+        const __m256i v_bsub = _mm256_sub_epi16(v_au, v_bu);
+        const __m256i temp =
+            _mm256_add_epi32(_mm256_madd_epi16(v_asub, v_asub),
+                             _mm256_madd_epi16(v_bsub, v_bsub));
+        sum = _mm256_add_epi32(sum, temp);
+        a += a_stride << 1;
+        b += b_stride << 1;
+        y += 2;
+      } while (y < height);
+      sse = summary_all_avx2(&sum);
+      break;
+    case 32:
+      do {
+        sse_w32_avx2(&sum, a, b);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_avx2(&sum);
+      break;
+    case 64:
+      do {
+        sse_w32_avx2(&sum, a, b);
+        sse_w32_avx2(&sum, a + 32, b + 32);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_avx2(&sum);
+      break;
+    default:
+      if ((width & 0x07) == 0) {
+        do {
+          int i = 0;
+          do {
+            sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum);
+            i += 8;
+          } while (i < width);
+          a += a_stride << 1;
+          b += b_stride << 1;
+          y += 2;
+        } while (y < height);
+      } else {
+        do {
+          int i = 0;
+          do {
+            const uint8_t *a2;
+            const uint8_t *b2;
+            sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum);
+            a2 = a + i + (a_stride << 1);
+            b2 = b + i + (b_stride << 1);
+            sse_w8x2_avx2(a2, a_stride, b2, b_stride, &sum);
+            i += 8;
+          } while (i + 4 < width);
+          sse_w4x4_avx2(a + i, a_stride, b + i, b_stride, &sum);
+          a += a_stride << 2;
+          b += b_stride << 2;
+          y += 4;
+        } while (y < height);
+      }
+      sse = summary_all_avx2(&sum);
+      break;
+  }
+
+  return sse;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a,
+                                       const uint16_t *b) {
+  const __m256i v_a_w = _mm256_loadu_si256((const __m256i *)a);
+  const __m256i v_b_w = _mm256_loadu_si256((const __m256i *)b);
+  const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
+}
+
+static INLINE void highbd_sse_w4x4_avx2(__m256i *sum, const uint16_t *a,
+                                        int a_stride, const uint16_t *b,
+                                        int b_stride) {
+  const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a);
+  const __m128i v_a1 = _mm_loadl_epi64((const __m128i *)(a + a_stride));
+  const __m128i v_a2 = _mm_loadl_epi64((const __m128i *)(a + a_stride * 2));
+  const __m128i v_a3 = _mm_loadl_epi64((const __m128i *)(a + a_stride * 3));
+  const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b);
+  const __m128i v_b1 = _mm_loadl_epi64((const __m128i *)(b + b_stride));
+  const __m128i v_b2 = _mm_loadl_epi64((const __m128i *)(b + b_stride * 2));
+  const __m128i v_b3 = _mm_loadl_epi64((const __m128i *)(b + b_stride * 3));
+  const __m128i v_a_hi = _mm_unpacklo_epi64(v_a0, v_a1);
+  const __m128i v_a_lo = _mm_unpacklo_epi64(v_a2, v_a3);
+  const __m256i v_a_w =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(v_a_lo), v_a_hi, 1);
+  const __m128i v_b_hi = _mm_unpacklo_epi64(v_b0, v_b1);
+  const __m128i v_b_lo = _mm_unpacklo_epi64(v_b2, v_b3);
+  const __m256i v_b_w =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(v_b_lo), v_b_hi, 1);
+  const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
+}
+
+static INLINE void highbd_sse_w8x2_avx2(__m256i *sum, const uint16_t *a,
+                                        int a_stride, const uint16_t *b,
+                                        int b_stride) {
+  const __m128i v_a_hi = _mm_loadu_si128((const __m128i *)(a + a_stride));
+  const __m128i v_a_lo = _mm_loadu_si128((const __m128i *)a);
+  const __m256i v_a_w =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(v_a_lo), v_a_hi, 1);
+  const __m128i v_b_hi = _mm_loadu_si128((const __m128i *)(b + b_stride));
+  const __m128i v_b_lo = _mm_loadu_si128((const __m128i *)b);
+  const __m256i v_b_w =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(v_b_lo), v_b_hi, 1);
+  const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
+}
+
+int64_t vpx_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8,
+                            int b_stride, int width, int height) {
+  int32_t y = 0;
+  int64_t sse = 0;
+  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  __m256i sum = _mm256_setzero_si256();
+  switch (width) {
+    case 4:
+      do {
+        highbd_sse_w4x4_avx2(&sum, a, a_stride, b, b_stride);
+        a += a_stride << 2;
+        b += b_stride << 2;
+        y += 4;
+      } while (y < height);
+      sse = summary_all_avx2(&sum);
+      break;
+    case 8:
+      do {
+        highbd_sse_w8x2_avx2(&sum, a, a_stride, b, b_stride);
+        a += a_stride << 1;
+        b += b_stride << 1;
+        y += 2;
+      } while (y < height);
+      sse = summary_all_avx2(&sum);
+      break;
+    case 16:
+      do {
+        highbd_sse_w16_avx2(&sum, a, b);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_avx2(&sum);
+      break;
+    case 32:
+      do {
+        int l = 0;
+        __m256i sum32 = _mm256_setzero_si256();
+        do {
+          highbd_sse_w16_avx2(&sum32, a, b);
+          highbd_sse_w16_avx2(&sum32, a + 16, b + 16);
+          a += a_stride;
+          b += b_stride;
+          l += 1;
+        } while (l < 64 && l < (height - y));
+        summary_32_avx2(&sum32, &sum);
+        y += 64;
+      } while (y < height);
+      sse = summary_4x64_avx2(sum);
+      break;
+    case 64:
+      do {
+        int l = 0;
+        __m256i sum32 = _mm256_setzero_si256();
+        do {
+          highbd_sse_w16_avx2(&sum32, a, b);
+          highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1);
+          highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2);
+          highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3);
+          a += a_stride;
+          b += b_stride;
+          l += 1;
+        } while (l < 32 && l < (height - y));
+        summary_32_avx2(&sum32, &sum);
+        y += 32;
+      } while (y < height);
+      sse = summary_4x64_avx2(sum);
+      break;
+    default:
+      if (width & 0x7) {
+        do {
+          int i = 0;
+          __m256i sum32 = _mm256_setzero_si256();
+          do {
+            const uint16_t *a2;
+            const uint16_t *b2;
+            highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride);
+            a2 = a + i + (a_stride << 1);
+            b2 = b + i + (b_stride << 1);
+            highbd_sse_w8x2_avx2(&sum32, a2, a_stride, b2, b_stride);
+            i += 8;
+          } while (i + 4 < width);
+          highbd_sse_w4x4_avx2(&sum32, a + i, a_stride, b + i, b_stride);
+          summary_32_avx2(&sum32, &sum);
+          a += a_stride << 2;
+          b += b_stride << 2;
+          y += 4;
+        } while (y < height);
+      } else {
+        do {
+          int l = 0;
+          __m256i sum32 = _mm256_setzero_si256();
+          do {
+            int i = 0;
+            do {
+              highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride);
+              i += 8;
+            } while (i < width);
+            a += a_stride << 1;
+            b += b_stride << 1;
+            l += 2;
+          } while (l < 8 && l < (height - y));
+          summary_32_avx2(&sum32, &sum);
+          y += 8;
+        } while (y < height);
+      }
+      sse = summary_4x64_avx2(sum);
+      break;
+  }
+  return sse;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/x86/sse_sse4.c b/vpx_dsp/x86/sse_sse4.c
new file mode 100644
index 0000000000..4a7585c57e
--- /dev/null
+++ b/vpx_dsp/x86/sse_sse4.c
@@ -0,0 +1,312 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/mem_sse2.h"
+
+static INLINE int64_t summary_all_sse4(const __m128i *sum_all) {
+  int64_t sum;
+  const __m128i sum0 = _mm_cvtepu32_epi64(*sum_all);
+  const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum_all, 8));
+  const __m128i sum_2x64 = _mm_add_epi64(sum0, sum1);
+  const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+  _mm_storel_epi64((__m128i *)&sum, sum_1x64);
+  return sum;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void summary_32_sse4(const __m128i *sum32, __m128i *sum64) {
+  const __m128i sum0 = _mm_cvtepu32_epi64(*sum32);
+  const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum32, 8));
+  *sum64 = _mm_add_epi64(sum0, *sum64);
+  *sum64 = _mm_add_epi64(sum1, *sum64);
+}
+#endif
+
+static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a,
+                                  const uint8_t *b) {
+  const __m128i v_a0 = _mm_loadu_si128((const __m128i *)a);
+  const __m128i v_b0 = _mm_loadu_si128((const __m128i *)b);
+  const __m128i v_a00_w = _mm_cvtepu8_epi16(v_a0);
+  const __m128i v_a01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_a0, 8));
+  const __m128i v_b00_w = _mm_cvtepu8_epi16(v_b0);
+  const __m128i v_b01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_b0, 8));
+  const __m128i v_d00_w = _mm_sub_epi16(v_a00_w, v_b00_w);
+  const __m128i v_d01_w = _mm_sub_epi16(v_a01_w, v_b01_w);
+  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d00_w, v_d00_w));
+  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w));
+}
+
+static INLINE void sse4x2_sse4_1(const uint8_t *a, int a_stride,
+                                 const uint8_t *b, int b_stride, __m128i *sum) {
+  const __m128i v_a0 = load_unaligned_u32(a);
+  const __m128i v_a1 = load_unaligned_u32(a + a_stride);
+  const __m128i v_b0 = load_unaligned_u32(b);
+  const __m128i v_b1 = load_unaligned_u32(b + b_stride);
+  const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1));
+  const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1));
+  const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
+}
+
+static INLINE void sse8_sse4_1(const uint8_t *a, const uint8_t *b,
+                               __m128i *sum) {
+  const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a);
+  const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b);
+  const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0);
+  const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0);
+  const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
+}
+
+int64_t vpx_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b,
+                       int b_stride, int width, int height) {
+  int y = 0;
+  int64_t sse = 0;
+  __m128i sum = _mm_setzero_si128();
+  switch (width) {
+    case 4:
+      do {
+        sse4x2_sse4_1(a, a_stride, b, b_stride, &sum);
+        a += a_stride << 1;
+        b += b_stride << 1;
+        y += 2;
+      } while (y < height);
+      sse = summary_all_sse4(&sum);
+      break;
+    case 8:
+      do {
+        sse8_sse4_1(a, b, &sum);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_sse4(&sum);
+      break;
+    case 16:
+      do {
+        sse_w16_sse4_1(&sum, a, b);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_sse4(&sum);
+      break;
+    case 32:
+      do {
+        sse_w16_sse4_1(&sum, a, b);
+        sse_w16_sse4_1(&sum, a + 16, b + 16);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_sse4(&sum);
+      break;
+    case 64:
+      do {
+        sse_w16_sse4_1(&sum, a, b);
+        sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1);
+        sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2);
+        sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_sse4(&sum);
+      break;
+    default:
+      if (width & 0x07) {
+        do {
+          int i = 0;
+          do {
+            sse8_sse4_1(a + i, b + i, &sum);
+            sse8_sse4_1(a + i + a_stride, b + i + b_stride, &sum);
+            i += 8;
+          } while (i + 4 < width);
+          sse4x2_sse4_1(a + i, a_stride, b + i, b_stride, &sum);
+          a += (a_stride << 1);
+          b += (b_stride << 1);
+          y += 2;
+        } while (y < height);
+      } else {
+        do {
+          int i = 0;
+          do {
+            sse8_sse4_1(a + i, b + i, &sum);
+            i += 8;
+          } while (i < width);
+          a += a_stride;
+          b += b_stride;
+          y += 1;
+        } while (y < height);
+      }
+      sse = summary_all_sse4(&sum);
+      break;
+  }
+
+  return sse;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void highbd_sse_w4x2_sse4_1(__m128i *sum, const uint16_t *a,
+                                          int a_stride, const uint16_t *b,
+                                          int b_stride) {
+  const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a);
+  const __m128i v_a1 = _mm_loadl_epi64((const __m128i *)(a + a_stride));
+  const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b);
+  const __m128i v_b1 = _mm_loadl_epi64((const __m128i *)(b + b_stride));
+  const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1);
+  const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1);
+  const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
+}
+
+static INLINE void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a,
+                                        const uint16_t *b) {
+  const __m128i v_a_w = _mm_loadu_si128((const __m128i *)a);
+  const __m128i v_b_w = _mm_loadu_si128((const __m128i *)b);
+  const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
+}
+
+int64_t vpx_highbd_sse_sse4_1(const uint8_t *a8, int a_stride,
+                              const uint8_t *b8, int b_stride, int width,
+                              int height) {
+  int32_t y = 0;
+  int64_t sse = 0;
+  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  __m128i sum = _mm_setzero_si128();
+  switch (width) {
+    case 4:
+      do {
+        highbd_sse_w4x2_sse4_1(&sum, a, a_stride, b, b_stride);
+        a += a_stride << 1;
+        b += b_stride << 1;
+        y += 2;
+      } while (y < height);
+      sse = summary_all_sse4(&sum);
+      break;
+    case 8:
+      do {
+        highbd_sse_w8_sse4_1(&sum, a, b);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_sse4(&sum);
+      break;
+    case 16:
+      do {
+        int l = 0;
+        __m128i sum32 = _mm_setzero_si128();
+        do {
+          highbd_sse_w8_sse4_1(&sum32, a, b);
+          highbd_sse_w8_sse4_1(&sum32, a + 8, b + 8);
+          a += a_stride;
+          b += b_stride;
+          l += 1;
+        } while (l < 64 && l < (height - y));
+        summary_32_sse4(&sum32, &sum);
+        y += 64;
+      } while (y < height);
+      _mm_storel_epi64((__m128i *)&sse,
+                       _mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
+      break;
+    case 32:
+      do {
+        int l = 0;
+        __m128i sum32 = _mm_setzero_si128();
+        do {
+          highbd_sse_w8_sse4_1(&sum32, a, b);
+          highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1);
+          highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2);
+          highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3);
+          a += a_stride;
+          b += b_stride;
+          l += 1;
+        } while (l < 32 && l < (height - y));
+        summary_32_sse4(&sum32, &sum);
+        y += 32;
+      } while (y < height);
+      _mm_storel_epi64((__m128i *)&sse,
+                       _mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
+      break;
+    case 64:
+      do {
+        int l = 0;
+        __m128i sum32 = _mm_setzero_si128();
+        do {
+          highbd_sse_w8_sse4_1(&sum32, a, b);
+          highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1);
+          highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2);
+          highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3);
+          highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4);
+          highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5);
+          highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6);
+          highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7);
+          a += a_stride;
+          b += b_stride;
+          l += 1;
+        } while (l < 16 && l < (height - y));
+        summary_32_sse4(&sum32, &sum);
+        y += 16;
+      } while (y < height);
+      _mm_storel_epi64((__m128i *)&sse,
+                       _mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
+      break;
+    default:
+      if (width & 0x7) {
+        do {
+          __m128i sum32 = _mm_setzero_si128();
+          int i = 0;
+          do {
+            highbd_sse_w8_sse4_1(&sum32, a + i, b + i);
+            highbd_sse_w8_sse4_1(&sum32, a + i + a_stride, b + i + b_stride);
+            i += 8;
+          } while (i + 4 < width);
+          highbd_sse_w4x2_sse4_1(&sum32, a + i, a_stride, b + i, b_stride);
+          a += (a_stride << 1);
+          b += (b_stride << 1);
+          y += 2;
+          summary_32_sse4(&sum32, &sum);
+        } while (y < height);
+      } else {
+        do {
+          int l = 0;
+          __m128i sum32 = _mm_setzero_si128();
+          do {
+            int i = 0;
+            do {
+              highbd_sse_w8_sse4_1(&sum32, a + i, b + i);
+              i += 8;
+            } while (i < width);
+            a += a_stride;
+            b += b_stride;
+            l += 1;
+          } while (l < 8 && l < (height - y));
+          summary_32_sse4(&sum32, &sum);
+          y += 8;
+        } while (y < height);
+      }
+      _mm_storel_epi64((__m128i *)&sse,
+                       _mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
+      break;
+  }
+  return sse;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/x86/subtract_avx2.c b/vpx_dsp/x86/subtract_avx2.c
new file mode 100644
index 0000000000..4849581ed4
--- /dev/null
+++ b/vpx_dsp/x86/subtract_avx2.c
@@ -0,0 +1,203 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+static VPX_FORCE_INLINE void subtract32_avx2(int16_t *diff_ptr,
+                                             const uint8_t *src_ptr,
+                                             const uint8_t *pred_ptr) {
+  const __m256i s = _mm256_lddqu_si256((const __m256i *)src_ptr);
+  const __m256i p = _mm256_lddqu_si256((const __m256i *)pred_ptr);
+  const __m256i s_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s));
+  const __m256i s_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s, 1));
+  const __m256i p_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(p));
+  const __m256i p_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(p, 1));
+  const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
+  const __m256i d_1 = _mm256_sub_epi16(s_1, p_1);
+  _mm256_storeu_si256((__m256i *)diff_ptr, d_0);
+  _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d_1);
+}
+
+static VPX_FORCE_INLINE void subtract_block_16xn_avx2(
+    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+  int j;
+  for (j = 0; j < rows; ++j) {
+    const __m128i s = _mm_lddqu_si128((const __m128i *)src_ptr);
+    const __m128i p = _mm_lddqu_si128((const __m128i *)pred_ptr);
+    const __m256i s_0 = _mm256_cvtepu8_epi16(s);
+    const __m256i p_0 = _mm256_cvtepu8_epi16(p);
+    const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
+    _mm256_storeu_si256((__m256i *)diff_ptr, d_0);
+    src_ptr += src_stride;
+    pred_ptr += pred_stride;
+    diff_ptr += diff_stride;
+  }
+}
+
+static VPX_FORCE_INLINE void subtract_block_32xn_avx2(
+    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+  int j;
+  for (j = 0; j < rows; ++j) {
+    subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+    src_ptr += src_stride;
+    pred_ptr += pred_stride;
+    diff_ptr += diff_stride;
+  }
+}
+
+static VPX_FORCE_INLINE void subtract_block_64xn_avx2(
+    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+  int j;
+  for (j = 0; j < rows; ++j) {
+    subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+    subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32);
+    src_ptr += src_stride;
+    pred_ptr += pred_stride;
+    diff_ptr += diff_stride;
+  }
+}
+
+void vpx_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr,
+                             ptrdiff_t diff_stride, const uint8_t *src_ptr,
+                             ptrdiff_t src_stride, const uint8_t *pred_ptr,
+                             ptrdiff_t pred_stride) {
+  switch (cols) {
+    case 16:
+      subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+                               pred_ptr, pred_stride);
+      break;
+    case 32:
+      subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+                               pred_ptr, pred_stride);
+      break;
+    case 64:
+      subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+                               pred_ptr, pred_stride);
+      break;
+    default:
+      vpx_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr,
+                              src_stride, pred_ptr, pred_stride);
+      break;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr,
+                                    ptrdiff_t diff_stride,
+                                    const uint8_t *src8_ptr,
+                                    ptrdiff_t src_stride,
+                                    const uint8_t *pred8_ptr,
+                                    ptrdiff_t pred_stride, int bd) {
+  uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8_ptr);
+  uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(pred8_ptr);
+  (void)bd;
+  if (cols == 64) {
+    int j = rows;
+    do {
+      const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr);
+      const __m256i s1 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 16));
+      const __m256i s2 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 32));
+      const __m256i s3 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 48));
+      const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr);
+      const __m256i p1 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 16));
+      const __m256i p2 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 32));
+      const __m256i p3 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 48));
+      const __m256i d0 = _mm256_sub_epi16(s0, p0);
+      const __m256i d1 = _mm256_sub_epi16(s1, p1);
+      const __m256i d2 = _mm256_sub_epi16(s2, p2);
+      const __m256i d3 = _mm256_sub_epi16(s3, p3);
+      _mm256_storeu_si256((__m256i *)diff_ptr, d0);
+      _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d1);
+      _mm256_storeu_si256((__m256i *)(diff_ptr + 32), d2);
+      _mm256_storeu_si256((__m256i *)(diff_ptr + 48), d3);
+      src_ptr += src_stride;
+      pred_ptr += pred_stride;
+      diff_ptr += diff_stride;
+    } while (--j != 0);
+  } else if (cols == 32) {
+    int j = rows;
+    do {
+      const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr);
+      const __m256i s1 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 16));
+      const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr);
+      const __m256i p1 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 16));
+      const __m256i d0 = _mm256_sub_epi16(s0, p0);
+      const __m256i d1 = _mm256_sub_epi16(s1, p1);
+      _mm256_storeu_si256((__m256i *)diff_ptr, d0);
+      _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d1);
+      src_ptr += src_stride;
+      pred_ptr += pred_stride;
+      diff_ptr += diff_stride;
+    } while (--j != 0);
+  } else if (cols == 16) {
+    int j = rows;
+    do {
+      const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr);
+      const __m256i s1 =
+          _mm256_lddqu_si256((const __m256i *)(src_ptr + src_stride));
+      const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr);
+      const __m256i p1 =
+          _mm256_lddqu_si256((const __m256i *)(pred_ptr + pred_stride));
+      const __m256i d0 = _mm256_sub_epi16(s0, p0);
+      const __m256i d1 = _mm256_sub_epi16(s1, p1);
+      _mm256_storeu_si256((__m256i *)diff_ptr, d0);
+      _mm256_storeu_si256((__m256i *)(diff_ptr + diff_stride), d1);
+      src_ptr += src_stride << 1;
+      pred_ptr += pred_stride << 1;
+      diff_ptr += diff_stride << 1;
+      j -= 2;
+    } while (j != 0);
+  } else if (cols == 8) {
+    int j = rows;
+    do {
+      const __m128i s0 = _mm_lddqu_si128((const __m128i *)src_ptr);
+      const __m128i s1 =
+          _mm_lddqu_si128((const __m128i *)(src_ptr + src_stride));
+      const __m128i p0 = _mm_lddqu_si128((const __m128i *)pred_ptr);
+      const __m128i p1 =
+          _mm_lddqu_si128((const __m128i *)(pred_ptr + pred_stride));
+      const __m128i d0 = _mm_sub_epi16(s0, p0);
+      const __m128i d1 = _mm_sub_epi16(s1, p1);
+      _mm_storeu_si128((__m128i *)diff_ptr, d0);
+      _mm_storeu_si128((__m128i *)(diff_ptr + diff_stride), d1);
+      src_ptr += src_stride << 1;
+      pred_ptr += pred_stride << 1;
+      diff_ptr += diff_stride << 1;
+      j -= 2;
+    } while (j != 0);
+  } else {
+    int j = rows;
+    assert(cols == 4);
+    do {
+      const __m128i s0 = _mm_loadl_epi64((const __m128i *)src_ptr);
+      const __m128i s1 =
+          _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride));
+      const __m128i p0 = _mm_loadl_epi64((const __m128i *)pred_ptr);
+      const __m128i p1 =
+          _mm_loadl_epi64((const __m128i *)(pred_ptr + pred_stride));
+      const __m128i d0 = _mm_sub_epi16(s0, p0);
+      const __m128i d1 = _mm_sub_epi16(s1, p1);
+      _mm_storel_epi64((__m128i *)diff_ptr, d0);
+      _mm_storel_epi64((__m128i *)(diff_ptr + diff_stride), d1);
+      src_ptr += src_stride << 1;
+      pred_ptr += pred_stride << 1;
+      diff_ptr += diff_stride << 1;
+      j -= 2;
+    } while (j != 0);
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/x86/subtract_sse2.asm b/vpx_dsp/x86/subtract_sse2.asm
index 4273efb854..e3055ab292 100644
--- a/vpx_dsp/x86/subtract_sse2.asm
+++ b/vpx_dsp/x86/subtract_sse2.asm
@@ -124,4 +124,5 @@ INIT_MMX
   lea                predq, [predq+pred_str*2]
   sub                rowsd, 2
   jg .loop_4
+  emms
   RET
diff --git a/vpx_dsp/x86/sum_squares_sse2.c b/vpx_dsp/x86/sum_squares_sse2.c
index 14f3b35c01..df6514b2c4 100644
--- a/vpx_dsp/x86/sum_squares_sse2.c
+++ b/vpx_dsp/x86/sum_squares_sse2.c
@@ -33,7 +33,7 @@ uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size) {
   } else {
     // Generic case
     int r = size;
-    const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
+    const __m128i v_zext_mask_q = _mm_set_epi32(0, -1, 0, -1);
     __m128i v_acc_q = _mm_setzero_si128();
 
     assert(size % 8 == 0);
diff --git a/vpx_dsp/x86/variance_avx2.c b/vpx_dsp/x86/variance_avx2.c
index 9232acbfbb..8305b9f20f 100644
--- a/vpx_dsp/x86/variance_avx2.c
+++ b/vpx_dsp/x86/variance_avx2.c
@@ -98,6 +98,41 @@ static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) {
   return _mm256_add_epi32(sum_lo, sum_hi);
 }
 
+static INLINE void variance8_kernel_avx2(
+    const uint8_t *const src, const int src_stride, const uint8_t *const ref,
+    const int ref_stride, __m256i *const sse, __m256i *const sum) {
+  __m128i src0, src1, ref0, ref1;
+  __m256i ss, rr, diff;
+
+  // 0 0 0.... 0 s07 s06 s05 s04 s03 s02 s01 s00
+  src0 = _mm_loadl_epi64((const __m128i *)(src + 0 * src_stride));
+
+  // 0 0 0.... 0 s17 s16 s15 s14 s13 s12 s11 s10
+  src1 = _mm_loadl_epi64((const __m128i *)(src + 1 * src_stride));
+
+  // s17 s16...s11 s10 s07 s06...s01 s00 (8bit)
+  src0 = _mm_unpacklo_epi64(src0, src1);
+
+  // s17 s16...s11 s10 s07 s06...s01 s00 (16 bit)
+  ss = _mm256_cvtepu8_epi16(src0);
+
+  // 0 0 0.... 0 r07 r06 r05 r04 r03 r02 r01 r00
+  ref0 = _mm_loadl_epi64((const __m128i *)(ref + 0 * ref_stride));
+
+  // 0 0 0.... 0 r17 r16 0 r15 0 r14 0 r13 0 r12 0 r11 0 r10
+  ref1 = _mm_loadl_epi64((const __m128i *)(ref + 1 * ref_stride));
+
+  // r17 r16...r11 r10 r07 r06...r01 r00 (8 bit)
+  ref0 = _mm_unpacklo_epi64(ref0, ref1);
+
+  // r17 r16...r11 r10 r07 r06...r01 r00 (16 bit)
+  rr = _mm256_cvtepu8_epi16(ref0);
+
+  diff = _mm256_sub_epi16(ss, rr);
+  *sse = _mm256_add_epi32(*sse, _mm256_madd_epi16(diff, diff));
+  *sum = _mm256_add_epi16(*sum, diff);
+}
+
 static INLINE void variance16_kernel_avx2(
     const uint8_t *const src, const int src_stride, const uint8_t *const ref,
     const int ref_stride, __m256i *const sse, __m256i *const sum) {
@@ -119,6 +154,21 @@ static INLINE void variance32_kernel_avx2(const uint8_t *const src,
   variance_kernel_avx2(s, r, sse, sum);
 }
 
+static INLINE void variance8_avx2(const uint8_t *src, const int src_stride,
+                                  const uint8_t *ref, const int ref_stride,
+                                  const int h, __m256i *const vsse,
+                                  __m256i *const vsum) {
+  int i;
+  *vsum = _mm256_setzero_si256();
+  *vsse = _mm256_setzero_si256();
+
+  for (i = 0; i < h; i += 2) {
+    variance8_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum);
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
+  }
+}
+
 static INLINE void variance16_avx2(const uint8_t *src, const int src_stride,
                                    const uint8_t *ref, const int ref_stride,
                                    const int h, __m256i *const vsse,
@@ -590,17 +640,20 @@ static INLINE int sub_pix_var32xh(const uint8_t *src, int src_stride,
   return sum;
 }
 
-static unsigned int sub_pixel_variance32xh_avx2(
-    const uint8_t *src, int src_stride, int x_offset, int y_offset,
-    const uint8_t *dst, int dst_stride, int height, unsigned int *sse) {
+static int sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
+                                       int x_offset, int y_offset,
+                                       const uint8_t *dst, int dst_stride,
+                                       int height, unsigned int *sse) {
   return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride,
                          NULL, 0, 0, height, sse);
 }
 
-static unsigned int sub_pixel_avg_variance32xh_avx2(
-    const uint8_t *src, int src_stride, int x_offset, int y_offset,
-    const uint8_t *dst, int dst_stride, const uint8_t *second_pred,
-    int second_stride, int height, unsigned int *sse) {
+static int sub_pixel_avg_variance32xh_avx2(const uint8_t *src, int src_stride,
+                                           int x_offset, int y_offset,
+                                           const uint8_t *dst, int dst_stride,
+                                           const uint8_t *second_pred,
+                                           int second_stride, int height,
+                                           unsigned int *sse) {
   return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride,
                          second_pred, second_stride, 1, height, sse);
 }
@@ -609,6 +662,36 @@ typedef void (*get_var_avx2)(const uint8_t *src_ptr, int src_stride,
                              const uint8_t *ref_ptr, int ref_stride,
                              unsigned int *sse, int *sum);
 
+unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride,
+                                  const uint8_t *ref_ptr, int ref_stride,
+                                  unsigned int *sse) {
+  __m256i vsse, vsum;
+  int sum;
+  variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &vsse, &vsum);
+  variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
+  return *sse - ((sum * sum) >> 5);
+}
+
+unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride,
+                                  const uint8_t *ref_ptr, int ref_stride,
+                                  unsigned int *sse) {
+  __m256i vsse, vsum;
+  int sum;
+  variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum);
+  variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
+  return *sse - ((sum * sum) >> 6);
+}
+
+unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride,
+                                   const uint8_t *ref_ptr, int ref_stride,
+                                   unsigned int *sse) {
+  __m256i vsse, vsum;
+  int sum;
+  variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
+  variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
+  return *sse - ((sum * sum) >> 7);
+}
+
 unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int src_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    unsigned int *sse) {
diff --git a/vpx_dsp/x86/variance_sse2.c b/vpx_dsp/x86/variance_sse2.c
index 37ef64ecaa..d6eb12da1a 100644
--- a/vpx_dsp/x86/variance_sse2.c
+++ b/vpx_dsp/x86/variance_sse2.c
@@ -19,7 +19,7 @@
 static INLINE unsigned int add32x4_sse2(__m128i val) {
   val = _mm_add_epi32(val, _mm_srli_si128(val, 8));
   val = _mm_add_epi32(val, _mm_srli_si128(val, 4));
-  return _mm_cvtsi128_si32(val);
+  return (unsigned int)_mm_cvtsi128_si32(val);
 }
 
 unsigned int vpx_get_mb_ss_sse2(const int16_t *src_ptr) {
@@ -36,8 +36,8 @@ unsigned int vpx_get_mb_ss_sse2(const int16_t *src_ptr) {
 }
 
 static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) {
-  const __m128i p0 = _mm_cvtsi32_si128(loadu_uint32(p + 0 * stride));
-  const __m128i p1 = _mm_cvtsi32_si128(loadu_uint32(p + 1 * stride));
+  const __m128i p0 = _mm_cvtsi32_si128(loadu_int32(p + 0 * stride));
+  const __m128i p1 = _mm_cvtsi32_si128(loadu_int32(p + 1 * stride));
   const __m128i p01 = _mm_unpacklo_epi32(p0, p1);
   return _mm_unpacklo_epi8(p01, _mm_setzero_si128());
 }
@@ -85,7 +85,7 @@ static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum,
   vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
   vsum = _mm_unpacklo_epi16(vsum, vsum);
   vsum = _mm_srai_epi32(vsum, 16);
-  *sum = add32x4_sse2(vsum);
+  *sum = (int)add32x4_sse2(vsum);
 }
 
 static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) {
@@ -97,7 +97,7 @@ static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) {
 // Can handle 1024 pixels' diff sum (such as 32x32)
 static INLINE int sum_final_sse2(const __m128i sum) {
   const __m128i t = sum_to_32bit_sse2(sum);
-  return add32x4_sse2(t);
+  return (int)add32x4_sse2(t);
 }
 
 static INLINE void variance4_sse2(const uint8_t *src_ptr, const int src_stride,
@@ -349,7 +349,7 @@ unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride,
     vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16));
   }
   *sse = add32x4_sse2(vsse);
-  sum = add32x4_sse2(vsum);
+  sum = (int)add32x4_sse2(vsum);
   return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
 }
 
@@ -369,7 +369,7 @@ unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride,
     vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16));
   }
   *sse = add32x4_sse2(vsse);
-  sum = add32x4_sse2(vsum);
+  sum = (int)add32x4_sse2(vsum);
   return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
 }
 
@@ -389,7 +389,7 @@ unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride,
     vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16));
   }
   *sse = add32x4_sse2(vsse);
-  sum = add32x4_sse2(vsum);
+  sum = (int)add32x4_sse2(vsum);
   return *sse - (unsigned int)(((int64_t)sum * sum) >> 12);
 }
 
@@ -471,23 +471,23 @@ DECLS(ssse3, ssse3);
            (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));    \
   }
 
-#define FNS(opt1, opt2)                              \
-  FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t));  \
-  FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t));  \
-  FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t));  \
-  FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t));  \
-  FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t));  \
-  FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t));  \
-  FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t));   \
-  FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t));    \
-  FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t));     \
-  FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t));     \
-  FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t));     \
+#define FNS(opt1, opt2)                             \
+  FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t))  \
+  FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t))  \
+  FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t))  \
+  FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t))  \
+  FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t))  \
+  FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t))  \
+  FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)) \
+  FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t))   \
+  FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t))    \
+  FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t))     \
+  FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t))     \
+  FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t))     \
   FN(4, 4, 4, 2, 2, opt1, (int32_t), (int32_t))
 
-FNS(sse2, sse2);
-FNS(ssse3, ssse3);
+FNS(sse2, sse2)
+FNS(ssse3, ssse3)
 
 #undef FNS
 #undef FN
@@ -543,23 +543,23 @@ DECLS(ssse3, ssse3);
            (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));    \
   }
 
-#define FNS(opt1, opt2)                              \
-  FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t));  \
-  FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t));  \
-  FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t));  \
-  FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t));  \
-  FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t));  \
-  FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t));  \
-  FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t));  \
-  FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t));   \
-  FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t));    \
-  FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t));    \
-  FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t));    \
+#define FNS(opt1, opt2)                             \
+  FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t))  \
+  FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t))  \
+  FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t))  \
+  FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t))  \
+  FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t))  \
+  FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t))  \
+  FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)) \
+  FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t))  \
+  FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t))   \
+  FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t))    \
+  FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t))    \
+  FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t))    \
   FN(4, 4, 4, 2, 2, opt1, (uint32_t), (int32_t))
 
-FNS(sse2, sse);
-FNS(ssse3, ssse3);
+FNS(sse2, sse)
+FNS(ssse3, ssse3)
 
 #undef FNS
 #undef FN
diff --git a/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c b/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
index 2391790284..21a35ae3c3 100644
--- a/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
+++ b/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
@@ -485,7 +485,7 @@ static void vpx_filter_block1d4_h4_sse2(const uint8_t *src_ptr,
     // Saturate and convert to 8-bit words
     dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());
 
-    *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first);
+    *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first);
 
     src_ptr += src_stride;
     dst_ptr += dst_stride;
@@ -589,8 +589,8 @@ static void vpx_filter_block1d4_v4_sse2(const uint8_t *src_ptr,
     res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, reg_zero);
 
     // Save only half of the register (8 words)
-    *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(res_reg_m1012);
-    *((uint32_t *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(res_reg_0123);
+    *((int *)(dst_ptr)) = _mm_cvtsi128_si32(res_reg_m1012);
+    *((int *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(res_reg_0123);
 
     // Update the source by two rows
     src_ptr += src_stride_unrolled;
@@ -1040,12 +1040,12 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2;
 //                                  const InterpKernel *filter, int x0_q4,
 //                                  int32_t x_step_q4, int y0_q4, int y_step_q4,
 //                                  int w, int h);
-FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0)
 FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - (num_taps / 2 - 1) * src_stride, ,
-            sse2, 0);
-FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1);
+            sse2, 0)
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1)
 FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
-            src - (num_taps / 2 - 1) * src_stride, avg_, sse2, 1);
+            src - (num_taps / 2 - 1) * src_stride, avg_, sse2, 1)
 
 // void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
 //                         uint8_t *dst, ptrdiff_t dst_stride,
@@ -1057,8 +1057,8 @@ FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
 //                             const InterpKernel *filter, int x0_q4,
 //                             int32_t x_step_q4, int y0_q4, int y_step_q4,
 //                             int w, int h);
-FUN_CONV_2D(, sse2, 0);
-FUN_CONV_2D(avg_, sse2, 1);
+FUN_CONV_2D(, sse2, 0)
+FUN_CONV_2D(avg_, sse2, 1)
 
 #if CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64
 // From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
@@ -1139,12 +1139,12 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2;
 //                                         const int16_t *filter_y,
 //                                         int y_step_q4,
 //                                         int w, int h, int bd);
-HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0);
+HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0)
 HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v,
-                 src - src_stride * (num_taps / 2 - 1), , sse2, 0);
-HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1);
+                 src - src_stride * (num_taps / 2 - 1), , sse2, 0)
+HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1)
 HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
-                 src - src_stride * (num_taps / 2 - 1), avg_, sse2, 1);
+                 src - src_stride * (num_taps / 2 - 1), avg_, sse2, 1)
 
 // void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
 //                                uint8_t *dst, ptrdiff_t dst_stride,
@@ -1156,6 +1156,6 @@ HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
 //                                    const InterpKernel *filter, int x0_q4,
 //                                    int32_t x_step_q4, int y0_q4,
 //                                    int y_step_q4, int w, int h, int bd);
-HIGH_FUN_CONV_2D(, sse2, 0);
-HIGH_FUN_CONV_2D(avg_, sse2, 1);
+HIGH_FUN_CONV_2D(, sse2, 0)
+HIGH_FUN_CONV_2D(avg_, sse2, 1)
 #endif  // CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
index 1eaa19bfc5..526c283823 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -15,6 +15,7 @@
 #include "vpx_dsp/x86/convolve.h"
 #include "vpx_dsp/x86/convolve_avx2.h"
 #include "vpx_dsp/x86/convolve_sse2.h"
+#include "vpx_dsp/x86/convolve_ssse3.h"
 #include "vpx_ports/mem.h"
 
 // filters for 16_h8
@@ -38,6 +39,27 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
   6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
 };
 
+DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[64]) = {
+  0, 1, 2, 3,  1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3,  1, 2,
+  3, 4, 2, 3,  4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7,  8, 9,
+  7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
+};
+
+#define CALC_CONVOLVE8_HORZ_ROW                                               \
+  srcReg = mm256_loadu2_si128(src_ptr - 3, src_ptr - 3 + src_pitch);          \
+  s1[0] = _mm256_shuffle_epi8(srcReg, filt[0]);                               \
+  s1[1] = _mm256_shuffle_epi8(srcReg, filt[1]);                               \
+  s1[2] = _mm256_shuffle_epi8(srcReg, filt[2]);                               \
+  s1[3] = _mm256_shuffle_epi8(srcReg, filt[3]);                               \
+  s1[0] = convolve8_16_avx2(s1, f1);                                          \
+  s1[0] = _mm256_packus_epi16(s1[0], s1[0]);                                  \
+  src_ptr += src_stride;                                                      \
+  _mm_storel_epi64((__m128i *)&output_ptr[0], _mm256_castsi256_si128(s1[0])); \
+  output_ptr += output_pitch;                                                 \
+  _mm_storel_epi64((__m128i *)&output_ptr[0],                                 \
+                   _mm256_extractf128_si256(s1[0], 1));                       \
+  output_ptr += output_pitch;
+
 static INLINE void vpx_filter_block1d16_h8_x_avx2(
     const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
     ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter,
@@ -61,12 +83,7 @@ static INLINE void vpx_filter_block1d16_h8_x_avx2(
     __m256i srcReg;
 
     // load the 2 strides of source
-    srcReg =
-        _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3)));
-    srcReg = _mm256_inserti128_si256(
-        srcReg,
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)),
-        1);
+    srcReg = mm256_loadu2_si128(src_ptr - 3, src_ptr + src_pixels_per_line - 3);
 
     // filter the source buffer
     s[0] = _mm256_shuffle_epi8(srcReg, filt[0]);
@@ -77,12 +94,7 @@ static INLINE void vpx_filter_block1d16_h8_x_avx2(
 
     // reading 2 strides of the next 16 bytes
     // (part of it was being read by earlier read)
-    srcReg =
-        _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5)));
-    srcReg = _mm256_inserti128_si256(
-        srcReg,
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)),
-        1);
+    srcReg = mm256_loadu2_si128(src_ptr + 5, src_ptr + src_pixels_per_line + 5);
 
     // filter the source buffer
     s[0] = _mm256_shuffle_epi8(srcReg, filt[0]);
@@ -97,60 +109,37 @@ static INLINE void vpx_filter_block1d16_h8_x_avx2(
 
     src_ptr += src_stride;
 
-    // average if necessary
-    outReg1 = _mm256_castsi256_si128(outReg32b1);
-    outReg2 = _mm256_extractf128_si256(outReg32b1, 1);
     if (avg) {
-      outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
-      outReg2 = _mm_avg_epu8(
-          outReg2, _mm_load_si128((__m128i *)(output_ptr + output_pitch)));
+      const __m256i outReg = mm256_loadu2_si128(
+          (__m128i *)output_ptr, (__m128i *)(output_ptr + output_pitch));
+      outReg32b1 = _mm256_avg_epu8(outReg32b1, outReg);
     }
-
-    // save 16 bytes
-    _mm_store_si128((__m128i *)output_ptr, outReg1);
-
-    // save the next 16 bits
-    _mm_store_si128((__m128i *)(output_ptr + output_pitch), outReg2);
-
+    mm256_store2_si128((__m128i *)output_ptr,
+                       (__m128i *)(output_ptr + output_pitch), &outReg32b1);
     output_ptr += dst_stride;
   }
 
   // if the number of strides is odd.
   // process only 16 bytes
   if (i > 0) {
-    __m128i srcReg;
-
-    // load the first 16 bytes of the last row
-    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+    const __m128i srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+    const __m128i srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
+    const __m256i srcReg =
+        _mm256_inserti128_si256(_mm256_castsi128_si256(srcReg1), srcReg2, 1);
 
     // filter the source buffer
-    s[0] = _mm256_castsi128_si256(
-        _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0])));
-    s[1] = _mm256_castsi128_si256(
-        _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1])));
-    s[2] = _mm256_castsi128_si256(
-        _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2])));
-    s[3] = _mm256_castsi128_si256(
-        _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3])));
-    outReg1 = convolve8_8_avx2(s, f);
-
-    // reading the next 16 bytes
-    // (part of it was being read by earlier read)
-    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
+    s[0] = _mm256_shuffle_epi8(srcReg, filt[0]);
+    s[1] = _mm256_shuffle_epi8(srcReg, filt[1]);
+    s[2] = _mm256_shuffle_epi8(srcReg, filt[2]);
+    s[3] = _mm256_shuffle_epi8(srcReg, filt[3]);
 
-    // filter the source buffer
-    s[0] = _mm256_castsi128_si256(
-        _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0])));
-    s[1] = _mm256_castsi128_si256(
-        _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1])));
-    s[2] = _mm256_castsi128_si256(
-        _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2])));
-    s[3] = _mm256_castsi128_si256(
-        _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3])));
-    outReg2 = convolve8_8_avx2(s, f);
+    // The low and high 128-bits of each lane contain the first and second
+    // convolve result respectively
+    outReg32b1 = convolve8_16_avx2(s, f);
+    outReg1 = _mm256_castsi256_si128(outReg32b1);
+    outReg2 = _mm256_extractf128_si256(outReg32b1, 1);
 
-    // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane
-    // contain the first and second convolve result respectively
+    // shrink to 8 bit each 16 bits
     outReg1 = _mm_packus_epi16(outReg1, outReg2);
 
     // average if necessary
@@ -177,11 +166,63 @@ static void vpx_filter_block1d16_h8_avg_avx2(
                                  output_height, filter, 1);
 }
 
+static void vpx_filter_block1d8_h8_avx2(
+    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+  __m256i filt[4], f1[4], s1[4], srcReg;
+  __m128i f[4], s[4];
+  int y = output_height;
+
+  // Multiply the size of the source stride by two
+  const ptrdiff_t src_stride = src_pitch << 1;
+
+  shuffle_filter_avx2(filter, f1);
+  filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
+  filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
+  filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
+  filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+
+  // Process next 4 rows
+  while (y > 3) {
+    CALC_CONVOLVE8_HORZ_ROW
+    CALC_CONVOLVE8_HORZ_ROW
+    y -= 4;
+  }
+
+  // If remaining, then process 2 rows at a time
+  while (y > 1) {
+    CALC_CONVOLVE8_HORZ_ROW
+    y -= 2;
+  }
+
+  // For the remaining height.
+  if (y > 0) {
+    const __m128i src_reg_128 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+    f[0] = _mm256_castsi256_si128(f1[0]);
+    f[1] = _mm256_castsi256_si128(f1[1]);
+    f[2] = _mm256_castsi256_si128(f1[2]);
+    f[3] = _mm256_castsi256_si128(f1[3]);
+
+    // filter the source buffer
+    s[0] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[0]));
+    s[1] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[1]));
+    s[2] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[2]));
+    s[3] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[3]));
+    s[0] = convolve8_8_ssse3(s, f);
+
+    // Saturate 16bit value to 8bit.
+    s[0] = _mm_packus_epi16(s[0], s[0]);
+
+    // Save only 8 bytes
+    _mm_storel_epi64((__m128i *)&output_ptr[0], s[0]);
+  }
+}
+
 static INLINE void vpx_filter_block1d16_v8_x_avx2(
     const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
     ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter,
     const int avg) {
-  __m128i outReg1, outReg2;
   __m256i srcRegHead1;
   unsigned int i;
   ptrdiff_t src_stride, dst_stride;
@@ -227,6 +268,9 @@ static INLINE void vpx_filter_block1d16_v8_x_avx2(
     s2[2] = _mm256_unpackhi_epi8(s32b[4], s32b[5]);
   }
 
+  // The output_height is always a multiple of two.
+  assert(!(output_height & 1));
+
   for (i = output_height; i > 1; i -= 2) {
     __m256i srcRegHead2, srcRegHead3;
 
@@ -257,19 +301,14 @@ static INLINE void vpx_filter_block1d16_v8_x_avx2(
     src_ptr += src_stride;
 
     // average if necessary
-    outReg1 = _mm256_castsi256_si128(s1[0]);
-    outReg2 = _mm256_extractf128_si256(s1[0], 1);
     if (avg) {
-      outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
-      outReg2 = _mm_avg_epu8(
-          outReg2, _mm_load_si128((__m128i *)(output_ptr + out_pitch)));
+      const __m256i outReg = mm256_loadu2_si128(
+          (__m128i *)output_ptr, (__m128i *)(output_ptr + out_pitch));
+      s1[0] = _mm256_avg_epu8(s1[0], outReg);
     }
 
-    // save 16 bytes
-    _mm_store_si128((__m128i *)output_ptr, outReg1);
-
-    // save the next 16 bits
-    _mm_store_si128((__m128i *)(output_ptr + out_pitch), outReg2);
+    mm256_store2_si128((__m128i *)output_ptr,
+                       (__m128i *)(output_ptr + out_pitch), s1);
 
     output_ptr += dst_stride;
 
@@ -282,35 +321,6 @@ static INLINE void vpx_filter_block1d16_v8_x_avx2(
     s2[2] = s2[3];
     srcRegHead1 = srcRegHead3;
   }
-
-  // if the number of strides is odd.
-  // process only 16 bytes
-  if (i > 0) {
-    // load the last 16 bytes
-    const __m128i srcRegHead2 =
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
-
-    // merge the last 2 results together
-    s1[0] = _mm256_castsi128_si256(
-        _mm_unpacklo_epi8(_mm256_castsi256_si128(srcRegHead1), srcRegHead2));
-    s2[0] = _mm256_castsi128_si256(
-        _mm_unpackhi_epi8(_mm256_castsi256_si128(srcRegHead1), srcRegHead2));
-
-    outReg1 = convolve8_8_avx2(s1, f);
-    outReg2 = convolve8_8_avx2(s2, f);
-
-    // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane
-    // contain the first and second convolve result respectively
-    outReg1 = _mm_packus_epi16(outReg1, outReg2);
-
-    // average if necessary
-    if (avg) {
-      outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
-    }
-
-    // save 16 bytes
-    _mm_store_si128((__m128i *)output_ptr, outReg1);
-  }
 }
 
 static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
@@ -560,9 +570,6 @@ static void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr,
   const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
   int h;
 
-  __m256i src_reg, src_reg_shift_0, src_reg_shift_2;
-  __m256i dst_reg;
-  __m256i tmp_0, tmp_1;
   __m256i idx_shift_0 =
       _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1,
                        2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
@@ -583,9 +590,11 @@ static void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr,
 
   for (h = height; h >= 2; h -= 2) {
     // Load the source
-    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
-    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
-    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+    const __m256i src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
+    __m256i dst_reg;
+    __m256i tmp_0, tmp_1;
+    const __m256i src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+    const __m256i src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
 
     // Get the output
     tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
@@ -606,9 +615,9 @@ static void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr,
 
   // Repeat for the last row if needed
   if (h > 0) {
-    __m128i src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+    const __m128i src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
     __m128i dst_reg;
-    const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
+    const __m128i reg_32_128 = _mm_set1_epi16(32);  // Used for rounding
     __m128i tmp_0, tmp_1;
 
     __m128i src_reg_shift_0 =
@@ -622,7 +631,7 @@ static void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr,
                               _mm256_castsi256_si128(kernel_reg_45));
     dst_reg = _mm_adds_epi16(tmp_0, tmp_1);
 
-    dst_reg = mm_round_epi16_sse2(&dst_reg, &reg_32, 6);
+    dst_reg = mm_round_epi16_sse2(&dst_reg, &reg_32_128, 6);
 
     dst_reg = _mm_packus_epi16(dst_reg, _mm_setzero_si128());
 
@@ -741,8 +750,6 @@ static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr,
   const ptrdiff_t unrolled_src_stride = src_stride << 1;
   const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
 
-  __m256i src_reg, src_reg_shuf;
-  __m256i dst;
   __m256i shuf_idx =
       _mm256_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2,
                        3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6);
@@ -759,12 +766,12 @@ static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr,
 
   for (h = height; h > 1; h -= 2) {
     // Load the source
-    src_reg = mm256_loadu2_epi64((const __m128i *)src_ptr,
-                                 (const __m128i *)(src_ptr + src_stride));
-    src_reg_shuf = _mm256_shuffle_epi8(src_reg, shuf_idx);
+    const __m256i src_reg = mm256_loadu2_epi64(
+        (const __m128i *)src_ptr, (const __m128i *)(src_ptr + src_stride));
+    const __m256i src_reg_shuf = _mm256_shuffle_epi8(src_reg, shuf_idx);
 
     // Get the result
-    dst = _mm256_maddubs_epi16(src_reg_shuf, kernel_reg);
+    __m256i dst = _mm256_maddubs_epi16(src_reg_shuf, kernel_reg);
     dst = _mm256_hadds_epi16(dst, _mm256_setzero_si256());
 
     // Round result
@@ -783,7 +790,7 @@ static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr,
 
   if (h > 0) {
     // Load the source
-    const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
+    const __m128i reg_32_128 = _mm_set1_epi16(32);  // Used for rounding
     __m128i src_reg = _mm_loadl_epi64((const __m128i *)src_ptr);
     __m128i src_reg_shuf =
         _mm_shuffle_epi8(src_reg, _mm256_castsi256_si128(shuf_idx));
@@ -794,11 +801,11 @@ static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr,
     dst = _mm_hadds_epi16(dst, _mm_setzero_si128());
 
     // Round result
-    dst = mm_round_epi16_sse2(&dst, &reg_32, 6);
+    dst = mm_round_epi16_sse2(&dst, &reg_32_128, 6);
 
     // Pack to 8-bits
     dst = _mm_packus_epi16(dst, _mm_setzero_si128());
-    *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(dst);
+    *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst);
   }
 }
 
@@ -892,22 +899,399 @@ static void vpx_filter_block1d4_v4_avx2(const uint8_t *src_ptr,
   }
 }
 
+static void vpx_filter_block1d8_v8_avx2(
+    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+  __m256i f[4], ss[4];
+  __m256i r[8];
+  __m128i s[9];
+
+  unsigned int y = output_height;
+  // Multiply the size of the source stride by two
+  const ptrdiff_t src_stride = src_pitch << 1;
+
+  // The output_height is always a multiple of two.
+  assert(!(output_height & 1));
+
+  shuffle_filter_avx2(filter, f);
+  s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch));
+  s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch));
+  s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch));
+  s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch));
+  s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch));
+  s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch));
+  s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch));
+
+  // merge the result together
+  // r[0]:    0 0 0 0 0 0 0 0 r17 r16 r15 r14 r13 r12 r11 r10 | 0 0 0 0 0 0 0 0
+  // r07 r06 r05 r04 r03 r02 r01 r00
+  r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[1], 1);
+
+  // r[1]:    0 0 0 0 0 0 0 0 r27 r26 r25 r24 r23 r22 r21 r20 | 0 0 0 0 0 0 0 0
+  // r17 r16 r15 r14 r13 r12 r11 r10
+  r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[2], 1);
+
+  // r[2]:    0 0 0 0 0 0 0 0 r37 r36 r35 r34 r33 r32 r31 r30 | 0 0 0 0 0 0 0 0
+  // r27 r26 r25 r24 r23 r22 r21 r20
+  r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[3], 1);
+
+  // r[3]:    0 0 0 0 0 0 0 0 r47 r46 r45 r44 r43 r42 r41 r40 | 0 0 0 0 0 0 0 0
+  // r37 r36 r35 r34 r33 r32 r31 r30
+  r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[4], 1);
+
+  // r[4]:    0 0 0 0 0 0 0 0 r57 r56 r55 r54 r53 r52 r51 r50 | 0 0 0 0 0 0 0 0
+  // r47 r46 r45 r44 r43 r42 r41 r40
+  r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[5], 1);
+
+  // r[5]:    0 0 0 0 0 0 0 0 r67 r66 r65 r64 r63 r62 r61 r60 | 0 0 0 0 0 0 0 0
+  // r57 r56 r55 r54 r53 r52 r51 r50
+  r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]), s[6], 1);
+
+  // Merge together
+  // ss[0]: |r27 r17|.......|r21 r11|r20 r10 || r17 r07|.....|r12 r02|r11
+  // r01|r10 r00|
+  ss[0] = _mm256_unpacklo_epi8(r[0], r[1]);
+
+  // ss[0]: |r47 r37|.......|r41 r31|r40 r30 || r37 r27|.....|r32 r22|r31
+  // r21|r30 r20|
+  ss[1] = _mm256_unpacklo_epi8(r[2], r[3]);
+
+  // ss[2]: |r67 r57|.......|r61 r51|r60 r50 || r57 r47|.....|r52 r42|r51
+  // r41|r50 r40|
+  ss[2] = _mm256_unpacklo_epi8(r[4], r[5]);
+
+  // Process 2 rows at a time
+  do {
+    s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch));
+    s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch));
+
+    // r[6]:    0 0 0 0 0 0 0 0 r77 r76 r75 r74 r73 r72 r71 r70 | 0 0 0 0 0 0 0
+    // 0 r67 r66 r65 r64 r63 r62 r61 r60
+    r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[6]), s[7], 1);
+    // r[7]:    0 0 0 0 0 0 0 0 r87 r86 r85 r84 r83 r82 r81 r80 | 0 0 0 0 0 0 0
+    // 0 r77 r76 r75 r74 r73 r72 r71 r70
+    r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[7]), s[8], 1);
+
+    // ss[3] : | r87 r77 | .......| r81 r71 | r80 r70 || r77 r67 | .....| r72
+    // r62 | r71 r61|r70 r60|
+    ss[3] = _mm256_unpacklo_epi8(r[6], r[7]);
+    ss[0] = convolve8_16_avx2(ss, f);
+    ss[0] = _mm256_packus_epi16(ss[0], ss[0]);
+    src_ptr += src_stride;
+
+    /* shift down two rows */
+    s[6] = s[8];
+    _mm_storel_epi64((__m128i *)&output_ptr[0], _mm256_castsi256_si128(ss[0]));
+    output_ptr += out_pitch;
+    _mm_storel_epi64((__m128i *)&output_ptr[0],
+                     _mm256_extractf128_si256(ss[0], 1));
+    output_ptr += out_pitch;
+    ss[0] = ss[1];
+    ss[1] = ss[2];
+    ss[2] = ss[3];
+    y -= 2;
+  } while (y > 1);
+}
+
+static void vpx_filter_block1d4_h8_avx2(
+    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+  __m128i filtersReg;
+  __m256i addFilterReg64_256bit;
+  unsigned int y = output_height;
+
+  assert(output_height > 1);
+
+  addFilterReg64_256bit = _mm256_set1_epi16(32);
+
+  // f7 f6 f5 f4 f3 f2 f1 f0 (16 bit)
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+
+  // converting the 16 bit (short) to 8 bit (byte) and have the same data
+  // in both lanes of 128 bit register.
+  // f7 f6 f5 f4 f3 f2 f1 f0 || f7 f6 f5 f4 f3 f2 f1 f0 (8 bit each)
+  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+  {
+    ptrdiff_t src_stride;
+    __m256i filt1Reg, filt2Reg, firstFilters, secondFilters;
+    // have the same data in both lanes of a 256 bit register
+    // f7 f6 f5 f4 f3 f2 f1 f0 f7 f6 f5 f4 f3 f2 f1 f0 | f7 f6 f5 f4 f3 f2 f1 f0
+    // f7 f6 f5 f4 f3 f2 f1 f0 (8bit each)
+    const __m256i filtersReg32 = _mm256_broadcastsi128_si256(filtersReg);
+
+    // duplicate only the first 32 bits
+    // f3 f2 f1 f0|f3 f2 f1 f0|f3 f2 f1 f0|f3 f2 f1 f0 | f3 f2 f1 f0|f3 f2 f1
+    // f0|f3 f2 f1 f0|f3 f2 f1 f0
+    firstFilters = _mm256_shuffle_epi32(filtersReg32, 0);
+    // duplicate only the second 32 bits
+    // f7 f6 f5 f4|f7 f6 f5 f4|f7 f6 f5 f4|f7 f6 f5 f4 | f7 f6 f5 f4|f7 f6 f5
+    // f4|f7 f6 f5 f4|f7 f6 f5 f4
+    secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55);
+
+    // s6 s5 s4 s3 s5 s4 s3 s2 s4 s3 s2 s1 s3 s2 s1 s0 | s6 s5 s4 s3 s5 s4 s3
+    // s2 s4 s3 s2 s1 s3 s2 s1 s0
+    filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2);
+
+    // s10 s9 s8 s7 s9 s8 s7 s6 s8 s7 s6 s5 s7 s6 s5 s4 | s10 s9 s8 s7 s9 s8 s7
+    // s6 s8 s7 s6 s5 s7 s6 s5 s4
+    filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32));
+
+    // multiple the size of the source and destination stride by two
+    src_stride = src_pitch << 1;
+
+    do {
+      __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcReg32b1;
+      // load the 2 strides of source
+      // r115 r114 ...... r15 r14 r13 r12 r11 r10 | r015 r014 r013 ...... r07
+      // r06 r05 r04 r03 r02 r01 r00
+      srcReg32b1 = mm256_loadu2_si128(src_ptr - 3, src_ptr - 3 + src_pitch);
+
+      // filter the source buffer
+      // r16 r15 r14 r13 r15 r14 r13 r12 r14 r13 r12 r11 r13 r12 r11 r10 | r06
+      // r05 r04 r03 r05 r04 r03 r02 r04 r03 r02 r01 r03 r02 r01 r00
+      srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+
+      // multiply 4 adjacent elements with the filter and add the result
+      // ...|f3*r14+f2*r13|f1*r13+f0*r12|f3*r13+f2*r12|f1*r11+f0*r10||...
+      // |f1*r03+f0*r02|f3*r04+f2*r03|f1*r02+f0*r01|f3*r03+f2*r02|f1*r01+f0*r00
+      srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+
+      // filter the source buffer
+      // r110 r19 r18 r17|r19 r18 r17 r16|r18 r17 r16 r15|r17 r16 r15 r14||r010
+      // r09 r08 r07|r09 r08 r07 r06|r08 r07 r06 r05|r07 r06 r05 r04
+      srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+
+      // multiply 4 adjacent elements with the filter and add the result
+      // r010 r09 r08 r07|r9 r08 r07 r06|r08 r07 r06 r05|r07 r06 r05 r04||r010
+      // r09 r08 r07|r9 r08 r07 r06|r08 r07 r06 r05|r07 r06 r05 r04
+      srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);
+
+      srcRegFilt32b1_1 =
+          _mm256_add_epi16(srcRegFilt32b1_1, addFilterReg64_256bit);
+      srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
+
+      srcRegFilt32b1_1 =
+          _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+      // 0 0 0 0 R13 R12 R11 R10 || 0 0 0 0 R03 R02 R01 R00 (16bit)
+      srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
+
+      // 8zeros 0 0 0 0 R13 R12 R11 R10 || 8zeros 0 0 0 0 R03 R02 R01 R00 (8bit)
+      srcRegFilt32b1_1 =
+          _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+      src_ptr += src_stride;
+      // save first row 4 values
+      *((int *)&output_ptr[0]) =
+          _mm_cvtsi128_si32(_mm256_castsi256_si128(srcRegFilt32b1_1));
+      output_ptr += output_pitch;
+
+      // save second row 4 values
+      *((int *)&output_ptr[0]) =
+          _mm_cvtsi128_si32(_mm256_extractf128_si256(srcRegFilt32b1_1, 1));
+      output_ptr += output_pitch;
+
+      y = y - 2;
+    } while (y > 1);
+
+    // For remaining height
+    if (y > 0) {
+      __m128i srcReg1, srcRegFilt1_1, addFilterReg64;
+      __m128i srcRegFilt2;
+
+      addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+
+      srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+      // filter the source buffer
+      srcRegFilt1_1 =
+          _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
+
+      // multiply 4 adjacent elements with the filter and add the result
+      srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1,
+                                        _mm256_castsi256_si128(firstFilters));
+
+      // filter the source buffer
+      srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
+
+      // multiply 4 adjacent elements with the filter and add the result
+      srcRegFilt2 =
+          _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));
+
+      srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+      srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
+      // shift by 6 bit each 16 bit
+      srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64);
+      srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
+
+      // shrink to 8 bit each 16 bits, the first lane contain the first
+      // convolve result and the second lane contain the second convolve result
+      srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
+
+      // save 4 bytes
+      *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1);
+    }
+  }
+}
+
+static void vpx_filter_block1d4_v8_avx2(
+    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+  __m256i f[4], ss[4];
+  __m256i r[9], rr[2];
+  __m128i s[11];
+
+  unsigned int y = output_height;
+  // Multiply the size of the source stride by four
+  const ptrdiff_t src_stride = src_pitch << 2;
+  const ptrdiff_t out_stride = out_pitch << 2;
+
+  // The output_height is always a multiple of two.
+  assert(!(output_height & 0x01));
+
+  shuffle_filter_avx2(filter, f);
+
+  s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch));
+  s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch));
+  s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch));
+  s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch));
+  s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch));
+  s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch));
+  s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch));
+
+  r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[2], 1);
+  r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[3], 1);
+  r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[4], 1);
+  r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[5], 1);
+  r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[6], 1);
+
+  // r37.....r24..r33..r31 r30 r23 r22 r21 r20|r17....r14 r07..r05 r04 r13 r12
+  // r11 r10 r03 r02 r01 r00
+  rr[0] = _mm256_unpacklo_epi32(r[0], r[1]);
+
+  // r47.....r34..r43..r41 r40 r33 r32 r31 r30|r27....r24 r17..r15 r14 r23 r22
+  // r21 r20 r13 r12 r11 r10
+  rr[1] = _mm256_unpacklo_epi32(r[1], r[2]);
+
+  // r43 r33....r40 r30|r33 r23....r30 r20||r23 r13....r20 r10|r13 r03....r10
+  // r00|
+  ss[0] = _mm256_unpacklo_epi8(rr[0], rr[1]);
+
+  // r37.....r24..r33..r31 r30 r23 r22 r21 r20||r17....r14 r07..r05 r04 r13 r12
+  // r11 r10 r03 r02 r01 r00
+  rr[0] = _mm256_unpacklo_epi32(r[2], r[3]);
+
+  // r47.....r34..r43..r41 r40 r33 r32 r31 r30|r27....r24 r17..r15 r14 r23 r22
+  // r21 r20 r13 r12 r11 r10
+  rr[1] = _mm256_unpacklo_epi32(r[3], r[4]);
+
+  // r63 r53....r60 r50|r53 r43....r50 r40||r43 r33....r40 r30|r33 r23....r30
+  // r20|
+  ss[1] = _mm256_unpacklo_epi8(rr[0], rr[1]);
+  // Process 4 rows at a time
+  while (y >= 4) {
+    s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch));
+    s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch));
+    s[9] = _mm_loadl_epi64((const __m128i *)(src_ptr + 9 * src_pitch));
+    s[10] = _mm_loadl_epi64((const __m128i *)(src_ptr + 10 * src_pitch));
+
+    r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]), s[7], 1);
+    r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[6]), s[8], 1);
+    rr[0] = _mm256_unpacklo_epi32(r[4], r[5]);
+    rr[1] = _mm256_unpacklo_epi32(r[5], r[6]);
+    ss[2] = _mm256_unpacklo_epi8(rr[0], rr[1]);
+
+    r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[7]), s[9], 1);
+    r[8] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[8]), s[10], 1);
+    rr[0] = _mm256_unpacklo_epi32(r[6], r[7]);
+    rr[1] = _mm256_unpacklo_epi32(r[7], r[8]);
+    ss[3] = _mm256_unpacklo_epi8(rr[0], rr[1]);
+
+    ss[0] = convolve8_16_avx2(ss, f);
+
+    // r3 r2 r3 r2 r1 r0 r1 r0
+    ss[0] = _mm256_packus_epi16(ss[0], ss[0]);
+    src_ptr += src_stride;
+
+    mm256_storeu2_epi32((__m128i *const)output_ptr,
+                        (__m128i *const)(output_ptr + (2 * out_pitch)), ss);
+
+    ss[0] = _mm256_srli_si256(ss[0], 4);
+
+    mm256_storeu2_epi32((__m128i *const)(output_ptr + (1 * out_pitch)),
+                        (__m128i *const)(output_ptr + (3 * out_pitch)), ss);
+
+    output_ptr += out_stride;
+
+    ss[0] = ss[2];
+    ss[1] = ss[3];
+
+    s[6] = s[10];
+    s[5] = s[9];
+
+    r[4] = r[8];
+    y -= 4;
+  }
+
+  // Process 2 rows
+  if (y == 2) {
+    __m128i ss1[4], f1[4], r1[4];
+
+    s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch));
+    s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch));
+    s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch));
+
+    f1[0] = _mm256_castsi256_si128(f[0]);
+    f1[1] = _mm256_castsi256_si128(f[1]);
+    f1[2] = _mm256_castsi256_si128(f[2]);
+    f1[3] = _mm256_castsi256_si128(f[3]);
+
+    r1[0] = _mm_unpacklo_epi32(s[4], s[5]);
+    r1[1] = _mm_unpacklo_epi32(s[5], s[6]);
+
+    // R7-6 xxxx .. . . x| r73 r72 r71 r70 r63 r62 r61 r60
+    r1[2] = _mm_unpacklo_epi32(s[6], s[7]);
+
+    // R8-7 xxxx .. . . x| r83 r82 r81 r80 r73 r72 r71 r70
+    r1[3] = _mm_unpacklo_epi32(s[7], s[8]);
+
+    // r23 r13....r20 r10|r13 r03....r10 r00
+    ss1[0] = _mm256_castsi256_si128(ss[0]);
+
+    // r43 r33....r40 r30|r33 r23....r30 r20
+    ss1[1] = _mm256_castsi256_si128(ss[1]);
+
+    // r63 r53....r60 r50|r53 r43....r50 r40
+    ss1[2] = _mm_unpacklo_epi8(r1[0], r1[1]);
+
+    // r83 r73....r80 r70|r73 r63....r70 r60
+    ss1[3] = _mm_unpacklo_epi8(r1[2], r1[3]);
+
+    ss1[0] = convolve8_8_ssse3(ss1, f1);
+
+    // r1 r0 r1 r0
+    ss1[0] = _mm_packus_epi16(ss1[0], ss1[0]);
+
+    // Save first row 4 values
+    *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(ss1[0]);
+    output_ptr += out_pitch;
+
+    ss1[0] = _mm_srli_si128(ss1[0], 4);
+    // Save second row 4 values
+    *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(ss1[0]);
+  }
+}
+
 #if HAVE_AVX2 && HAVE_SSSE3
-filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
 #if VPX_ARCH_X86_64
 filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
 filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
 filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
-#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_intrin_ssse3
-#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_intrin_ssse3
-#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_intrin_ssse3
-#else  // VPX_ARCH_X86
+#else   // VPX_ARCH_X86
 filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
 filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
 filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
-#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_ssse3
-#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_ssse3
-#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3
 #endif  // VPX_ARCH_X86_64
 filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3;
 filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3;
@@ -923,7 +1307,6 @@ filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
 filter8_1dfunction vpx_filter_block1d8_h2_ssse3;
 filter8_1dfunction vpx_filter_block1d4_v2_ssse3;
 filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
-#define vpx_filter_block1d4_v8_avx2 vpx_filter_block1d4_v8_ssse3
 #define vpx_filter_block1d16_v2_avx2 vpx_filter_block1d16_v2_ssse3
 #define vpx_filter_block1d16_h2_avx2 vpx_filter_block1d16_h2_ssse3
 #define vpx_filter_block1d8_v2_avx2 vpx_filter_block1d8_v2_ssse3
@@ -969,12 +1352,12 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;
 //                                   const InterpKernel *filter, int x0_q4,
 //                                   int32_t x_step_q4, int y0_q4,
 //                                   int y_step_q4, int w, int h);
-FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0)
 FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), ,
-            avx2, 0);
-FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1);
+            avx2, 0)
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1)
 FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
-            src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1);
+            src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1)
 
 // void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
 //                          uint8_t *dst, ptrdiff_t dst_stride,
@@ -986,6 +1369,6 @@ FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
 //                              const InterpKernel *filter, int x0_q4,
 //                              int32_t x_step_q4, int y0_q4, int y_step_q4,
 //                              int w, int h);
-FUN_CONV_2D(, avx2, 0);
-FUN_CONV_2D(avg_, avx2, 1);
+FUN_CONV_2D(, avx2, 0)
+FUN_CONV_2D(avg_, avx2, 1)
 #endif  // HAVE_AX2 && HAVE_SSSE3
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
index 77355a2085..4ea2752d38 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
@@ -580,7 +580,7 @@ static void vpx_filter_block1d4_h4_ssse3(const uint8_t *src_ptr,
 
     // Pack to 8-bits
     dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());
-    *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first);
+    *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first);
 
     src_ptr += src_stride;
     dst_ptr += dst_stride;
@@ -666,8 +666,8 @@ static void vpx_filter_block1d4_v4_ssse3(const uint8_t *src_ptr,
     reg_1 = _mm_packus_epi16(reg_1, reg_1);
 
     // Save the result
-    *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(reg_0);
-    *((uint32_t *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(reg_1);
+    *((int *)(dst_ptr)) = _mm_cvtsi128_si32(reg_0);
+    *((int *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(reg_1);
 
     // Update the source by two rows
     src_ptr += src_stride_unrolled;
@@ -731,12 +731,12 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;
 //                                   const InterpKernel *filter, int x0_q4,
 //                                   int32_t x_step_q4, int y0_q4,
 //                                   int y_step_q4, int w, int h);
-FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3, 0);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3, 0)
 FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), ,
-            ssse3, 0);
-FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3, 1);
+            ssse3, 0)
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3, 1)
 FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
-            src - src_stride * (num_taps / 2 - 1), avg_, ssse3, 1);
+            src - src_stride * (num_taps / 2 - 1), avg_, ssse3, 1)
 
 static void filter_horiz_w8_ssse3(const uint8_t *const src,
                                   const ptrdiff_t src_stride,
@@ -1083,5 +1083,5 @@ void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
 //                              const InterpKernel *filter, int x0_q4,
 //                              int32_t x_step_q4, int y0_q4, int y_step_q4,
 //                              int w, int h);
-FUN_CONV_2D(, ssse3, 0);
-FUN_CONV_2D(avg_, ssse3, 1);
+FUN_CONV_2D(, ssse3, 0)
+FUN_CONV_2D(avg_, ssse3, 1)
diff --git a/vpx_ports/aarch32_cpudetect.c b/vpx_ports/aarch32_cpudetect.c
new file mode 100644
index 0000000000..639f4ff8ea
--- /dev/null
+++ b/vpx_ports/aarch32_cpudetect.c
@@ -0,0 +1,90 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+// Feature detection code for Armv7-A / AArch32.
+
+#include "./vpx_config.h"
+#include "arm_cpudetect.h"
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+static int arm_get_cpu_caps(void) {
+  // This function should actually be a no-op. There is no way to adjust any of
+  // these because the RTCD tables do not exist: the functions are called
+  // statically.
+  int flags = 0;
+#if HAVE_NEON
+  flags |= HAS_NEON;
+#endif  // HAVE_NEON
+  return flags;
+}
+
+#elif defined(_MSC_VER)  // end !CONFIG_RUNTIME_CPU_DETECT
+
+static int arm_get_cpu_caps(void) {
+  int flags = 0;
+#if HAVE_NEON || HAVE_NEON_ASM
+  // MSVC has no inline __asm support for Arm, but it does let you __emit
+  // instructions via their assembled hex code.
+  // All of these instructions should be essentially nops.
+  __try {
+    // VORR q0,q0,q0
+    __emit(0xF2200150);
+    flags |= HAS_NEON;
+  } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) {
+    // Ignore exception.
+  }
+#endif  // HAVE_NEON || HAVE_NEON_ASM
+  return flags;
+}
+
+#elif defined(ANDROID_USE_CPU_FEATURES_LIB)
+
+static int arm_get_cpu_caps(void) {
+  int flags = 0;
+#if HAVE_NEON || HAVE_NEON_ASM
+  uint64_t features = android_getCpuFeatures();
+  if (features & ANDROID_CPU_ARM_FEATURE_NEON) {
+    flags |= HAS_NEON;
+  }
+#endif  // HAVE_NEON || HAVE_NEON_ASM
+  return flags;
+}
+
+#elif defined(__linux__)  // end defined(AOM_USE_ANDROID_CPU_FEATURES)
+
+#include <sys/auxv.h>
+
+// Define hwcap values ourselves: building with an old auxv header where these
+// hwcap values are not defined should not prevent features from being enabled.
+#define VPX_AARCH32_HWCAP_NEON (1 << 12)
+
+static int arm_get_cpu_caps(void) {
+  int flags = 0;
+  unsigned long hwcap = getauxval(AT_HWCAP);
+#if HAVE_NEON || HAVE_NEON_ASM
+  if (hwcap & VPX_AARCH32_HWCAP_NEON) {
+    flags |= HAS_NEON;
+  }
+#endif  // HAVE_NEON || HAVE_NEON_ASM
+  return flags;
+}
+#else   // end __linux__
+#error \
+    "Runtime CPU detection selected, but no CPU detection method available" \
+"for your platform. Rerun configure with --disable-runtime-cpu-detect."
+#endif
+
+int arm_cpu_caps(void) {
+  int flags = 0;
+  if (arm_cpu_env_flags(&flags)) {
+    return flags;
+  }
+  return arm_get_cpu_caps() & arm_cpu_env_mask();
+}
diff --git a/vpx_ports/aarch64_cpudetect.c b/vpx_ports/aarch64_cpudetect.c
new file mode 100644
index 0000000000..539d09bb39
--- /dev/null
+++ b/vpx_ports/aarch64_cpudetect.c
@@ -0,0 +1,199 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "arm_cpudetect.h"
+
+#if defined(__APPLE__)
+#include <sys/sysctl.h>
+#endif
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+static int arm_get_cpu_caps(void) {
+  // This function should actually be a no-op. There is no way to adjust any of
+  // these because the RTCD tables do not exist: the functions are called
+  // statically.
+  int flags = 0;
+#if HAVE_NEON
+  flags |= HAS_NEON;
+#endif  // HAVE_NEON
+  return flags;
+}
+
+#elif defined(__APPLE__)  // end !CONFIG_RUNTIME_CPU_DETECT
+
+// sysctlbyname() parameter documentation for instruction set characteristics:
+// https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics
+static INLINE int64_t have_feature(const char *feature) {
+  int64_t feature_present = 0;
+  size_t size = sizeof(feature_present);
+  if (sysctlbyname(feature, &feature_present, &size, NULL, 0) != 0) {
+    return 0;
+  }
+  return feature_present;
+}
+
+static int arm_get_cpu_caps(void) {
+  int flags = 0;
+#if HAVE_NEON
+  flags |= HAS_NEON;
+#endif  // HAVE_NEON
+#if HAVE_NEON_DOTPROD
+  if (have_feature("hw.optional.arm.FEAT_DotProd")) {
+    flags |= HAS_NEON_DOTPROD;
+  }
+#endif  // HAVE_NEON_DOTPROD
+#if HAVE_NEON_I8MM
+  if (have_feature("hw.optional.arm.FEAT_I8MM")) {
+    flags |= HAS_NEON_I8MM;
+  }
+#endif  // HAVE_NEON_I8MM
+  return flags;
+}
+
+#elif defined(_WIN32)  // end __APPLE__
+
+static int arm_get_cpu_caps(void) {
+  int flags = 0;
+// IsProcessorFeaturePresent() parameter documentation:
+// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-isprocessorfeaturepresent#parameters
+#if HAVE_NEON
+  flags |= HAS_NEON;  // Neon is mandatory in Armv8.0-A.
+#endif  // HAVE_NEON
+#if HAVE_NEON_DOTPROD
+// Support for PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE was added in Windows SDK
+// 20348, supported by Windows 11 and Windows Server 2022.
+#if defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)
+  if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)) {
+    flags |= HAS_NEON_DOTPROD;
+  }
+#endif  // defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)
+#endif  // HAVE_NEON_DOTPROD
+  // No I8MM or SVE feature detection available on Windows at time of writing.
+  return flags;
+}
+
+#elif defined(ANDROID_USE_CPU_FEATURES_LIB)
+
+static int arm_get_cpu_caps(void) {
+  int flags = 0;
+#if HAVE_NEON
+  flags |= HAS_NEON;  // Neon is mandatory in Armv8.0-A.
+#endif  // HAVE_NEON
+  return flags;
+}
+
+#elif defined(__linux__)  // end defined(VPX_USE_ANDROID_CPU_FEATURES)
+
+#include <sys/auxv.h>
+
+// Define hwcap values ourselves: building with an old auxv header where these
+// hwcap values are not defined should not prevent features from being enabled.
+#define VPX_AARCH64_HWCAP_ASIMDDP (1 << 20)
+#define VPX_AARCH64_HWCAP_SVE (1 << 22)
+#define VPX_AARCH64_HWCAP2_I8MM (1 << 13)
+
+static int arm_get_cpu_caps(void) {
+  int flags = 0;
+  unsigned long hwcap = getauxval(AT_HWCAP);
+#if HAVE_NEON_I8MM
+  unsigned long hwcap2 = getauxval(AT_HWCAP2);
+#endif  // HAVE_NEON_I8MM
+#if HAVE_NEON
+  flags |= HAS_NEON;  // Neon is mandatory in Armv8.0-A.
+#endif  // HAVE_NEON
+#if HAVE_NEON_DOTPROD
+  if (hwcap & VPX_AARCH64_HWCAP_ASIMDDP) {
+    flags |= HAS_NEON_DOTPROD;
+  }
+#endif  // HAVE_NEON_DOTPROD
+#if HAVE_NEON_I8MM
+  if (hwcap2 & VPX_AARCH64_HWCAP2_I8MM) {
+    flags |= HAS_NEON_I8MM;
+  }
+#endif  // HAVE_NEON_I8MM
+#if HAVE_SVE
+  if (hwcap & VPX_AARCH64_HWCAP_SVE) {
+    flags |= HAS_SVE;
+  }
+#endif  // HAVE_SVE
+  return flags;
+}
+
+#elif defined(__Fuchsia__)  // end __linux__
+
+#include <zircon/features.h>
+#include <zircon/syscalls.h>
+
+// Added in https://fuchsia-review.googlesource.com/c/fuchsia/+/894282.
+#ifndef ZX_ARM64_FEATURE_ISA_I8MM
+#define ZX_ARM64_FEATURE_ISA_I8MM ((uint32_t)(1u << 19))
+#endif
+// Added in https://fuchsia-review.googlesource.com/c/fuchsia/+/895083.
+#ifndef ZX_ARM64_FEATURE_ISA_SVE
+#define ZX_ARM64_FEATURE_ISA_SVE ((uint32_t)(1u << 20))
+#endif
+
+static int arm_get_cpu_caps(void) {
+  int flags = 0;
+#if HAVE_NEON
+  flags |= HAS_NEON;  // Neon is mandatory in Armv8.0-A.
+#endif  // HAVE_NEON
+  uint32_t features;
+  zx_status_t status = zx_system_get_features(ZX_FEATURE_KIND_CPU, &features);
+  if (status != ZX_OK) {
+    return flags;
+  }
+#if HAVE_NEON_DOTPROD
+  if (features & ZX_ARM64_FEATURE_ISA_DP) {
+    flags |= HAS_NEON_DOTPROD;
+  }
+#endif  // HAVE_NEON_DOTPROD
+#if HAVE_NEON_I8MM
+  if (features & ZX_ARM64_FEATURE_ISA_I8MM) {
+    flags |= HAS_NEON_I8MM;
+  }
+#endif  // HAVE_NEON_I8MM
+#if HAVE_SVE
+  if (features & ZX_ARM64_FEATURE_ISA_SVE) {
+    flags |= HAS_SVE;
+  }
+#endif  // HAVE_SVE
+  return flags;
+}
+
+#else  // end __Fuchsia__
+#error \
+    "Runtime CPU detection selected, but no CPU detection method available" \
+"for your platform. Rerun configure with --disable-runtime-cpu-detect."
+#endif
+
+int arm_cpu_caps(void) {
+  int flags = 0;
+  if (!arm_cpu_env_flags(&flags)) {
+    flags = arm_get_cpu_caps() & arm_cpu_env_mask();
+  }
+
+  // Restrict flags: FEAT_I8MM assumes that FEAT_DotProd is available.
+  if (!(flags & HAS_NEON_DOTPROD)) {
+    flags &= ~HAS_NEON_I8MM;
+  }
+
+  // Restrict flags: FEAT_SVE assumes that FEAT_{DotProd,I8MM} are available.
+  if (!(flags & HAS_NEON_DOTPROD)) {
+    flags &= ~HAS_SVE;
+  }
+  if (!(flags & HAS_NEON_I8MM)) {
+    flags &= ~HAS_SVE;
+  }
+
+  return flags;
+}
diff --git a/vpx_ports/arm.h b/vpx_ports/arm.h
index 6458a2c5b0..39365d18ee 100644
--- a/vpx_ports/arm.h
+++ b/vpx_ports/arm.h
@@ -17,12 +17,14 @@
 extern "C" {
 #endif
 
-/*ARMv5TE "Enhanced DSP" instructions.*/
-#define HAS_EDSP 0x01
-/*ARMv6 "Parallel" or "Media" instructions.*/
-#define HAS_MEDIA 0x02
-/*ARMv7 optional NEON instructions.*/
-#define HAS_NEON 0x04
+// Armv7-A optional Neon instructions, mandatory from Armv8.0-A.
+#define HAS_NEON (1 << 0)
+// Armv8.2-A optional Neon dot-product instructions, mandatory from Armv8.4-A.
+#define HAS_NEON_DOTPROD (1 << 1)
+// Armv8.2-A optional Neon i8mm instructions, mandatory from Armv8.6-A.
+#define HAS_NEON_I8MM (1 << 2)
+// Armv8.2-A optional SVE instructions, mandatory from Armv9.0-A.
+#define HAS_SVE (1 << 3)
 
 int arm_cpu_caps(void);
 
diff --git a/vpx_ports/arm_cpudetect.c b/vpx_ports/arm_cpudetect.c
deleted file mode 100644
index 4f9d480ade..0000000000
--- a/vpx_ports/arm_cpudetect.c
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdlib.h>
-#include <string.h>
-
-#include "./vpx_config.h"
-#include "vpx_ports/arm.h"
-
-#ifdef WINAPI_FAMILY
-#include <winapifamily.h>
-#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
-#define getenv(x) NULL
-#endif
-#endif
-
-static int arm_cpu_env_flags(int *flags) {
-  char *env;
-  env = getenv("VPX_SIMD_CAPS");
-  if (env && *env) {
-    *flags = (int)strtol(env, NULL, 0);
-    return 0;
-  }
-  *flags = 0;
-  return -1;
-}
-
-static int arm_cpu_env_mask(void) {
-  char *env;
-  env = getenv("VPX_SIMD_CAPS_MASK");
-  return env && *env ? (int)strtol(env, NULL, 0) : ~0;
-}
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-
-int arm_cpu_caps(void) {
-  /* This function should actually be a no-op. There is no way to adjust any of
-   * these because the RTCD tables do not exist: the functions are called
-   * statically */
-  int flags;
-  int mask;
-  if (!arm_cpu_env_flags(&flags)) {
-    return flags;
-  }
-  mask = arm_cpu_env_mask();
-#if HAVE_NEON || HAVE_NEON_ASM
-  flags |= HAS_NEON;
-#endif /* HAVE_NEON  || HAVE_NEON_ASM */
-  return flags & mask;
-}
-
-#elif defined(_MSC_VER) /* end !CONFIG_RUNTIME_CPU_DETECT */
-/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
-#ifndef WIN32_LEAN_AND_MEAN
-#define WIN32_LEAN_AND_MEAN
-#endif
-#ifndef WIN32_EXTRA_LEAN
-#define WIN32_EXTRA_LEAN
-#endif
-#include <windows.h>
-
-int arm_cpu_caps(void) {
-  int flags;
-  int mask;
-  if (!arm_cpu_env_flags(&flags)) {
-    return flags;
-  }
-  mask = arm_cpu_env_mask();
-/* MSVC has no inline __asm support for ARM, but it does let you __emit
- *  instructions via their assembled hex code.
- * All of these instructions should be essentially nops.
- */
-#if HAVE_NEON || HAVE_NEON_ASM
-  if (mask & HAS_NEON) {
-    __try {
-      /*VORR q0,q0,q0*/
-      __emit(0xF2200150);
-      flags |= HAS_NEON;
-    } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) {
-      /*Ignore exception.*/
-    }
-  }
-#endif /* HAVE_NEON || HAVE_NEON_ASM */
-  return flags & mask;
-}
-
-#elif defined(__ANDROID__) /* end _MSC_VER */
-#include <cpu-features.h>
-
-int arm_cpu_caps(void) {
-  int flags;
-  int mask;
-  uint64_t features;
-  if (!arm_cpu_env_flags(&flags)) {
-    return flags;
-  }
-  mask = arm_cpu_env_mask();
-  features = android_getCpuFeatures();
-
-#if HAVE_NEON || HAVE_NEON_ASM
-  if (features & ANDROID_CPU_ARM_FEATURE_NEON) flags |= HAS_NEON;
-#endif /* HAVE_NEON || HAVE_NEON_ASM */
-  return flags & mask;
-}
-
-#elif defined(__linux__) /* end __ANDROID__ */
-
-#include <stdio.h>
-
-int arm_cpu_caps(void) {
-  FILE *fin;
-  int flags;
-  int mask;
-  if (!arm_cpu_env_flags(&flags)) {
-    return flags;
-  }
-  mask = arm_cpu_env_mask();
-  /* Reading /proc/self/auxv would be easier, but that doesn't work reliably
-   *  on Android.
-   * This also means that detection will fail in Scratchbox.
-   */
-  fin = fopen("/proc/cpuinfo", "r");
-  if (fin != NULL) {
-    /* 512 should be enough for anybody (it's even enough for all the flags
-     * that x86 has accumulated... so far).
-     */
-    char buf[512];
-    while (fgets(buf, 511, fin) != NULL) {
-#if HAVE_NEON || HAVE_NEON_ASM
-      if (memcmp(buf, "Features", 8) == 0) {
-        char *p;
-        p = strstr(buf, " neon");
-        if (p != NULL && (p[5] == ' ' || p[5] == '\n')) {
-          flags |= HAS_NEON;
-        }
-      }
-#endif /* HAVE_NEON || HAVE_NEON_ASM */
-    }
-    fclose(fin);
-  }
-  return flags & mask;
-}
-#else  /* end __linux__ */
-#error \
-    "--enable-runtime-cpu-detect selected, but no CPU detection method " \
-"available for your platform. Reconfigure with --disable-runtime-cpu-detect."
-#endif
diff --git a/vpx_ports/arm_cpudetect.h b/vpx_ports/arm_cpudetect.h
new file mode 100644
index 0000000000..881397abc2
--- /dev/null
+++ b/vpx_ports/arm_cpudetect.h
@@ -0,0 +1,52 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "vpx_config.h"
+#include "vpx_ports/arm.h"
+
+#if defined(_WIN32)
+#undef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#undef WIN32_EXTRA_LEAN
+#define WIN32_EXTRA_LEAN
+#include <windows.h>
+#endif
+
+#ifdef WINAPI_FAMILY
+#include <winapifamily.h>
+#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#define getenv(x) NULL
+#endif
+#endif
+
+#if defined(__ANDROID__) && (__ANDROID_API__ < 18)
+#define ANDROID_USE_CPU_FEATURES_LIB 1
+// Use getauxval() when targeting (64-bit) Android with API level >= 18.
+// getauxval() is supported since Android API level 18 (Android 4.3.)
+// First Android version with 64-bit support was Android 5.x (API level 21).
+#include <cpu-features.h>
+#endif
+
+static INLINE int arm_cpu_env_flags(int *flags) {
+  const char *env = getenv("VPX_SIMD_CAPS");
+  if (env && *env) {
+    *flags = (int)strtol(env, NULL, 0);
+    return 1;
+  }
+  return 0;
+}
+
+static INLINE int arm_cpu_env_mask(void) {
+  const char *env = getenv("VPX_SIMD_CAPS_MASK");
+  return env && *env ? (int)strtol(env, NULL, 0) : ~0;
+}
diff --git a/vpx_ports/asmdefs_mmi.h b/vpx_ports/asmdefs_mmi.h
index 28355bf9fb..400a51cc32 100644
--- a/vpx_ports/asmdefs_mmi.h
+++ b/vpx_ports/asmdefs_mmi.h
@@ -34,7 +34,7 @@
   "ld          " #reg ",        " #bias "(" #addr ")               \n\t"
 
 #define MMI_SRL(reg1, reg2, shift) \
-  "dsrl        " #reg1 ",       " #reg2 ",       " #shift "        \n\t"
+  "ssrld       " #reg1 ",       " #reg2 ",       " #shift "        \n\t"
 
 #define MMI_SLL(reg1, reg2, shift) \
   "dsll        " #reg1 ",       " #reg2 ",       " #shift "        \n\t"
@@ -63,7 +63,7 @@
   "lw          " #reg ",        " #bias "(" #addr ")               \n\t"
 
 #define MMI_SRL(reg1, reg2, shift) \
-  "srl         " #reg1 ",       " #reg2 ",       " #shift "        \n\t"
+  "ssrlw       " #reg1 ",       " #reg2 ",       " #shift "        \n\t"
 
 #define MMI_SLL(reg1, reg2, shift) \
   "sll         " #reg1 ",       " #reg2 ",       " #shift "        \n\t"
diff --git a/vpx_ports/bitops.h b/vpx_ports/bitops.h
index 5b2f31cd11..1b5cdaa6dd 100644
--- a/vpx_ports/bitops.h
+++ b/vpx_ports/bitops.h
@@ -26,20 +26,32 @@
 extern "C" {
 #endif
 
-// These versions of get_msb() are only valid when n != 0 because all
-// of the optimized versions are undefined when n == 0:
+// These versions of get_lsb() and get_msb() are only valid when n != 0
+// because all of the optimized versions are undefined when n == 0:
 // https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html
 
 // use GNU builtins where available.
 #if defined(__GNUC__) && \
     ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
+static INLINE int get_lsb(unsigned int n) {
+  assert(n != 0);
+  return __builtin_ctz(n);
+}
+
 static INLINE int get_msb(unsigned int n) {
   assert(n != 0);
   return 31 ^ __builtin_clz(n);
 }
 #elif defined(USE_MSC_INTRINSICS)
+#pragma intrinsic(_BitScanForward)
 #pragma intrinsic(_BitScanReverse)
 
+static INLINE int get_lsb(unsigned int n) {
+  unsigned long first_set_bit;  // NOLINT(runtime/int)
+  _BitScanForward(&first_set_bit, n);
+  return first_set_bit;
+}
+
 static INLINE int get_msb(unsigned int n) {
   unsigned long first_set_bit;
   assert(n != 0);
@@ -48,6 +60,13 @@ static INLINE int get_msb(unsigned int n) {
 }
 #undef USE_MSC_INTRINSICS
 #else
+static INLINE int get_lsb(unsigned int n) {
+  int i;
+  assert(n != 0);
+  for (i = 0; i < 32 && !(n & 1); ++i) n >>= 1;
+  return i;
+}
+
 // Returns (int)floor(log2(n)). n must be > 0.
 static INLINE int get_msb(unsigned int n) {
   int log = 0;
diff --git a/vpx_ports/compiler_attributes.h b/vpx_ports/compiler_attributes.h
index 354352016c..4b468749b8 100644
--- a/vpx_ports/compiler_attributes.h
+++ b/vpx_ports/compiler_attributes.h
@@ -29,13 +29,23 @@
 #endif  // __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__)
 
 #if defined(__clang__) && __has_attribute(no_sanitize)
+// Both of these have defined behavior and are used in certain operations or
+// optimizations thereof. There are cases where an overflow may be unintended,
+// however, so use of these attributes should be done with care.
 #define VPX_NO_UNSIGNED_OVERFLOW_CHECK \
   __attribute__((no_sanitize("unsigned-integer-overflow")))
-#endif
+#if __clang_major__ >= 12
+#define VPX_NO_UNSIGNED_SHIFT_CHECK \
+  __attribute__((no_sanitize("unsigned-shift-base")))
+#endif  // __clang__ >= 12
+#endif  // __clang__
 
 #ifndef VPX_NO_UNSIGNED_OVERFLOW_CHECK
 #define VPX_NO_UNSIGNED_OVERFLOW_CHECK
 #endif
+#ifndef VPX_NO_UNSIGNED_SHIFT_CHECK
+#define VPX_NO_UNSIGNED_SHIFT_CHECK
+#endif
 
 //------------------------------------------------------------------------------
 // Variable attributes.
diff --git a/vpx_ports/loongarch.h b/vpx_ports/loongarch.h
new file mode 100644
index 0000000000..d93ff9f5f0
--- /dev/null
+++ b/vpx_ports/loongarch.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Jin Bo  <jinbo@loongson.cn>
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS.  All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_PORTS_LOONGARCH_H_
+#define VPX_VPX_PORTS_LOONGARCH_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define HAS_LSX 0x01
+#define HAS_LASX 0x02
+
+int loongarch_cpu_caps(void);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_VPX_PORTS_LOONGARCH_H_
diff --git a/vpx_ports/loongarch_cpudetect.c b/vpx_ports/loongarch_cpudetect.c
new file mode 100644
index 0000000000..7b4322d35e
--- /dev/null
+++ b/vpx_ports/loongarch_cpudetect.c
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Jin Bo  <jinbo@loongson.cn>
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS.  All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "vpx_ports/loongarch.h"
+
+#define LOONGARCH_CFG2 0x02
+#define LOONGARCH_CFG2_LSX (1 << 6)
+#define LOONGARCH_CFG2_LASX (1 << 7)
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#if defined(__loongarch__) && defined(__linux__)
+int loongarch_cpu_caps(void) {
+  int reg = 0;
+  int flag = 0;
+
+  __asm__ volatile("cpucfg %0, %1 \n\t" : "+&r"(reg) : "r"(LOONGARCH_CFG2));
+  if (reg & LOONGARCH_CFG2_LSX) flag |= HAS_LSX;
+
+  if (reg & LOONGARCH_CFG2_LASX) flag |= HAS_LASX;
+
+  return flag;
+}
+#else /* end __loongarch__ && __linux__ */
+#error \
+    "--enable-runtime-cpu-detect selected, but no CPU detection method " \
+"available for your platform. Reconfigure with --disable-runtime-cpu-detect."
+#endif
+#else /* end CONFIG_RUNTIME_CPU_DETECT */
+int loongarch_cpu_caps(void) { return 0; }
+#endif
diff --git a/vpx_ports/mips.h b/vpx_ports/mips.h
new file mode 100644
index 0000000000..439de754fd
--- /dev/null
+++ b/vpx_ports/mips.h
@@ -0,0 +1,27 @@
+/*
+ *  Copyright (c) 2020 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_PORTS_MIPS_H_
+#define VPX_VPX_PORTS_MIPS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define HAS_MMI 0x01
+#define HAS_MSA 0x02
+
+int mips_cpu_caps(void);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_VPX_PORTS_MIPS_H_
diff --git a/vpx_ports/mips_cpudetect.c b/vpx_ports/mips_cpudetect.c
new file mode 100644
index 0000000000..e0eca2d48d
--- /dev/null
+++ b/vpx_ports/mips_cpudetect.c
@@ -0,0 +1,57 @@
+/*
+ *  Copyright (c) 2020 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <stdio.h>
+#include <string.h>
+#include "./vpx_config.h"
+#include "vpx_ports/mips.h"
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#if defined(__mips__) && defined(__linux__)
+int mips_cpu_caps(void) {
+  char cpuinfo_line[512];
+  int flag = 0x0;
+  FILE *f = fopen("/proc/cpuinfo", "r");
+  if (!f) {
+    // Assume nothing if /proc/cpuinfo is unavailable.
+    // This will occur for Chrome sandbox for Pepper or Render process.
+    return 0;
+  }
+  while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
+    if (memcmp(cpuinfo_line, "cpu model", 9) == 0) {
+      // Workaround early kernel without mmi in ASEs line.
+      if (strstr(cpuinfo_line, "Loongson-3")) {
+        flag |= HAS_MMI;
+      } else if (strstr(cpuinfo_line, "Loongson-2K")) {
+        flag |= HAS_MMI | HAS_MSA;
+      }
+    }
+    if (memcmp(cpuinfo_line, "ASEs implemented", 16) == 0) {
+      if (strstr(cpuinfo_line, "loongson-mmi") &&
+          strstr(cpuinfo_line, "loongson-ext")) {
+        flag |= HAS_MMI;
+      }
+      if (strstr(cpuinfo_line, "msa")) {
+        flag |= HAS_MSA;
+      }
+      // ASEs is the last line, so we can break here.
+      break;
+    }
+  }
+  fclose(f);
+  return flag;
+}
+#else /* end __mips__ && __linux__ */
+#error \
+    "--enable-runtime-cpu-detect selected, but no CPU detection method " \
+"available for your platform. Reconfigure with --disable-runtime-cpu-detect."
+#endif
+#else /* end CONFIG_RUNTIME_CPU_DETECT */
+int mips_cpu_caps(void) { return 0; }
+#endif
diff --git a/vpx_ports/vpx_once.h b/vpx_ports/vpx_once.h
index 4eb592b87e..d8a8ed89fe 100644
--- a/vpx_ports/vpx_once.h
+++ b/vpx_ports/vpx_once.h
@@ -95,7 +95,7 @@ static void once(void (*func)(void)) {
 #define INCL_DOS
 #include <os2.h>
 static void once(void (*func)(void)) {
-  static int done;
+  static volatile int done;
 
   /* If the initialization is complete, return early. */
   if (done) return;
@@ -128,7 +128,7 @@ static void once(void (*func)(void)) {
  */
 
 static void once(void (*func)(void)) {
-  static int done;
+  static volatile int done;
 
   if (!done) {
     func();
diff --git a/vpx_ports/vpx_ports.mk b/vpx_ports/vpx_ports.mk
index 2331773693..93279dbebc 100644
--- a/vpx_ports/vpx_ports.mk
+++ b/vpx_ports/vpx_ports.mk
@@ -36,12 +36,23 @@ PORTS_SRCS-yes += x86.h
 PORTS_SRCS-yes += x86_abi_support.asm
 endif
 
-PORTS_SRCS-$(VPX_ARCH_ARM) += arm_cpudetect.c
+ifeq ($(VPX_ARCH_AARCH64),yes)
+PORTS_SRCS-yes += aarch64_cpudetect.c
+else
+PORTS_SRCS-$(VPX_ARCH_ARM) += aarch32_cpudetect.c
+endif
+PORTS_SRCS-$(VPX_ARCH_ARM) += arm_cpudetect.h
 PORTS_SRCS-$(VPX_ARCH_ARM) += arm.h
 
 PORTS_SRCS-$(VPX_ARCH_PPC) += ppc_cpudetect.c
 PORTS_SRCS-$(VPX_ARCH_PPC) += ppc.h
 
+PORTS_SRCS-$(VPX_ARCH_MIPS) += mips_cpudetect.c
+PORTS_SRCS-$(VPX_ARCH_MIPS) += mips.h
+
+PORTS_SRCS-$(VPX_ARCH_LOONGARCH) += loongarch_cpudetect.c
+PORTS_SRCS-$(VPX_ARCH_LOONGARCH) += loongarch.h
+
 ifeq ($(VPX_ARCH_MIPS), yes)
 PORTS_SRCS-yes += asmdefs_mmi.h
 endif
diff --git a/vpx_ports/x86.h b/vpx_ports/x86.h
index 14f4344495..795fb2923f 100644
--- a/vpx_ports/x86.h
+++ b/vpx_ports/x86.h
@@ -47,7 +47,7 @@ typedef enum {
 #define cpuid(func, func2, ax, bx, cx, dx)                      \
   __asm__ __volatile__("cpuid           \n\t"                   \
                        : "=a"(ax), "=b"(bx), "=c"(cx), "=d"(dx) \
-                       : "a"(func), "c"(func2));
+                       : "a"(func), "c"(func2))
 #else
 #define cpuid(func, func2, ax, bx, cx, dx)     \
   __asm__ __volatile__(                        \
@@ -55,7 +55,7 @@ typedef enum {
       "cpuid              \n\t"                \
       "xchg %%edi, %%ebx  \n\t"                \
       : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \
-      : "a"(func), "c"(func2));
+      : "a"(func), "c"(func2))
 #endif
 #elif defined(__SUNPRO_C) || \
     defined(__SUNPRO_CC) /* end __GNUC__ or __ANDROID__*/
@@ -67,7 +67,7 @@ typedef enum {
       "movl %ebx, %edi \n\t"                   \
       "xchg %rsi, %rbx \n\t"                   \
       : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \
-      : "a"(func), "c"(func2));
+      : "a"(func), "c"(func2))
 #else
 #define cpuid(func, func2, ax, bx, cx, dx)     \
   asm volatile(                                \
@@ -76,7 +76,7 @@ typedef enum {
       "movl %ebx, %edi  \n\t"                  \
       "popl %ebx        \n\t"                  \
       : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \
-      : "a"(func), "c"(func2));
+      : "a"(func), "c"(func2))
 #endif
 #else /* end __SUNPRO__ */
 #if VPX_ARCH_X86_64
@@ -223,6 +223,8 @@ static INLINE int x86_simd_caps(void) {
     }
   }
 
+  (void)reg_eax;  // Avoid compiler warning on unused-but-set variable.
+
   return flags & mask;
 }
 
@@ -240,7 +242,7 @@ static INLINE int x86_simd_caps(void) {
 // x86_readtsc directly, but prevent the CPU's out-of-order execution from
 // affecting the measurement (by having earlier/later instructions be evaluated
 // in the time interval). See the white paper, "How to Benchmark Code
-// Execution Times on Intel® IA-32 and IA-64 Instruction Set Architectures" by
+// Execution Times on Intel(R) IA-32 and IA-64 Instruction Set Architectures" by
 // Gabriele Paoloni for more information.
 //
 // If you are timing a large function (CPU time > a couple of seconds), use
@@ -306,14 +308,26 @@ static INLINE unsigned int x86_readtscp(void) {
 
 static INLINE unsigned int x86_tsc_start(void) {
   unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx;
+  // This call should not be removed. See function notes above.
   cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+  // Avoid compiler warnings on unused-but-set variables.
+  (void)reg_eax;
+  (void)reg_ebx;
+  (void)reg_ecx;
+  (void)reg_edx;
   return x86_readtsc();
 }
 
 static INLINE unsigned int x86_tsc_end(void) {
   uint32_t v = x86_readtscp();
   unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx;
+  // This call should not be removed. See function notes above.
   cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+  // Avoid compiler warnings on unused-but-set variables.
+  (void)reg_eax;
+  (void)reg_ebx;
+  (void)reg_ecx;
+  (void)reg_edx;
   return v;
 }
 
@@ -377,7 +391,7 @@ static INLINE unsigned int x87_set_double_precision(void) {
   // Reserved                      01B
   // Double Precision (53-Bits)    10B
   // Extended Precision (64-Bits)  11B
-  x87_set_control_word((mode & ~0x300) | 0x200);
+  x87_set_control_word((mode & ~0x300u) | 0x200u);
   return mode;
 }
 
diff --git a/vpx_scale/generic/yv12config.c b/vpx_scale/generic/yv12config.c
index eee291c30d..c52dab0588 100644
--- a/vpx_scale/generic/yv12config.c
+++ b/vpx_scale/generic/yv12config.c
@@ -64,6 +64,10 @@ int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width,
 
     if (!ybf->buffer_alloc) {
       ybf->buffer_alloc = (uint8_t *)vpx_memalign(32, frame_size);
+      if (!ybf->buffer_alloc) {
+        ybf->buffer_alloc_sz = 0;
+        return -1;
+      }
 #if defined(__has_feature)
 #if __has_feature(memory_sanitizer)
       // This memset is needed for fixing the issue of using uninitialized
@@ -75,7 +79,7 @@ int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width,
       ybf->buffer_alloc_sz = frame_size;
     }
 
-    if (!ybf->buffer_alloc || ybf->buffer_alloc_sz < frame_size) return -1;
+    if (ybf->buffer_alloc_sz < frame_size) return -1;
 
     /* Only support allocating buffers that have a border that's a multiple
      * of 32. The border restriction is required to get 16-byte alignment of
diff --git a/vpx_util/loongson_intrinsics.h b/vpx_util/loongson_intrinsics.h
new file mode 100644
index 0000000000..b8b9e6db02
--- /dev/null
+++ b/vpx_util/loongson_intrinsics.h
@@ -0,0 +1,2090 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS.  All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#ifndef VPX_VPX_UTIL_LOONGSON_INTRINSICS_H_
+#define VPX_VPX_UTIL_LOONGSON_INTRINSICS_H_
+
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS.  All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ *                Xiwei Gu   <guxiwei-hf@loongson.cn>
+ *                Lu Wang    <wanglu@loongson.cn>
+ *
+ * This file is a header file for loongarch builtin extension.
+ *
+ */
+
+#ifndef LOONGSON_INTRINSICS_H
+#define LOONGSON_INTRINSICS_H
+
+/**
+ * MAJOR version: Macro usage changes.
+ * MINOR version: Add new functions, or bug fixes.
+ * MICRO version: Comment changes or implementation changes.
+ */
+#define LSOM_VERSION_MAJOR 1
+#define LSOM_VERSION_MINOR 2
+#define LSOM_VERSION_MICRO 1
+
+#define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \
+  {                                               \
+    _OUT0 = _INS(_IN0);                           \
+    _OUT1 = _INS(_IN1);                           \
+  }
+
+#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \
+  {                                                           \
+    _OUT0 = _INS(_IN0, _IN1);                                 \
+    _OUT1 = _INS(_IN2, _IN3);                                 \
+  }
+
+#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1) \
+  {                                                                       \
+    _OUT0 = _INS(_IN0, _IN1, _IN2);                                       \
+    _OUT1 = _INS(_IN3, _IN4, _IN5);                                       \
+  }
+
+#define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3) \
+  {                                                                         \
+    DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1);                              \
+    DUP2_ARG1(_INS, _IN2, _IN3, _OUT2, _OUT3);                              \
+  }
+
+#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, \
+                  _OUT1, _OUT2, _OUT3)                                         \
+  {                                                                            \
+    DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1);                     \
+    DUP2_ARG2(_INS, _IN4, _IN5, _IN6, _IN7, _OUT2, _OUT3);                     \
+  }
+
+#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _IN8, \
+                  _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3)             \
+  {                                                                           \
+    DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1);        \
+    DUP2_ARG3(_INS, _IN6, _IN7, _IN8, _IN9, _IN10, _IN11, _OUT2, _OUT3);      \
+  }
+
+#ifdef __loongarch_sx
+#include <lsxintrin.h>
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments   : Inputs  - in_c, in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Signed byte elements from in_h are multiplied by
+ *               signed byte elements from in_l, and then added adjacent to
+ *               each other to get a result twice the size of input. Then
+ *               the results are added to signed half-word elements from in_c.
+ * Example     : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
+ *        in_c : 1,2,3,4, 1,2,3,4
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ *         out : 23,40,41,26, 23,40,41,26
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h,
+                                        __m128i in_l) {
+  __m128i out;
+
+  out = __lsx_vmaddwev_h_b(in_c, in_h, in_l);
+  out = __lsx_vmaddwod_h_b(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments   : Inputs  - in_c, in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Unsigned byte elements from in_h are multiplied by
+ *               unsigned byte elements from in_l, and then added adjacent to
+ *               each other to get a result twice the size of input.
+ *               The results are added to signed half-word elements from in_c.
+ * Example     : out = __lsx_vdp2add_h_bu(in_c, in_h, in_l)
+ *        in_c : 1,2,3,4, 1,2,3,4
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ *         out : 23,40,41,26, 23,40,41,26
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h,
+                                         __m128i in_l) {
+  __m128i out;
+
+  out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l);
+  out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments   : Inputs  - in_c, in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Unsigned byte elements from in_h are multiplied by
+ *               signed byte elements from in_l, and then added adjacent to
+ *               each other to get a result twice the size of input.
+ *               The results are added to signed half-word elements from in_c.
+ * Example     : out = __lsx_vdp2add_h_bu_b(in_c, in_h, in_l)
+ *        in_c : 1,1,1,1, 1,1,1,1
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : -1,-2,-3,-4, -5,-6,-7,-8, 1,2,3,4, 5,6,7,8
+ *         out : -4,-24,-60,-112, 6,26,62,114
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c, __m128i in_h,
+                                           __m128i in_l) {
+  __m128i out;
+
+  out = __lsx_vmaddwev_h_bu_b(in_c, in_h, in_l);
+  out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of half-word vector elements
+ * Arguments   : Inputs  - in_c, in_h, in_l
+ *               Outputs - out
+ *               Return Type - __m128i
+ * Details     : Signed half-word elements from in_h are multiplied by
+ *               signed half-word elements from in_l, and then added adjacent to
+ *               each other to get a result twice the size of input.
+ *               Then the results are added to signed word elements from in_c.
+ * Example     : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
+ *        in_c : 1,2,3,4
+ *        in_h : 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1
+ *         out : 23,40,41,26
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h,
+                                        __m128i in_l) {
+  __m128i out;
+
+  out = __lsx_vmaddwev_w_h(in_c, in_h, in_l);
+  out = __lsx_vmaddwod_w_h(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments   : Inputs  - in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Signed byte elements from in_h are multiplied by
+ *               signed byte elements from in_l, and then added adjacent to
+ *               each other to get a result twice the size of input.
+ * Example     : out = __lsx_vdp2_h_b(in_h, in_l)
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ *         out : 22,38,38,22, 22,38,38,22
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l) {
+  __m128i out;
+
+  out = __lsx_vmulwev_h_b(in_h, in_l);
+  out = __lsx_vmaddwod_h_b(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments   : Inputs  - in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Unsigned byte elements from in_h are multiplied by
+ *               unsigned byte elements from in_l, and then added adjacent to
+ *               each other to get a result twice the size of input.
+ * Example     : out = __lsx_vdp2_h_bu(in_h, in_l)
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ *         out : 22,38,38,22, 22,38,38,22
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l) {
+  __m128i out;
+
+  out = __lsx_vmulwev_h_bu(in_h, in_l);
+  out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments   : Inputs  - in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Unsigned byte elements from in_h are multiplied by
+ *               signed byte elements from in_l, and then added adjacent to
+ *               each other to get a result twice the size of input.
+ * Example     : out = __lsx_vdp2_h_bu_b(in_h, in_l)
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,-1
+ *         out : 22,38,38,22, 22,38,38,6
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l) {
+  __m128i out;
+
+  out = __lsx_vmulwev_h_bu_b(in_h, in_l);
+  out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments   : Inputs  - in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Signed byte elements from in_h are multiplied by
+ *               signed byte elements from in_l, and then added adjacent to
+ *               each other to get a result twice the size of input.
+ * Example     : out = __lsx_vdp2_w_h(in_h, in_l)
+ *        in_h : 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1
+ *         out : 22,38,38,22
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l) {
+  __m128i out;
+
+  out = __lsx_vmulwev_w_h(in_h, in_l);
+  out = __lsx_vmaddwod_w_h(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments   : Inputs  - in_h, in_l
+ *               Outputs - out
+ *               Return Type - double
+ * Details     : Signed byte elements from in_h are multiplied by
+ *               signed byte elements from in_l, and then added adjacent to
+ *               each other to get a result twice the size of input.
+ * Example     : out = __lsx_vdp2_d_w(in_h, in_l)
+ *        in_h : 1,2,3,4
+ *        in_l : 8,7,6,5
+ *         out : 22,38
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_d_w(__m128i in_h, __m128i in_l) {
+  __m128i out;
+
+  out = __lsx_vmulwev_d_w(in_h, in_l);
+  out = __lsx_vmaddwod_d_w(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Clip all halfword elements of input vector between min & max
+ *               out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) :
+ *               (_in))
+ * Arguments   : Inputs  - _in  (input vector)
+ *                       - min  (min threshold)
+ *                       - max  (max threshold)
+ *               Outputs - out  (output vector with clipped elements)
+ *               Return Type - signed halfword
+ * Example     : out = __lsx_vclip_h(_in)
+ *         _in : -8,2,280,249, -8,255,280,249
+ *         min : 1,1,1,1, 1,1,1,1
+ *         max : 9,9,9,9, 9,9,9,9
+ *         out : 1,2,9,9, 1,9,9,9
+ * =============================================================================
+ */
+static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max) {
+  __m128i out;
+
+  out = __lsx_vmax_h(min, _in);
+  out = __lsx_vmin_h(max, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Set each element of vector between 0 and 255
+ * Arguments   : Inputs  - _in
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Signed byte elements from _in are clamped between 0 and 255.
+ * Example     : out = __lsx_vclip255_h(_in)
+ *         _in : -8,255,280,249, -8,255,280,249
+ *         out : 0,255,255,249, 0,255,255,249
+ * =============================================================================
+ */
+static inline __m128i __lsx_vclip255_h(__m128i _in) {
+  __m128i out;
+
+  out = __lsx_vmaxi_h(_in, 0);
+  out = __lsx_vsat_hu(out, 7);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Set each element of vector between 0 and 255
+ * Arguments   : Inputs  - _in
+ *               Outputs - out
+ *               Return Type - word
+ * Details     : Signed byte elements from _in are clamped between 0 and 255.
+ * Example     : out = __lsx_vclip255_w(_in)
+ *         _in : -8,255,280,249
+ *         out : 0,255,255,249
+ * =============================================================================
+ */
+static inline __m128i __lsx_vclip255_w(__m128i _in) {
+  __m128i out;
+
+  out = __lsx_vmaxi_w(_in, 0);
+  out = __lsx_vsat_wu(out, 7);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Swap two variables
+ * Arguments   : Inputs  - _in0, _in1
+ *               Outputs - _in0, _in1 (in-place)
+ * Details     : Swapping of two input variables using xor
+ * Example     : LSX_SWAP(_in0, _in1)
+ *        _in0 : 1,2,3,4
+ *        _in1 : 5,6,7,8
+ *   _in0(out) : 5,6,7,8
+ *   _in1(out) : 1,2,3,4
+ * =============================================================================
+ */
+#define LSX_SWAP(_in0, _in1)         \
+  {                                  \
+    _in0 = __lsx_vxor_v(_in0, _in1); \
+    _in1 = __lsx_vxor_v(_in0, _in1); \
+    _in0 = __lsx_vxor_v(_in0, _in1); \
+  }
+
+/*
+ * =============================================================================
+ * Description : Transpose 4x4 block with word elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1, out2, out3
+ * Details     :
+ * Example     :
+ *               1, 2, 3, 4            1, 5, 9,13
+ *               5, 6, 7, 8    to      2, 6,10,14
+ *               9,10,11,12  =====>    3, 7,11,15
+ *              13,14,15,16            4, 8,12,16
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                            \
+    __m128i _t0, _t1, _t2, _t3;                                                \
+                                                                               \
+    _t0 = __lsx_vilvl_w(_in1, _in0);                                           \
+    _t1 = __lsx_vilvh_w(_in1, _in0);                                           \
+    _t2 = __lsx_vilvl_w(_in3, _in2);                                           \
+    _t3 = __lsx_vilvh_w(_in3, _in2);                                           \
+    _out0 = __lsx_vilvl_d(_t2, _t0);                                           \
+    _out1 = __lsx_vilvh_d(_t2, _t0);                                           \
+    _out2 = __lsx_vilvl_d(_t3, _t1);                                           \
+    _out3 = __lsx_vilvh_d(_t3, _t1);                                           \
+  }
+
+/*
+ * =============================================================================
+ * Description : Transpose 8x8 block with byte elements in vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
+ *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ *               _out7
+ * Details     : The rows of the matrix become columns, and the columns
+ *               become rows.
+ * Example     : LSX_TRANSPOSE8x8_B
+ *        _in0 : 00,01,02,03,04,05,06,07, 00,00,00,00,00,00,00,00
+ *        _in1 : 10,11,12,13,14,15,16,17, 00,00,00,00,00,00,00,00
+ *        _in2 : 20,21,22,23,24,25,26,27, 00,00,00,00,00,00,00,00
+ *        _in3 : 30,31,32,33,34,35,36,37, 00,00,00,00,00,00,00,00
+ *        _in4 : 40,41,42,43,44,45,46,47, 00,00,00,00,00,00,00,00
+ *        _in5 : 50,51,52,53,54,55,56,57, 00,00,00,00,00,00,00,00
+ *        _in6 : 60,61,62,63,64,65,66,67, 00,00,00,00,00,00,00,00
+ *        _in7 : 70,71,72,73,74,75,76,77, 00,00,00,00,00,00,00,00
+ *
+ *      _ out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
+ *      _ out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
+ *      _ out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
+ *      _ out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
+ *      _ out4 : 04,14,24,34,44,54,64,74, 00,00,00,00,00,00,00,00
+ *      _ out5 : 05,15,25,35,45,55,65,75, 00,00,00,00,00,00,00,00
+ *      _ out6 : 06,16,26,36,46,56,66,76, 00,00,00,00,00,00,00,00
+ *      _ out7 : 07,17,27,37,47,57,67,77, 00,00,00,00,00,00,00,00
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                           _out7)                                           \
+  {                                                                         \
+    __m128i zero = { 0 };                                                   \
+    __m128i shuf8 = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };             \
+    __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                         \
+                                                                            \
+    _t0 = __lsx_vilvl_b(_in2, _in0);                                        \
+    _t1 = __lsx_vilvl_b(_in3, _in1);                                        \
+    _t2 = __lsx_vilvl_b(_in6, _in4);                                        \
+    _t3 = __lsx_vilvl_b(_in7, _in5);                                        \
+    _t4 = __lsx_vilvl_b(_t1, _t0);                                          \
+    _t5 = __lsx_vilvh_b(_t1, _t0);                                          \
+    _t6 = __lsx_vilvl_b(_t3, _t2);                                          \
+    _t7 = __lsx_vilvh_b(_t3, _t2);                                          \
+    _out0 = __lsx_vilvl_w(_t6, _t4);                                        \
+    _out2 = __lsx_vilvh_w(_t6, _t4);                                        \
+    _out4 = __lsx_vilvl_w(_t7, _t5);                                        \
+    _out6 = __lsx_vilvh_w(_t7, _t5);                                        \
+    _out1 = __lsx_vshuf_b(zero, _out0, shuf8);                              \
+    _out3 = __lsx_vshuf_b(zero, _out2, shuf8);                              \
+    _out5 = __lsx_vshuf_b(zero, _out4, shuf8);                              \
+    _out7 = __lsx_vshuf_b(zero, _out6, shuf8);                              \
+  }
+
+/*
+ * =============================================================================
+ * Description : Transpose 8x8 block with half-word elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
+ *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ * Details     :
+ * Example     :
+ *              00,01,02,03,04,05,06,07           00,10,20,30,40,50,60,70
+ *              10,11,12,13,14,15,16,17           01,11,21,31,41,51,61,71
+ *              20,21,22,23,24,25,26,27           02,12,22,32,42,52,62,72
+ *              30,31,32,33,34,35,36,37    to     03,13,23,33,43,53,63,73
+ *              40,41,42,43,44,45,46,47  ======>  04,14,24,34,44,54,64,74
+ *              50,51,52,53,54,55,56,57           05,15,25,35,45,55,65,75
+ *              60,61,62,63,64,65,66,67           06,16,26,36,46,56,66,76
+ *              70,71,72,73,74,75,76,77           07,17,27,37,47,57,67,77
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                           _out7)                                           \
+  {                                                                         \
+    __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;               \
+                                                                            \
+    _s0 = __lsx_vilvl_h(_in6, _in4);                                        \
+    _s1 = __lsx_vilvl_h(_in7, _in5);                                        \
+    _t0 = __lsx_vilvl_h(_s1, _s0);                                          \
+    _t1 = __lsx_vilvh_h(_s1, _s0);                                          \
+    _s0 = __lsx_vilvh_h(_in6, _in4);                                        \
+    _s1 = __lsx_vilvh_h(_in7, _in5);                                        \
+    _t2 = __lsx_vilvl_h(_s1, _s0);                                          \
+    _t3 = __lsx_vilvh_h(_s1, _s0);                                          \
+    _s0 = __lsx_vilvl_h(_in2, _in0);                                        \
+    _s1 = __lsx_vilvl_h(_in3, _in1);                                        \
+    _t4 = __lsx_vilvl_h(_s1, _s0);                                          \
+    _t5 = __lsx_vilvh_h(_s1, _s0);                                          \
+    _s0 = __lsx_vilvh_h(_in2, _in0);                                        \
+    _s1 = __lsx_vilvh_h(_in3, _in1);                                        \
+    _t6 = __lsx_vilvl_h(_s1, _s0);                                          \
+    _t7 = __lsx_vilvh_h(_s1, _s0);                                          \
+                                                                            \
+    _out0 = __lsx_vpickev_d(_t0, _t4);                                      \
+    _out2 = __lsx_vpickev_d(_t1, _t5);                                      \
+    _out4 = __lsx_vpickev_d(_t2, _t6);                                      \
+    _out6 = __lsx_vpickev_d(_t3, _t7);                                      \
+    _out1 = __lsx_vpickod_d(_t0, _t4);                                      \
+    _out3 = __lsx_vpickod_d(_t1, _t5);                                      \
+    _out5 = __lsx_vpickod_d(_t2, _t6);                                      \
+    _out7 = __lsx_vpickod_d(_t3, _t7);                                      \
+  }
+
+/*
+ * =============================================================================
+ * Description : Transpose input 8x4 byte block into 4x8
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3      (input 8x4 byte block)
+ *               Outputs - _out0, _out1, _out2, _out3  (output 4x8 byte block)
+ *               Return Type - as per RTYPE
+ * Details     : The rows of the matrix become columns, and the columns become
+ *               rows.
+ * Example     : LSX_TRANSPOSE8x4_B
+ *        _in0 : 00,01,02,03,00,00,00,00, 00,00,00,00,00,00,00,00
+ *        _in1 : 10,11,12,13,00,00,00,00, 00,00,00,00,00,00,00,00
+ *        _in2 : 20,21,22,23,00,00,00,00, 00,00,00,00,00,00,00,00
+ *        _in3 : 30,31,32,33,00,00,00,00, 00,00,00,00,00,00,00,00
+ *        _in4 : 40,41,42,43,00,00,00,00, 00,00,00,00,00,00,00,00
+ *        _in5 : 50,51,52,53,00,00,00,00, 00,00,00,00,00,00,00,00
+ *        _in6 : 60,61,62,63,00,00,00,00, 00,00,00,00,00,00,00,00
+ *        _in7 : 70,71,72,73,00,00,00,00, 00,00,00,00,00,00,00,00
+ *
+ *       _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
+ *       _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
+ *       _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
+ *       _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+                           _out0, _out1, _out2, _out3)                     \
+  {                                                                        \
+    __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                            \
+                                                                           \
+    _tmp0_m = __lsx_vpackev_w(_in4, _in0);                                 \
+    _tmp1_m = __lsx_vpackev_w(_in5, _in1);                                 \
+    _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m);                             \
+    _tmp0_m = __lsx_vpackev_w(_in6, _in2);                                 \
+    _tmp1_m = __lsx_vpackev_w(_in7, _in3);                                 \
+                                                                           \
+    _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m);                             \
+    _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m);                             \
+    _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m);                             \
+                                                                           \
+    _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m);                               \
+    _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m);                               \
+    _out1 = __lsx_vilvh_d(_out2, _out0);                                   \
+    _out3 = __lsx_vilvh_d(_out0, _out2);                                   \
+  }
+
+/*
+ * =============================================================================
+ * Description : Transpose 16x8 block with byte elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, in8
+ *                         in9, in10, in11, in12, in13, in14, in15
+ *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ * Details     :
+ * Example     :
+ *              000,001,002,003,004,005,006,007
+ *              008,009,010,011,012,013,014,015
+ *              016,017,018,019,020,021,022,023
+ *              024,025,026,027,028,029,030,031
+ *              032,033,034,035,036,037,038,039
+ *              040,041,042,043,044,045,046,047        000,008,...,112,120
+ *              048,049,050,051,052,053,054,055        001,009,...,113,121
+ *              056,057,058,059,060,061,062,063   to   002,010,...,114,122
+ *              064,068,066,067,068,069,070,071 =====> 003,011,...,115,123
+ *              072,073,074,075,076,077,078,079        004,012,...,116,124
+ *              080,081,082,083,084,085,086,087        005,013,...,117,125
+ *              088,089,090,091,092,093,094,095        006,014,...,118,126
+ *              096,097,098,099,100,101,102,103        007,015,...,119,127
+ *              104,105,106,107,108,109,110,111
+ *              112,113,114,115,116,117,118,119
+ *              120,121,122,123,124,125,126,127
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                            _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
+                            _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
+                            _out6, _out7)                                    \
+  {                                                                          \
+    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7;          \
+    __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                          \
+    DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5, \
+              _tmp0, _tmp1, _tmp2, _tmp3);                                   \
+    DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15,  \
+              _in13, _tmp4, _tmp5, _tmp6, _tmp7);                            \
+    DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2);          \
+    DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3);          \
+    DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6);          \
+    DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7);          \
+    DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4);              \
+    DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6);              \
+    DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5);              \
+    DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7);              \
+    DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2);      \
+    DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3);      \
+    DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6);      \
+    DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7);      \
+  }
+
+/*
+ * =============================================================================
+ * Description : Butterfly of 4 input vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1, out2, out3
+ * Details     : Butterfly operation
+ * Example     :
+ *               out0 = in0 + in3;
+ *               out1 = in1 + in2;
+ *               out2 = in1 - in2;
+ *               out3 = in0 - in3;
+ * =============================================================================
+ */
+#define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                           \
+    _out0 = __lsx_vadd_b(_in0, _in3);                                         \
+    _out1 = __lsx_vadd_b(_in1, _in2);                                         \
+    _out2 = __lsx_vsub_b(_in1, _in2);                                         \
+    _out3 = __lsx_vsub_b(_in0, _in3);                                         \
+  }
+#define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                           \
+    _out0 = __lsx_vadd_h(_in0, _in3);                                         \
+    _out1 = __lsx_vadd_h(_in1, _in2);                                         \
+    _out2 = __lsx_vsub_h(_in1, _in2);                                         \
+    _out3 = __lsx_vsub_h(_in0, _in3);                                         \
+  }
+#define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                           \
+    _out0 = __lsx_vadd_w(_in0, _in3);                                         \
+    _out1 = __lsx_vadd_w(_in1, _in2);                                         \
+    _out2 = __lsx_vsub_w(_in1, _in2);                                         \
+    _out3 = __lsx_vsub_w(_in0, _in3);                                         \
+  }
+#define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                           \
+    _out0 = __lsx_vadd_d(_in0, _in3);                                         \
+    _out1 = __lsx_vadd_d(_in1, _in2);                                         \
+    _out2 = __lsx_vsub_d(_in1, _in2);                                         \
+    _out3 = __lsx_vsub_d(_in0, _in3);                                         \
+  }
+
+/*
+ * =============================================================================
+ * Description : Butterfly of 8 input vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, ~
+ *               Outputs - _out0, _out1, _out2, _out3, ~
+ * Details     : Butterfly operation
+ * Example     :
+ *              _out0 = _in0 + _in7;
+ *              _out1 = _in1 + _in6;
+ *              _out2 = _in2 + _in5;
+ *              _out3 = _in3 + _in4;
+ *              _out4 = _in3 - _in4;
+ *              _out5 = _in2 - _in5;
+ *              _out6 = _in1 - _in6;
+ *              _out7 = _in0 - _in7;
+ * =============================================================================
+ */
+#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                          _out7)                                           \
+  {                                                                        \
+    _out0 = __lsx_vadd_b(_in0, _in7);                                      \
+    _out1 = __lsx_vadd_b(_in1, _in6);                                      \
+    _out2 = __lsx_vadd_b(_in2, _in5);                                      \
+    _out3 = __lsx_vadd_b(_in3, _in4);                                      \
+    _out4 = __lsx_vsub_b(_in3, _in4);                                      \
+    _out5 = __lsx_vsub_b(_in2, _in5);                                      \
+    _out6 = __lsx_vsub_b(_in1, _in6);                                      \
+    _out7 = __lsx_vsub_b(_in0, _in7);                                      \
+  }
+
+#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                          _out7)                                           \
+  {                                                                        \
+    _out0 = __lsx_vadd_h(_in0, _in7);                                      \
+    _out1 = __lsx_vadd_h(_in1, _in6);                                      \
+    _out2 = __lsx_vadd_h(_in2, _in5);                                      \
+    _out3 = __lsx_vadd_h(_in3, _in4);                                      \
+    _out4 = __lsx_vsub_h(_in3, _in4);                                      \
+    _out5 = __lsx_vsub_h(_in2, _in5);                                      \
+    _out6 = __lsx_vsub_h(_in1, _in6);                                      \
+    _out7 = __lsx_vsub_h(_in0, _in7);                                      \
+  }
+
+#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                          _out7)                                           \
+  {                                                                        \
+    _out0 = __lsx_vadd_w(_in0, _in7);                                      \
+    _out1 = __lsx_vadd_w(_in1, _in6);                                      \
+    _out2 = __lsx_vadd_w(_in2, _in5);                                      \
+    _out3 = __lsx_vadd_w(_in3, _in4);                                      \
+    _out4 = __lsx_vsub_w(_in3, _in4);                                      \
+    _out5 = __lsx_vsub_w(_in2, _in5);                                      \
+    _out6 = __lsx_vsub_w(_in1, _in6);                                      \
+    _out7 = __lsx_vsub_w(_in0, _in7);                                      \
+  }
+
+#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                          _out7)                                           \
+  {                                                                        \
+    _out0 = __lsx_vadd_d(_in0, _in7);                                      \
+    _out1 = __lsx_vadd_d(_in1, _in6);                                      \
+    _out2 = __lsx_vadd_d(_in2, _in5);                                      \
+    _out3 = __lsx_vadd_d(_in3, _in4);                                      \
+    _out4 = __lsx_vsub_d(_in3, _in4);                                      \
+    _out5 = __lsx_vsub_d(_in2, _in5);                                      \
+    _out6 = __lsx_vsub_d(_in1, _in6);                                      \
+    _out7 = __lsx_vsub_d(_in0, _in7);                                      \
+  }
+
+/*
+ * =============================================================================
+ * Description : Butterfly of 16 input vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, ~
+ *               Outputs - _out0, _out1, _out2, _out3, ~
+ * Details     : Butterfly operation
+ * Example     :
+ *              _out0 = _in0 + _in15;
+ *              _out1 = _in1 + _in14;
+ *              _out2 = _in2 + _in13;
+ *              _out3 = _in3 + _in12;
+ *              _out4 = _in4 + _in11;
+ *              _out5 = _in5 + _in10;
+ *              _out6 = _in6 + _in9;
+ *              _out7 = _in7 + _in8;
+ *              _out8 = _in7 - _in8;
+ *              _out9 = _in6 - _in9;
+ *              _out10 = _in5 - _in10;
+ *              _out11 = _in4 - _in11;
+ *              _out12 = _in3 - _in12;
+ *              _out13 = _in2 - _in13;
+ *              _out14 = _in1 - _in14;
+ *              _out15 = _in0 - _in15;
+ * =============================================================================
+ */
+
+#define LSX_BUTTERFLY_16_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,     \
+                           _in8, _in9, _in10, _in11, _in12, _in13, _in14,      \
+                           _in15, _out0, _out1, _out2, _out3, _out4, _out5,    \
+                           _out6, _out7, _out8, _out9, _out10, _out11, _out12, \
+                           _out13, _out14, _out15)                             \
+  {                                                                            \
+    _out0 = __lsx_vadd_b(_in0, _in15);                                         \
+    _out1 = __lsx_vadd_b(_in1, _in14);                                         \
+    _out2 = __lsx_vadd_b(_in2, _in13);                                         \
+    _out3 = __lsx_vadd_b(_in3, _in12);                                         \
+    _out4 = __lsx_vadd_b(_in4, _in11);                                         \
+    _out5 = __lsx_vadd_b(_in5, _in10);                                         \
+    _out6 = __lsx_vadd_b(_in6, _in9);                                          \
+    _out7 = __lsx_vadd_b(_in7, _in8);                                          \
+                                                                               \
+    _out8 = __lsx_vsub_b(_in7, _in8);                                          \
+    _out9 = __lsx_vsub_b(_in6, _in9);                                          \
+    _out10 = __lsx_vsub_b(_in5, _in10);                                        \
+    _out11 = __lsx_vsub_b(_in4, _in11);                                        \
+    _out12 = __lsx_vsub_b(_in3, _in12);                                        \
+    _out13 = __lsx_vsub_b(_in2, _in13);                                        \
+    _out14 = __lsx_vsub_b(_in1, _in14);                                        \
+    _out15 = __lsx_vsub_b(_in0, _in15);                                        \
+  }
+
+#define LSX_BUTTERFLY_16_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,     \
+                           _in8, _in9, _in10, _in11, _in12, _in13, _in14,      \
+                           _in15, _out0, _out1, _out2, _out3, _out4, _out5,    \
+                           _out6, _out7, _out8, _out9, _out10, _out11, _out12, \
+                           _out13, _out14, _out15)                             \
+  {                                                                            \
+    _out0 = __lsx_vadd_h(_in0, _in15);                                         \
+    _out1 = __lsx_vadd_h(_in1, _in14);                                         \
+    _out2 = __lsx_vadd_h(_in2, _in13);                                         \
+    _out3 = __lsx_vadd_h(_in3, _in12);                                         \
+    _out4 = __lsx_vadd_h(_in4, _in11);                                         \
+    _out5 = __lsx_vadd_h(_in5, _in10);                                         \
+    _out6 = __lsx_vadd_h(_in6, _in9);                                          \
+    _out7 = __lsx_vadd_h(_in7, _in8);                                          \
+                                                                               \
+    _out8 = __lsx_vsub_h(_in7, _in8);                                          \
+    _out9 = __lsx_vsub_h(_in6, _in9);                                          \
+    _out10 = __lsx_vsub_h(_in5, _in10);                                        \
+    _out11 = __lsx_vsub_h(_in4, _in11);                                        \
+    _out12 = __lsx_vsub_h(_in3, _in12);                                        \
+    _out13 = __lsx_vsub_h(_in2, _in13);                                        \
+    _out14 = __lsx_vsub_h(_in1, _in14);                                        \
+    _out15 = __lsx_vsub_h(_in0, _in15);                                        \
+  }
+
+#define LSX_BUTTERFLY_16_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,     \
+                           _in8, _in9, _in10, _in11, _in12, _in13, _in14,      \
+                           _in15, _out0, _out1, _out2, _out3, _out4, _out5,    \
+                           _out6, _out7, _out8, _out9, _out10, _out11, _out12, \
+                           _out13, _out14, _out15)                             \
+  {                                                                            \
+    _out0 = __lsx_vadd_w(_in0, _in15);                                         \
+    _out1 = __lsx_vadd_w(_in1, _in14);                                         \
+    _out2 = __lsx_vadd_w(_in2, _in13);                                         \
+    _out3 = __lsx_vadd_w(_in3, _in12);                                         \
+    _out4 = __lsx_vadd_w(_in4, _in11);                                         \
+    _out5 = __lsx_vadd_w(_in5, _in10);                                         \
+    _out6 = __lsx_vadd_w(_in6, _in9);                                          \
+    _out7 = __lsx_vadd_w(_in7, _in8);                                          \
+                                                                               \
+    _out8 = __lsx_vsub_w(_in7, _in8);                                          \
+    _out9 = __lsx_vsub_w(_in6, _in9);                                          \
+    _out10 = __lsx_vsub_w(_in5, _in10);                                        \
+    _out11 = __lsx_vsub_w(_in4, _in11);                                        \
+    _out12 = __lsx_vsub_w(_in3, _in12);                                        \
+    _out13 = __lsx_vsub_w(_in2, _in13);                                        \
+    _out14 = __lsx_vsub_w(_in1, _in14);                                        \
+    _out15 = __lsx_vsub_w(_in0, _in15);                                        \
+  }
+
+#define LSX_BUTTERFLY_16_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,     \
+                           _in8, _in9, _in10, _in11, _in12, _in13, _in14,      \
+                           _in15, _out0, _out1, _out2, _out3, _out4, _out5,    \
+                           _out6, _out7, _out8, _out9, _out10, _out11, _out12, \
+                           _out13, _out14, _out15)                             \
+  {                                                                            \
+    _out0 = __lsx_vadd_d(_in0, _in15);                                         \
+    _out1 = __lsx_vadd_d(_in1, _in14);                                         \
+    _out2 = __lsx_vadd_d(_in2, _in13);                                         \
+    _out3 = __lsx_vadd_d(_in3, _in12);                                         \
+    _out4 = __lsx_vadd_d(_in4, _in11);                                         \
+    _out5 = __lsx_vadd_d(_in5, _in10);                                         \
+    _out6 = __lsx_vadd_d(_in6, _in9);                                          \
+    _out7 = __lsx_vadd_d(_in7, _in8);                                          \
+                                                                               \
+    _out8 = __lsx_vsub_d(_in7, _in8);                                          \
+    _out9 = __lsx_vsub_d(_in6, _in9);                                          \
+    _out10 = __lsx_vsub_d(_in5, _in10);                                        \
+    _out11 = __lsx_vsub_d(_in4, _in11);                                        \
+    _out12 = __lsx_vsub_d(_in3, _in12);                                        \
+    _out13 = __lsx_vsub_d(_in2, _in13);                                        \
+    _out14 = __lsx_vsub_d(_in1, _in14);                                        \
+    _out15 = __lsx_vsub_d(_in0, _in15);                                        \
+  }
+
+#endif  // LSX
+
+#ifdef __loongarch_asx
+#include <lasxintrin.h>
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - signed halfword
+ * Details     : Unsigned byte elements from in_h are multiplied with
+ *               unsigned byte elements from in_l producing a result
+ *               twice the size of input i.e. signed halfword.
+ *               Then these multiplied results of adjacent odd-even elements
+ *               are added to the out vector
+ * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmulwev_h_bu(in_h, in_l);
+  out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - signed halfword
+ * Details     : Signed byte elements from in_h are multiplied with
+ *               signed byte elements from in_l producing a result
+ *               twice the size of input i.e. signed halfword.
+ *               Then these multiplication results of adjacent odd-even elements
+ *               are added to the out vector
+ * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmulwev_h_b(in_h, in_l);
+  out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - signed word
+ * Details     : Signed halfword elements from in_h are multiplied with
+ *               signed halfword elements from in_l producing a result
+ *               twice the size of input i.e. signed word.
+ *               Then these multiplied results of adjacent odd-even elements
+ *               are added to the out vector.
+ * Example     : out = __lasx_xvdp2_w_h(in_h, in_l)
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ *         out : 22,38,38,22, 22,38,38,22
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmulwev_w_h(in_h, in_l);
+  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of word vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - signed double
+ * Details     : Signed word elements from in_h are multiplied with
+ *               signed word elements from in_l producing a result
+ *               twice the size of input i.e. signed double-word.
+ *               Then these multiplied results of adjacent odd-even elements
+ *               are added to the out vector.
+ * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmulwev_d_w(in_h, in_l);
+  out = __lasx_xvmaddwod_d_w(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - signed word
+ * Details     : Unsigned halfword elements from in_h are multiplied with
+ *               signed halfword elements from in_l producing a result
+ *               twice the size of input i.e. unsigned word.
+ *               Multiplication result of adjacent odd-even elements
+ *               are added to the out vector
+ * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmulwev_w_hu_h(in_h, in_l);
+  out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - halfword
+ * Details     : Signed byte elements from in_h are multiplied with
+ *               signed byte elements from in_l producing a result
+ *               twice the size of input i.e. signed halfword.
+ *               Then these multiplied results of adjacent odd-even elements
+ *               are added to the in_c vector.
+ * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, __m256i in_h,
+                                          __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l);
+  out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - halfword
+ * Details     : Unsigned byte elements from in_h are multiplied with
+ *               unsigned byte elements from in_l producing a result
+ *               twice the size of input i.e. signed halfword.
+ *               Then these multiplied results of adjacent odd-even elements
+ *               are added to the in_c vector.
+ * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c, __m256i in_h,
+                                           __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmaddwev_h_bu(in_c, in_h, in_l);
+  out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - halfword
+ * Details     : Unsigned byte elements from in_h are multiplied with
+ *               signed byte elements from in_l producing a result
+ *               twice the size of input i.e. signed halfword.
+ *               Then these multiplied results of adjacent odd-even elements
+ *               are added to the in_c vector.
+ * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c, __m256i in_h,
+                                             __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmaddwev_h_bu_b(in_c, in_h, in_l);
+  out = __lasx_xvmaddwod_h_bu_b(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ *               Return Type - per RTYPE
+ * Details     : Signed halfword elements from in_h are multiplied with
+ *               signed halfword elements from in_l producing a result
+ *               twice the size of input i.e. signed word.
+ *               Multiplication result of adjacent odd-even elements
+ *               are added to the in_c vector.
+ * Example     : out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ *        in_c : 1,2,3,4, 1,2,3,4
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8,
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1,
+ *         out : 23,40,41,26, 23,40,41,26
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h,
+                                          __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l);
+  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ *               Return Type - signed word
+ * Details     : Unsigned halfword elements from in_h are multiplied with
+ *               unsigned halfword elements from in_l producing a result
+ *               twice the size of input i.e. signed word.
+ *               Multiplication result of adjacent odd-even elements
+ *               are added to the in_c vector.
+ * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h,
+                                           __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l);
+  out = __lasx_xvmaddwod_w_hu(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ *               Return Type - signed word
+ * Details     : Unsigned halfword elements from in_h are multiplied with
+ *               signed halfword elements from in_l producing a result
+ *               twice the size of input i.e. signed word.
+ *               Multiplication result of adjacent odd-even elements
+ *               are added to the in_c vector
+ * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h,
+                                             __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l);
+  out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Vector Unsigned Dot Product and Subtract
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ *               Return Type - signed halfword
+ * Details     : Unsigned byte elements from in_h are multiplied with
+ *               unsigned byte elements from in_l producing a result
+ *               twice the size of input i.e. signed halfword.
+ *               Multiplication result of adjacent odd-even elements
+ *               are added together and subtracted from double width elements
+ *               in_c vector.
+ * Example     : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h,
+                                           __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmulwev_h_bu(in_h, in_l);
+  out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
+  out = __lasx_xvsub_h(in_c, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Vector Signed Dot Product and Subtract
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ *               Return Type - signed word
+ * Details     : Signed halfword elements from in_h are multiplied with
+ *               Signed halfword elements from in_l producing a result
+ *               twice the size of input i.e. signed word.
+ *               Multiplication result of adjacent odd-even elements
+ *               are added together and subtracted from double width elements
+ *               in_c vector.
+ * Example     : out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
+ *        in_c : 0,0,0,0, 0,0,0,0
+ *        in_h : 3,1,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1
+ *        in_l : 2,1,1,0, 1,0,0,0, 0,0,1,0, 1,0,0,1
+ *         out : -7,-3,0,0, 0,-1,0,-1
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h,
+                                          __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmulwev_w_h(in_h, in_l);
+  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+  out = __lasx_xvsub_w(in_c, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - signed word
+ * Details     : Signed halfword elements from in_h are multiplied with
+ *               signed halfword elements from in_l producing a result
+ *               four times the size of input i.e. signed doubleword.
+ *               Then these multiplication results of four adjacent elements
+ *               are added together and stored to the out vector.
+ * Example     : out = __lasx_xvdp4_d_h(in_h, in_l)
+ *        in_h :  3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1
+ *        in_l : -2,1,1,0, 1,0,0,0, 0,0,1, 0, 1,0,0,1
+ *         out : -2,0,1,1
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmulwev_w_h(in_h, in_l);
+  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+  out = __lasx_xvhaddw_d_w(out, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The high half of the vector elements are expanded and
+ *               added after being doubled.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are added after the
+ *               higher half of the two-fold sign extension (signed byte
+ *               to signed halfword) and stored to the out vector.
+ * Example     : See out = __lasx_xvaddwh_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvilvh_b(in_h, in_l);
+  out = __lasx_xvhaddw_h_b(out, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The high half of the vector elements are expanded and
+ *               added after being doubled.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are added after the
+ *               higher half of the two-fold sign extension (signed halfword
+ *               to signed word) and stored to the out vector.
+ * Example     : out = __lasx_xvaddwh_w_h(in_h, in_l)
+ *        in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
+ *        in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
+ *         out : 1,0,0,-1, 1,0,0, 2
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvilvh_h(in_h, in_l);
+  out = __lasx_xvhaddw_w_h(out, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low half of the vector elements are expanded and
+ *               added after being doubled.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are added after the
+ *               lower half of the two-fold sign extension (signed byte
+ *               to signed halfword) and stored to the out vector.
+ * Example     : See out = __lasx_xvaddwl_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvilvl_b(in_h, in_l);
+  out = __lasx_xvhaddw_h_b(out, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low half of the vector elements are expanded and
+ *               added after being doubled.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are added after the
+ *               lower half of the two-fold sign extension (signed halfword
+ *               to signed word) and stored to the out vector.
+ * Example     : out = __lasx_xvaddwl_w_h(in_h, in_l)
+ *        in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
+ *        in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
+ *         out : 5,-1,4,2, 1,0,2,-1
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvilvl_h(in_h, in_l);
+  out = __lasx_xvhaddw_w_h(out, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low half of the vector elements are expanded and
+ *               added after being doubled.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The out vector and the out vector are added after the
+ *               lower half of the two-fold zero extension (unsigned byte
+ *               to unsigned halfword) and stored to the out vector.
+ * Example     : See out = __lasx_xvaddwl_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvilvl_b(in_h, in_l);
+  out = __lasx_xvhaddw_hu_bu(out, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low half of the vector elements are expanded and
+ *               added after being doubled.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_l vector after double zero extension (unsigned byte to
+ *               signed halfword)，added to the in_h vector.
+ * Example     : See out = __lasx_xvaddw_w_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvsllwil_hu_bu(in_l, 0);
+  out = __lasx_xvadd_h(in_h, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low half of the vector elements are expanded and
+ *               added after being doubled.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_l vector after double sign extension (signed halfword to
+ *               signed word), added to the in_h vector.
+ * Example     : out = __lasx_xvaddw_w_w_h(in_h, in_l)
+ *        in_h : 0, 1,0,0, -1,0,0,1,
+ *        in_l : 2,-1,1,2,  1,0,0,0, 0,0,1,0, 1,0,0,1,
+ *         out : 2, 0,1,2, -1,0,1,1,
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvsllwil_w_h(in_l, 0);
+  out = __lasx_xvadd_w(in_h, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Multiplication and addition calculation after expansion
+ *               of the lower half of the vector.
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are multiplied after
+ *               the lower half of the two-fold sign extension (signed halfword
+ *               to signed word), and the result is added to the vector in_c,
+ *               then stored to the out vector.
+ * Example     : out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
+ *        in_c : 1,2,3,4, 5,6,7,8
+ *        in_h : 1,2,3,4, 1,2,3,4, 5,6,7,8, 5,6,7,8
+ *        in_l : 200, 300, 400, 500,  2000, 3000, 4000, 5000,
+ *              -200,-300,-400,-500, -2000,-3000,-4000,-5000
+ *         out : 201, 602,1203,2004, -995, -1794,-2793,-3992
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h,
+                                          __m256i in_l) {
+  __m256i tmp0, tmp1, out;
+
+  tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
+  tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
+  tmp0 = __lasx_xvmul_w(tmp0, tmp1);
+  out = __lasx_xvadd_w(tmp0, in_c);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Multiplication and addition calculation after expansion
+ *               of the higher half of the vector.
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are multiplied after
+ *               the higher half of the two-fold sign extension (signed
+ *               halfword to signed word), and the result is added to
+ *               the vector in_c, then stored to the out vector.
+ * Example     : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h,
+                                          __m256i in_l) {
+  __m256i tmp0, tmp1, out;
+
+  tmp0 = __lasx_xvilvh_h(in_h, in_h);
+  tmp1 = __lasx_xvilvh_h(in_l, in_l);
+  tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1);
+  out = __lasx_xvadd_w(tmp0, in_c);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Multiplication calculation after expansion of the lower
+ *               half of the vector.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are multiplied after
+ *               the lower half of the two-fold sign extension (signed
+ *               halfword to signed word), then stored to the out vector.
+ * Example     : out = __lasx_xvmulwl_w_h(in_h, in_l)
+ *        in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
+ *        in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
+ *         out : 6,1,3,0, 0,0,1,0
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l) {
+  __m256i tmp0, tmp1, out;
+
+  tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
+  tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
+  out = __lasx_xvmul_w(tmp0, tmp1);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Multiplication calculation after expansion of the lower
+ *               half of the vector.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are multiplied after
+ *               the lower half of the two-fold sign extension (signed
+ *               halfword to signed word), then stored to the out vector.
+ * Example     : out = __lasx_xvmulwh_w_h(in_h, in_l)
+ *        in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
+ *        in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
+ *         out : 0,0,0,0, 0,0,0,1
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l) {
+  __m256i tmp0, tmp1, out;
+
+  tmp0 = __lasx_xvilvh_h(in_h, in_h);
+  tmp1 = __lasx_xvilvh_h(in_l, in_l);
+  out = __lasx_xvmulwev_w_h(tmp0, tmp1);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low half of the vector elements are added to the high half
+ *               after being doubled, then saturated.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector adds the in_l vector after the lower half of
+ *               the two-fold zero extension (unsigned byte to unsigned
+ *               halfword) and then saturated. The results are stored to the out
+ *               vector.
+ * Example     : out = __lasx_xvsaddw_hu_hu_bu(in_h, in_l)
+ *        in_h : 2,65532,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1
+ *        in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1,
+ *               0,0,0,1
+ *        out  : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2,
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l) {
+  __m256i tmp1, out;
+  __m256i zero = { 0 };
+
+  tmp1 = __lasx_xvilvl_b(zero, in_l);
+  out = __lasx_xvsadd_hu(in_h, tmp1);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Clip all halfword elements of input vector between min & max
+ *               out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
+ * Arguments   : Inputs  - in    (input vector)
+ *                       - min   (min threshold)
+ *                       - max   (max threshold)
+ *               Outputs - in    (output vector with clipped elements)
+ *               Return Type - signed halfword
+ * Example     : out = __lasx_xvclip_h(in, min, max)
+ *          in : -8,2,280,249, -8,255,280,249, 4,4,4,4, 5,5,5,5
+ *         min : 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1
+ *         max : 9,9,9,9, 9,9,9,9, 9,9,9,9, 9,9,9,9
+ *         out : 1,2,9,9, 1,9,9,9, 4,4,4,4, 5,5,5,5
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max) {
+  __m256i out;
+
+  out = __lasx_xvmax_h(min, in);
+  out = __lasx_xvmin_h(max, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Clip all signed halfword elements of input vector
+ *               between 0 & 255
+ * Arguments   : Inputs  - in   (input vector)
+ *               Outputs - out  (output vector with clipped elements)
+ *               Return Type - signed halfword
+ * Example     : See out = __lasx_xvclip255_w(in)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvclip255_h(__m256i in) {
+  __m256i out;
+
+  out = __lasx_xvmaxi_h(in, 0);
+  out = __lasx_xvsat_hu(out, 7);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Clip all signed word elements of input vector
+ *               between 0 & 255
+ * Arguments   : Inputs - in   (input vector)
+ *               Output - out  (output vector with clipped elements)
+ *               Return Type - signed word
+ * Example     : out = __lasx_xvclip255_w(in)
+ *          in : -8,255,280,249, -8,255,280,249
+ *         out :  0,255,255,249,  0,255,255,249
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvclip255_w(__m256i in) {
+  __m256i out;
+
+  out = __lasx_xvmaxi_w(in, 0);
+  out = __lasx_xvsat_wu(out, 7);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Indexed halfword element values are replicated to all
+ *               elements in output vector. If 'idx < 8' use xvsplati_l_*,
+ *               if 'idx >= 8' use xvsplati_h_*.
+ * Arguments   : Inputs - in, idx
+ *               Output - out
+ * Details     : Idx element value from in vector is replicated to all
+ *               elements in out vector.
+ *               Valid index range for halfword operation is 0-7
+ * Example     : out = __lasx_xvsplati_l_h(in, idx)
+ *          in : 20,10,11,12, 13,14,15,16, 0,0,2,0, 0,0,0,0
+ *         idx : 0x02
+ *         out : 11,11,11,11, 11,11,11,11, 11,11,11,11, 11,11,11,11
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx) {
+  __m256i out;
+
+  out = __lasx_xvpermi_q(in, in, 0x02);
+  out = __lasx_xvreplve_h(out, idx);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Indexed halfword element values are replicated to all
+ *               elements in output vector. If 'idx < 8' use xvsplati_l_*,
+ *               if 'idx >= 8' use xvsplati_h_*.
+ * Arguments   : Inputs - in, idx
+ *               Output - out
+ * Details     : Idx element value from in vector is replicated to all
+ *               elements in out vector.
+ *               Valid index range for halfword operation is 0-7
+ * Example     : out = __lasx_xvsplati_h_h(in, idx)
+ *          in : 20,10,11,12, 13,14,15,16, 0,2,0,0, 0,0,0,0
+ *         idx : 0x09
+ *         out : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) {
+  __m256i out;
+
+  out = __lasx_xvpermi_q(in, in, 0x13);
+  out = __lasx_xvreplve_h(out, idx);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Transpose 4x4 block with double-word elements in vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3
+ *               Outputs - _out0, _out1, _out2, _out3
+ * Example     : LASX_TRANSPOSE4x4_D
+ *        _in0 : 1,2,3,4
+ *        _in1 : 1,2,3,4
+ *        _in2 : 1,2,3,4
+ *        _in3 : 1,2,3,4
+ *
+ *       _out0 : 1,1,1,1
+ *       _out1 : 2,2,2,2
+ *       _out2 : 3,3,3,3
+ *       _out3 : 4,4,4,4
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
+                            _out3)                                       \
+  {                                                                      \
+    __m256i _tmp0, _tmp1, _tmp2, _tmp3;                                  \
+    _tmp0 = __lasx_xvilvl_d(_in1, _in0);                                 \
+    _tmp1 = __lasx_xvilvh_d(_in1, _in0);                                 \
+    _tmp2 = __lasx_xvilvl_d(_in3, _in2);                                 \
+    _tmp3 = __lasx_xvilvh_d(_in3, _in2);                                 \
+    _out0 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x20);                        \
+    _out2 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x31);                        \
+    _out1 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x20);                        \
+    _out3 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x31);                        \
+  }
+
+/*
+ * =============================================================================
+ * Description : Transpose 8x8 block with word elements in vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
+ *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ *               _out7
+ * Example     : LASX_TRANSPOSE8x8_W
+ *        _in0 : 1,2,3,4,5,6,7,8
+ *        _in1 : 2,2,3,4,5,6,7,8
+ *        _in2 : 3,2,3,4,5,6,7,8
+ *        _in3 : 4,2,3,4,5,6,7,8
+ *        _in4 : 5,2,3,4,5,6,7,8
+ *        _in5 : 6,2,3,4,5,6,7,8
+ *        _in6 : 7,2,3,4,5,6,7,8
+ *        _in7 : 8,2,3,4,5,6,7,8
+ *
+ *       _out0 : 1,2,3,4,5,6,7,8
+ *       _out1 : 2,2,2,2,2,2,2,2
+ *       _out2 : 3,3,3,3,3,3,3,3
+ *       _out3 : 4,4,4,4,4,4,4,4
+ *       _out4 : 5,5,5,5,5,5,5,5
+ *       _out5 : 6,6,6,6,6,6,6,6
+ *       _out6 : 7,7,7,7,7,7,7,7
+ *       _out7 : 8,8,8,8,8,8,8,8
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                            _out7)                                           \
+  {                                                                          \
+    __m256i _s0_m, _s1_m;                                                    \
+    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                              \
+    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                              \
+                                                                             \
+    _s0_m = __lasx_xvilvl_w(_in2, _in0);                                     \
+    _s1_m = __lasx_xvilvl_w(_in3, _in1);                                     \
+    _tmp0_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
+    _tmp1_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
+    _s0_m = __lasx_xvilvh_w(_in2, _in0);                                     \
+    _s1_m = __lasx_xvilvh_w(_in3, _in1);                                     \
+    _tmp2_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
+    _tmp3_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
+    _s0_m = __lasx_xvilvl_w(_in6, _in4);                                     \
+    _s1_m = __lasx_xvilvl_w(_in7, _in5);                                     \
+    _tmp4_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
+    _tmp5_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
+    _s0_m = __lasx_xvilvh_w(_in6, _in4);                                     \
+    _s1_m = __lasx_xvilvh_w(_in7, _in5);                                     \
+    _tmp6_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
+    _tmp7_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
+    _out0 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x20);                        \
+    _out1 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x20);                        \
+    _out2 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x20);                        \
+    _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x20);                        \
+    _out4 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x31);                        \
+    _out5 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x31);                        \
+    _out6 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x31);                        \
+    _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x31);                        \
+  }
+
+/*
+ * =============================================================================
+ * Description : Transpose input 16x8 byte block
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
+ *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
+ *                         (input 16x8 byte block)
+ *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ *                         _out7 (output 8x16 byte block)
+ * Details     : The rows of the matrix become columns, and the columns become
+ *               rows.
+ * Example     : See LASX_TRANSPOSE16x8_H
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                             _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
+                             _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
+                             _out6, _out7)                                    \
+  {                                                                           \
+    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                               \
+    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                               \
+                                                                              \
+    _tmp0_m = __lasx_xvilvl_b(_in2, _in0);                                    \
+    _tmp1_m = __lasx_xvilvl_b(_in3, _in1);                                    \
+    _tmp2_m = __lasx_xvilvl_b(_in6, _in4);                                    \
+    _tmp3_m = __lasx_xvilvl_b(_in7, _in5);                                    \
+    _tmp4_m = __lasx_xvilvl_b(_in10, _in8);                                   \
+    _tmp5_m = __lasx_xvilvl_b(_in11, _in9);                                   \
+    _tmp6_m = __lasx_xvilvl_b(_in14, _in12);                                  \
+    _tmp7_m = __lasx_xvilvl_b(_in15, _in13);                                  \
+    _out0 = __lasx_xvilvl_b(_tmp1_m, _tmp0_m);                                \
+    _out1 = __lasx_xvilvh_b(_tmp1_m, _tmp0_m);                                \
+    _out2 = __lasx_xvilvl_b(_tmp3_m, _tmp2_m);                                \
+    _out3 = __lasx_xvilvh_b(_tmp3_m, _tmp2_m);                                \
+    _out4 = __lasx_xvilvl_b(_tmp5_m, _tmp4_m);                                \
+    _out5 = __lasx_xvilvh_b(_tmp5_m, _tmp4_m);                                \
+    _out6 = __lasx_xvilvl_b(_tmp7_m, _tmp6_m);                                \
+    _out7 = __lasx_xvilvh_b(_tmp7_m, _tmp6_m);                                \
+    _tmp0_m = __lasx_xvilvl_w(_out2, _out0);                                  \
+    _tmp2_m = __lasx_xvilvh_w(_out2, _out0);                                  \
+    _tmp4_m = __lasx_xvilvl_w(_out3, _out1);                                  \
+    _tmp6_m = __lasx_xvilvh_w(_out3, _out1);                                  \
+    _tmp1_m = __lasx_xvilvl_w(_out6, _out4);                                  \
+    _tmp3_m = __lasx_xvilvh_w(_out6, _out4);                                  \
+    _tmp5_m = __lasx_xvilvl_w(_out7, _out5);                                  \
+    _tmp7_m = __lasx_xvilvh_w(_out7, _out5);                                  \
+    _out0 = __lasx_xvilvl_d(_tmp1_m, _tmp0_m);                                \
+    _out1 = __lasx_xvilvh_d(_tmp1_m, _tmp0_m);                                \
+    _out2 = __lasx_xvilvl_d(_tmp3_m, _tmp2_m);                                \
+    _out3 = __lasx_xvilvh_d(_tmp3_m, _tmp2_m);                                \
+    _out4 = __lasx_xvilvl_d(_tmp5_m, _tmp4_m);                                \
+    _out5 = __lasx_xvilvh_d(_tmp5_m, _tmp4_m);                                \
+    _out6 = __lasx_xvilvl_d(_tmp7_m, _tmp6_m);                                \
+    _out7 = __lasx_xvilvh_d(_tmp7_m, _tmp6_m);                                \
+  }
+
+/*
+ * =============================================================================
+ * Description : Transpose input 16x8 byte block
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
+ *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
+ *                         (input 16x8 byte block)
+ *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ *                         _out7 (output 8x16 byte block)
+ * Details     : The rows of the matrix become columns, and the columns become
+ *               rows.
+ * Example     : LASX_TRANSPOSE16x8_H
+ *        _in0 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in1 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in2 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in3 : 4,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in4 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in5 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in6 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in7 : 8,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in8 : 9,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in9 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *       _in10 : 0,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *       _in11 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *       _in12 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *       _in13 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *       _in14 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *       _in15 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *
+ *       _out0 : 1,2,3,4,5,6,7,8,9,1,0,2,3,7,5,6
+ *       _out1 : 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
+ *       _out2 : 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
+ *       _out3 : 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
+ *       _out4 : 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
+ *       _out5 : 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
+ *       _out6 : 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
+ *       _out7 : 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                             _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
+                             _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
+                             _out6, _out7)                                    \
+  {                                                                           \
+    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                               \
+    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                               \
+    __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                           \
+                                                                              \
+    _tmp0_m = __lasx_xvilvl_h(_in2, _in0);                                    \
+    _tmp1_m = __lasx_xvilvl_h(_in3, _in1);                                    \
+    _tmp2_m = __lasx_xvilvl_h(_in6, _in4);                                    \
+    _tmp3_m = __lasx_xvilvl_h(_in7, _in5);                                    \
+    _tmp4_m = __lasx_xvilvl_h(_in10, _in8);                                   \
+    _tmp5_m = __lasx_xvilvl_h(_in11, _in9);                                   \
+    _tmp6_m = __lasx_xvilvl_h(_in14, _in12);                                  \
+    _tmp7_m = __lasx_xvilvl_h(_in15, _in13);                                  \
+    _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m);                                  \
+    _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m);                                  \
+    _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m);                                  \
+    _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m);                                  \
+    _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m);                                  \
+    _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m);                                  \
+    _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m);                                  \
+    _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m);                                  \
+    _tmp0_m = __lasx_xvilvl_d(_t2, _t0);                                      \
+    _tmp2_m = __lasx_xvilvh_d(_t2, _t0);                                      \
+    _tmp4_m = __lasx_xvilvl_d(_t3, _t1);                                      \
+    _tmp6_m = __lasx_xvilvh_d(_t3, _t1);                                      \
+    _tmp1_m = __lasx_xvilvl_d(_t6, _t4);                                      \
+    _tmp3_m = __lasx_xvilvh_d(_t6, _t4);                                      \
+    _tmp5_m = __lasx_xvilvl_d(_t7, _t5);                                      \
+    _tmp7_m = __lasx_xvilvh_d(_t7, _t5);                                      \
+    _out0 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20);                         \
+    _out1 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20);                         \
+    _out2 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20);                         \
+    _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20);                         \
+                                                                              \
+    _tmp0_m = __lasx_xvilvh_h(_in2, _in0);                                    \
+    _tmp1_m = __lasx_xvilvh_h(_in3, _in1);                                    \
+    _tmp2_m = __lasx_xvilvh_h(_in6, _in4);                                    \
+    _tmp3_m = __lasx_xvilvh_h(_in7, _in5);                                    \
+    _tmp4_m = __lasx_xvilvh_h(_in10, _in8);                                   \
+    _tmp5_m = __lasx_xvilvh_h(_in11, _in9);                                   \
+    _tmp6_m = __lasx_xvilvh_h(_in14, _in12);                                  \
+    _tmp7_m = __lasx_xvilvh_h(_in15, _in13);                                  \
+    _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m);                                  \
+    _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m);                                  \
+    _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m);                                  \
+    _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m);                                  \
+    _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m);                                  \
+    _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m);                                  \
+    _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m);                                  \
+    _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m);                                  \
+    _tmp0_m = __lasx_xvilvl_d(_t2, _t0);                                      \
+    _tmp2_m = __lasx_xvilvh_d(_t2, _t0);                                      \
+    _tmp4_m = __lasx_xvilvl_d(_t3, _t1);                                      \
+    _tmp6_m = __lasx_xvilvh_d(_t3, _t1);                                      \
+    _tmp1_m = __lasx_xvilvl_d(_t6, _t4);                                      \
+    _tmp3_m = __lasx_xvilvh_d(_t6, _t4);                                      \
+    _tmp5_m = __lasx_xvilvl_d(_t7, _t5);                                      \
+    _tmp7_m = __lasx_xvilvh_d(_t7, _t5);                                      \
+    _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20);                         \
+    _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20);                         \
+    _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20);                         \
+    _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20);                         \
+  }
+
+/*
+ * =============================================================================
+ * Description : Transpose 4x4 block with halfword elements in vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3
+ *               Outputs - _out0, _out1, _out2, _out3
+ *               Return Type - signed halfword
+ * Details     : The rows of the matrix become columns, and the columns become
+ *               rows.
+ * Example     : See LASX_TRANSPOSE8x8_H
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
+                            _out3)                                       \
+  {                                                                      \
+    __m256i _s0_m, _s1_m;                                                \
+                                                                         \
+    _s0_m = __lasx_xvilvl_h(_in1, _in0);                                 \
+    _s1_m = __lasx_xvilvl_h(_in3, _in2);                                 \
+    _out0 = __lasx_xvilvl_w(_s1_m, _s0_m);                               \
+    _out2 = __lasx_xvilvh_w(_s1_m, _s0_m);                               \
+    _out1 = __lasx_xvilvh_d(_out0, _out0);                               \
+    _out3 = __lasx_xvilvh_d(_out2, _out2);                               \
+  }
+
+/*
+ * =============================================================================
+ * Description : Transpose input 8x8 byte block
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
+ *                         (input 8x8 byte block)
+ *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ *                         _out7 (output 8x8 byte block)
+ * Example     : See LASX_TRANSPOSE8x8_H
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                            _out7)                                           \
+  {                                                                          \
+    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                              \
+    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                              \
+    _tmp0_m = __lasx_xvilvl_b(_in2, _in0);                                   \
+    _tmp1_m = __lasx_xvilvl_b(_in3, _in1);                                   \
+    _tmp2_m = __lasx_xvilvl_b(_in6, _in4);                                   \
+    _tmp3_m = __lasx_xvilvl_b(_in7, _in5);                                   \
+    _tmp4_m = __lasx_xvilvl_b(_tmp1_m, _tmp0_m);                             \
+    _tmp5_m = __lasx_xvilvh_b(_tmp1_m, _tmp0_m);                             \
+    _tmp6_m = __lasx_xvilvl_b(_tmp3_m, _tmp2_m);                             \
+    _tmp7_m = __lasx_xvilvh_b(_tmp3_m, _tmp2_m);                             \
+    _out0 = __lasx_xvilvl_w(_tmp6_m, _tmp4_m);                               \
+    _out2 = __lasx_xvilvh_w(_tmp6_m, _tmp4_m);                               \
+    _out4 = __lasx_xvilvl_w(_tmp7_m, _tmp5_m);                               \
+    _out6 = __lasx_xvilvh_w(_tmp7_m, _tmp5_m);                               \
+    _out1 = __lasx_xvbsrl_v(_out0, 8);                                       \
+    _out3 = __lasx_xvbsrl_v(_out2, 8);                                       \
+    _out5 = __lasx_xvbsrl_v(_out4, 8);                                       \
+    _out7 = __lasx_xvbsrl_v(_out6, 8);                                       \
+  }
+
+/*
+ * =============================================================================
+ * Description : Transpose 8x8 block with halfword elements in vectors.
+ * Arguments   : Inputs  - _in0, _in1, ~
+ *               Outputs - _out0, _out1, ~
+ * Details     : The rows of the matrix become columns, and the columns become
+ *               rows.
+ * Example     : LASX_TRANSPOSE8x8_H
+ *        _in0 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        _in1 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
+ *        _in2 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
+ *        _in3 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        _in4 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
+ *        _in5 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        _in6 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        _in7 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
+ *
+ *       _out0 : 1,8,8,1, 9,1,1,9, 1,8,8,1, 9,1,1,9
+ *       _out1 : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
+ *       _out2 : 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3
+ *       _out3 : 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4
+ *       _out4 : 5,5,5,5, 5,5,5,5, 5,5,5,5, 5,5,5,5
+ *       _out5 : 6,6,6,6, 6,6,6,6, 6,6,6,6, 6,6,6,6
+ *       _out6 : 7,7,7,7, 7,7,7,7, 7,7,7,7, 7,7,7,7
+ *       _out7 : 8,8,8,8, 8,8,8,8, 8,8,8,8, 8,8,8,8
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                            _out7)                                           \
+  {                                                                          \
+    __m256i _s0_m, _s1_m;                                                    \
+    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                              \
+    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                              \
+                                                                             \
+    _s0_m = __lasx_xvilvl_h(_in6, _in4);                                     \
+    _s1_m = __lasx_xvilvl_h(_in7, _in5);                                     \
+    _tmp0_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                 \
+    _tmp1_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                 \
+    _s0_m = __lasx_xvilvh_h(_in6, _in4);                                     \
+    _s1_m = __lasx_xvilvh_h(_in7, _in5);                                     \
+    _tmp2_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                 \
+    _tmp3_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                 \
+                                                                             \
+    _s0_m = __lasx_xvilvl_h(_in2, _in0);                                     \
+    _s1_m = __lasx_xvilvl_h(_in3, _in1);                                     \
+    _tmp4_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                 \
+    _tmp5_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                 \
+    _s0_m = __lasx_xvilvh_h(_in2, _in0);                                     \
+    _s1_m = __lasx_xvilvh_h(_in3, _in1);                                     \
+    _tmp6_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                 \
+    _tmp7_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                 \
+                                                                             \
+    _out0 = __lasx_xvpickev_d(_tmp0_m, _tmp4_m);                             \
+    _out2 = __lasx_xvpickev_d(_tmp1_m, _tmp5_m);                             \
+    _out4 = __lasx_xvpickev_d(_tmp2_m, _tmp6_m);                             \
+    _out6 = __lasx_xvpickev_d(_tmp3_m, _tmp7_m);                             \
+    _out1 = __lasx_xvpickod_d(_tmp0_m, _tmp4_m);                             \
+    _out3 = __lasx_xvpickod_d(_tmp1_m, _tmp5_m);                             \
+    _out5 = __lasx_xvpickod_d(_tmp2_m, _tmp6_m);                             \
+    _out7 = __lasx_xvpickod_d(_tmp3_m, _tmp7_m);                             \
+  }
+
+/*
+ * =============================================================================
+ * Description : Butterfly of 4 input vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3
+ *               Outputs - _out0, _out1, _out2, _out3
+ * Details     : Butterfly operation
+ * Example     : LASX_BUTTERFLY_4
+ *               _out0 = _in0 + _in3;
+ *               _out1 = _in1 + _in2;
+ *               _out2 = _in1 - _in2;
+ *               _out3 = _in0 - _in3;
+ * =============================================================================
+ */
+#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                            \
+    _out0 = __lasx_xvadd_b(_in0, _in3);                                        \
+    _out1 = __lasx_xvadd_b(_in1, _in2);                                        \
+    _out2 = __lasx_xvsub_b(_in1, _in2);                                        \
+    _out3 = __lasx_xvsub_b(_in0, _in3);                                        \
+  }
+#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                            \
+    _out0 = __lasx_xvadd_h(_in0, _in3);                                        \
+    _out1 = __lasx_xvadd_h(_in1, _in2);                                        \
+    _out2 = __lasx_xvsub_h(_in1, _in2);                                        \
+    _out3 = __lasx_xvsub_h(_in0, _in3);                                        \
+  }
+#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                            \
+    _out0 = __lasx_xvadd_w(_in0, _in3);                                        \
+    _out1 = __lasx_xvadd_w(_in1, _in2);                                        \
+    _out2 = __lasx_xvsub_w(_in1, _in2);                                        \
+    _out3 = __lasx_xvsub_w(_in0, _in3);                                        \
+  }
+#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                            \
+    _out0 = __lasx_xvadd_d(_in0, _in3);                                        \
+    _out1 = __lasx_xvadd_d(_in1, _in2);                                        \
+    _out2 = __lasx_xvsub_d(_in1, _in2);                                        \
+    _out3 = __lasx_xvsub_d(_in0, _in3);                                        \
+  }
+
+/*
+ * =============================================================================
+ * Description : Butterfly of 8 input vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, ~
+ *               Outputs - _out0, _out1, _out2, _out3, ~
+ * Details     : Butterfly operation
+ * Example     : LASX_BUTTERFLY_8
+ *               _out0 = _in0 + _in7;
+ *               _out1 = _in1 + _in6;
+ *               _out2 = _in2 + _in5;
+ *               _out3 = _in3 + _in4;
+ *               _out4 = _in3 - _in4;
+ *               _out5 = _in2 - _in5;
+ *               _out6 = _in1 - _in6;
+ *               _out7 = _in0 - _in7;
+ * =============================================================================
+ */
+#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                           _out7)                                           \
+  {                                                                         \
+    _out0 = __lasx_xvadd_b(_in0, _in7);                                     \
+    _out1 = __lasx_xvadd_b(_in1, _in6);                                     \
+    _out2 = __lasx_xvadd_b(_in2, _in5);                                     \
+    _out3 = __lasx_xvadd_b(_in3, _in4);                                     \
+    _out4 = __lasx_xvsub_b(_in3, _in4);                                     \
+    _out5 = __lasx_xvsub_b(_in2, _in5);                                     \
+    _out6 = __lasx_xvsub_b(_in1, _in6);                                     \
+    _out7 = __lasx_xvsub_b(_in0, _in7);                                     \
+  }
+
+#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                           _out7)                                           \
+  {                                                                         \
+    _out0 = __lasx_xvadd_h(_in0, _in7);                                     \
+    _out1 = __lasx_xvadd_h(_in1, _in6);                                     \
+    _out2 = __lasx_xvadd_h(_in2, _in5);                                     \
+    _out3 = __lasx_xvadd_h(_in3, _in4);                                     \
+    _out4 = __lasx_xvsub_h(_in3, _in4);                                     \
+    _out5 = __lasx_xvsub_h(_in2, _in5);                                     \
+    _out6 = __lasx_xvsub_h(_in1, _in6);                                     \
+    _out7 = __lasx_xvsub_h(_in0, _in7);                                     \
+  }
+
+#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                           _out7)                                           \
+  {                                                                         \
+    _out0 = __lasx_xvadd_w(_in0, _in7);                                     \
+    _out1 = __lasx_xvadd_w(_in1, _in6);                                     \
+    _out2 = __lasx_xvadd_w(_in2, _in5);                                     \
+    _out3 = __lasx_xvadd_w(_in3, _in4);                                     \
+    _out4 = __lasx_xvsub_w(_in3, _in4);                                     \
+    _out5 = __lasx_xvsub_w(_in2, _in5);                                     \
+    _out6 = __lasx_xvsub_w(_in1, _in6);                                     \
+    _out7 = __lasx_xvsub_w(_in0, _in7);                                     \
+  }
+
+#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                           _out7)                                           \
+  {                                                                         \
+    _out0 = __lasx_xvadd_d(_in0, _in7);                                     \
+    _out1 = __lasx_xvadd_d(_in1, _in6);                                     \
+    _out2 = __lasx_xvadd_d(_in2, _in5);                                     \
+    _out3 = __lasx_xvadd_d(_in3, _in4);                                     \
+    _out4 = __lasx_xvsub_d(_in3, _in4);                                     \
+    _out5 = __lasx_xvsub_d(_in2, _in5);                                     \
+    _out6 = __lasx_xvsub_d(_in1, _in6);                                     \
+    _out7 = __lasx_xvsub_d(_in0, _in7);                                     \
+  }
+
+#endif  // LASX
+
+/*
+ * =============================================================================
+ * Description : Print out elements in vector.
+ * Arguments   : Inputs  - RTYPE, _element_num, _in0, _enter
+ *               Outputs -
+ * Details     : Print out '_element_num' elements in 'RTYPE' vector '_in0', if
+ *               '_enter' is TRUE, prefix "\nVP:" will be added first.
+ * Example     : VECT_PRINT(v4i32,4,in0,1); // in0: 1,2,3,4
+ *               VP:1,2,3,4,
+ * =============================================================================
+ */
+#define VECT_PRINT(RTYPE, element_num, in0, enter)                 \
+  {                                                                \
+    RTYPE _tmp0 = (RTYPE)in0;                                      \
+    int _i = 0;                                                    \
+    if (enter) printf("\nVP:");                                    \
+    for (_i = 0; _i < element_num; _i++) printf("%d,", _tmp0[_i]); \
+  }
+
+#endif /* LOONGSON_INTRINSICS_H */
+#endif /* VPX_VPX_UTIL_LOONGSON_INTRINSICS_H_ */
diff --git a/vpxdec.c b/vpxdec.c
index ad368a230b..bfe6c1d6ba 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -446,7 +446,7 @@ static void generate_filename(const char *pattern, char *out, size_t q_len,
         case '7': snprintf(q, q_len - 1, "%07d", frame_in); break;
         case '8': snprintf(q, q_len - 1, "%08d", frame_in); break;
         case '9': snprintf(q, q_len - 1, "%09d", frame_in); break;
-        default: die("Unrecognized pattern %%%c\n", p[1]); break;
+        default: die("Unrecognized pattern %%%c\n", p[1]);
       }
 
       pat_len = strlen(q);
@@ -581,7 +581,10 @@ static int main_loop(int argc, const char **argv_) {
   /* Parse command line */
   exec_name = argv_[0];
   argv = argv_dup(argc - 1, argv_ + 1);
-
+  if (!argv) {
+    fprintf(stderr, "Error allocating argument list\n");
+    return EXIT_FAILURE;
+  }
   for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
     memset(&arg, 0, sizeof(arg));
     arg.argv_step = 1;
@@ -815,6 +818,10 @@ static int main_loop(int argc, const char **argv_) {
     ext_fb_list.num_external_frame_buffers = num_external_frame_buffers;
     ext_fb_list.ext_fb = (struct ExternalFrameBuffer *)calloc(
         num_external_frame_buffers, sizeof(*ext_fb_list.ext_fb));
+    if (!ext_fb_list.ext_fb) {
+      fprintf(stderr, "Failed to allocate ExternalFrameBuffer\n");
+      goto fail;
+    }
     if (vpx_codec_set_frame_buffer_functions(&decoder, get_vp9_frame_buffer,
                                              release_vp9_frame_buffer,
                                              &ext_fb_list)) {
@@ -930,6 +937,11 @@ static int main_loop(int argc, const char **argv_) {
           }
           scaled_img =
               vpx_img_alloc(NULL, img->fmt, render_width, render_height, 16);
+          if (!scaled_img) {
+            fprintf(stderr, "Failed to allocate scaled image (%d x %d)\n",
+                    render_width, render_height);
+            goto fail;
+          }
           scaled_img->bit_depth = img->bit_depth;
         }
 
@@ -966,6 +978,10 @@ static int main_loop(int argc, const char **argv_) {
         if (!img_shifted) {
           img_shifted =
               vpx_img_alloc(NULL, shifted_fmt, img->d_w, img->d_h, 16);
+          if (!img_shifted) {
+            fprintf(stderr, "Failed to allocate image\n");
+            goto fail;
+          }
           img_shifted->bit_depth = output_bit_depth;
         }
         if (output_bit_depth > img->bit_depth) {
@@ -980,7 +996,7 @@ static int main_loop(int argc, const char **argv_) {
 
       if (single_file) {
         if (use_y4m) {
-          char buf[Y4M_BUFFER_SIZE] = { 0 };
+          char y4m_buf[Y4M_BUFFER_SIZE] = { 0 };
           size_t len = 0;
           if (img->fmt == VPX_IMG_FMT_I440 || img->fmt == VPX_IMG_FMT_I44016) {
             fprintf(stderr, "Cannot produce y4m output for 440 sampling.\n");
@@ -989,21 +1005,22 @@ static int main_loop(int argc, const char **argv_) {
           if (frame_out == 1) {
             // Y4M file header
             len = y4m_write_file_header(
-                buf, sizeof(buf), vpx_input_ctx.width, vpx_input_ctx.height,
-                &vpx_input_ctx.framerate, img->fmt, img->bit_depth);
+                y4m_buf, sizeof(y4m_buf), vpx_input_ctx.width,
+                vpx_input_ctx.height, &vpx_input_ctx.framerate, img->fmt,
+                img->bit_depth);
             if (do_md5) {
-              MD5Update(&md5_ctx, (md5byte *)buf, (unsigned int)len);
+              MD5Update(&md5_ctx, (md5byte *)y4m_buf, (unsigned int)len);
             } else {
-              fputs(buf, outfile);
+              fputs(y4m_buf, outfile);
             }
           }
 
           // Y4M frame header
-          len = y4m_write_frame_header(buf, sizeof(buf));
+          len = y4m_write_frame_header(y4m_buf, sizeof(y4m_buf));
           if (do_md5) {
-            MD5Update(&md5_ctx, (md5byte *)buf, (unsigned int)len);
+            MD5Update(&md5_ctx, (md5byte *)y4m_buf, (unsigned int)len);
           } else {
-            fputs(buf, outfile);
+            fputs(y4m_buf, outfile);
           }
         } else {
           if (frame_out == 1) {
@@ -1110,6 +1127,10 @@ int main(int argc, const char **argv_) {
   int error = 0;
 
   argv = argv_dup(argc - 1, argv_ + 1);
+  if (!argv) {
+    fprintf(stderr, "Error allocating argument list\n");
+    return EXIT_FAILURE;
+  }
   for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
     memset(&arg, 0, sizeof(arg));
     arg.argv_step = 1;
diff --git a/vpxenc.c b/vpxenc.c
index 64288e83d2..d20bd3f967 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -58,8 +58,8 @@ static size_t wrap_fwrite(const void *ptr, size_t size, size_t nmemb,
 
 static const char *exec_name;
 
-static void warn_or_exit_on_errorv(vpx_codec_ctx_t *ctx, int fatal,
-                                   const char *s, va_list ap) {
+static VPX_TOOLS_FORMAT_PRINTF(3, 0) void warn_or_exit_on_errorv(
+    vpx_codec_ctx_t *ctx, int fatal, const char *s, va_list ap) {
   if (ctx->err) {
     const char *detail = vpx_codec_error_detail(ctx);
 
@@ -72,7 +72,9 @@ static void warn_or_exit_on_errorv(vpx_codec_ctx_t *ctx, int fatal,
   }
 }
 
-static void ctx_exit_on_error(vpx_codec_ctx_t *ctx, const char *s, ...) {
+static VPX_TOOLS_FORMAT_PRINTF(2,
+                               3) void ctx_exit_on_error(vpx_codec_ctx_t *ctx,
+                                                         const char *s, ...) {
   va_list ap;
 
   va_start(ap, s);
@@ -80,8 +82,8 @@ static void ctx_exit_on_error(vpx_codec_ctx_t *ctx, const char *s, ...) {
   va_end(ap);
 }
 
-static void warn_or_exit_on_error(vpx_codec_ctx_t *ctx, int fatal,
-                                  const char *s, ...) {
+static VPX_TOOLS_FORMAT_PRINTF(3, 4) void warn_or_exit_on_error(
+    vpx_codec_ctx_t *ctx, int fatal, const char *s, ...) {
   va_list ap;
 
   va_start(ap, s);
@@ -114,10 +116,6 @@ static const arg_def_t pass_arg =
     ARG_DEF(NULL, "pass", 1, "Pass to execute (1/2)");
 static const arg_def_t fpf_name =
     ARG_DEF(NULL, "fpf", 1, "First pass statistics file name");
-#if CONFIG_FP_MB_STATS
-static const arg_def_t fpmbf_name =
-    ARG_DEF(NULL, "fpmbf", 1, "First pass block statistics file name");
-#endif
 static const arg_def_t limit =
     ARG_DEF(NULL, "limit", 1, "Stop encoding after n input frames");
 static const arg_def_t skip =
@@ -287,6 +285,64 @@ static const arg_def_t *rc_args[] = {
   &buf_sz,           &buf_initial_sz,     &buf_optimal_sz, NULL
 };
 
+#if CONFIG_VP9_ENCODER
+static const arg_def_t use_vizier_rc_params =
+    ARG_DEF(NULL, "use-vizier-rc-params", 1, "Use vizier rc params");
+static const arg_def_t active_wq_factor =
+    ARG_DEF(NULL, "active-wq-factor", 1, "Active worst quality factor");
+static const arg_def_t err_per_mb_factor =
+    ARG_DEF(NULL, "err-per-mb-factor", 1, "Error per macroblock factor");
+static const arg_def_t sr_default_decay_limit = ARG_DEF(
+    NULL, "sr-default-decay-limit", 1, "Second reference default decay limit");
+static const arg_def_t sr_diff_factor =
+    ARG_DEF(NULL, "sr-diff-factor", 1, "Second reference diff factor");
+static const arg_def_t kf_err_per_mb_factor = ARG_DEF(
+    NULL, "kf-err-per-mb-factor", 1, "Keyframe error per macroblock factor");
+static const arg_def_t kf_frame_min_boost_factor =
+    ARG_DEF(NULL, "kf-frame-min-boost-factor", 1, "Keyframe min boost");
+static const arg_def_t kf_frame_max_boost_first_factor =
+    ARG_DEF(NULL, "kf-frame-max-boost-first-factor", 1,
+            "Max keyframe boost adjustment factor for first frame");
+static const arg_def_t kf_frame_max_boost_subs_factor =
+    ARG_DEF(NULL, "kf-frame-max-boost-subs-factor", 1,
+            "Max boost adjustment factor for subsequent KFs");
+static const arg_def_t kf_max_total_boost_factor = ARG_DEF(
+    NULL, "kf-max-total-boost-factor", 1, "Keyframe max total boost factor");
+static const arg_def_t gf_max_total_boost_factor =
+    ARG_DEF(NULL, "gf-max-total-boost-factor", 1,
+            "Golden frame max total boost factor");
+static const arg_def_t gf_frame_max_boost_factor =
+    ARG_DEF(NULL, "gf-frame-max-boost-factor", 1,
+            "Golden frame max per frame boost factor");
+static const arg_def_t zm_factor =
+    ARG_DEF(NULL, "zm-factor", 1, "Zero motion power factor");
+static const arg_def_t rd_mult_inter_qp_fac =
+    ARG_DEF(NULL, "rd-mult-inter-qp-fac", 1,
+            "RD multiplier adjustment for inter frames");
+static const arg_def_t rd_mult_arf_qp_fac =
+    ARG_DEF(NULL, "rd-mult-arf-qp-fac", 1,
+            "RD multiplier adjustment for alt-ref frames");
+static const arg_def_t rd_mult_key_qp_fac = ARG_DEF(
+    NULL, "rd-mult-key-qp-fac", 1, "RD multiplier adjustment for key frames");
+static const arg_def_t *vizier_rc_args[] = { &use_vizier_rc_params,
+                                             &active_wq_factor,
+                                             &err_per_mb_factor,
+                                             &sr_default_decay_limit,
+                                             &sr_diff_factor,
+                                             &kf_err_per_mb_factor,
+                                             &kf_frame_min_boost_factor,
+                                             &kf_frame_max_boost_first_factor,
+                                             &kf_frame_max_boost_subs_factor,
+                                             &kf_max_total_boost_factor,
+                                             &gf_max_total_boost_factor,
+                                             &gf_frame_max_boost_factor,
+                                             &zm_factor,
+                                             &rd_mult_inter_qp_fac,
+                                             &rd_mult_arf_qp_fac,
+                                             &rd_mult_key_qp_fac,
+                                             NULL };
+#endif
+
 static const arg_def_t bias_pct =
     ARG_DEF(NULL, "bias-pct", 1, "CBR/VBR bias (0=CBR, 100=VBR)");
 static const arg_def_t minsection_pct =
@@ -465,6 +521,16 @@ static const arg_def_t target_level = ARG_DEF(
 static const arg_def_t row_mt =
     ARG_DEF(NULL, "row-mt", 1,
             "Enable row based non-deterministic multi-threading in VP9");
+
+static const arg_def_t disable_loopfilter =
+    ARG_DEF(NULL, "disable-loopfilter", 1,
+            "Control Loopfilter in VP9:\n"
+            "                                          "
+            "0: Loopfilter on for all frames (default)\n"
+            "                                          "
+            "1: Loopfilter off for non reference frames\n"
+            "                                          "
+            "2: Loopfilter off for all frames");
 #endif
 
 #if CONFIG_VP9_ENCODER
@@ -495,6 +561,10 @@ static const arg_def_t *vp9_args[] = { &cpu_used_vp9,
                                        &max_gf_interval,
                                        &target_level,
                                        &row_mt,
+                                       &disable_loopfilter,
+// NOTE: The entries above have a corresponding entry in vp9_arg_ctrl_map. The
+// entries below do not have a corresponding entry in vp9_arg_ctrl_map. They
+// must be listed at the end of vp9_args.
 #if CONFIG_VP9_HIGHBITDEPTH
                                        &bitdeptharg,
                                        &inbitdeptharg,
@@ -527,6 +597,7 @@ static const int vp9_arg_ctrl_map[] = { VP8E_SET_CPUUSED,
                                         VP9E_SET_MAX_GF_INTERVAL,
                                         VP9E_SET_TARGET_LEVEL,
                                         VP9E_SET_ROW_MT,
+                                        VP9E_SET_DISABLE_LOOPFILTER,
                                         0 };
 #endif
 
@@ -561,6 +632,8 @@ static void show_help(FILE *fout, int shorthelp) {
 #if CONFIG_VP9_ENCODER
   fprintf(fout, "\nVP9 Specific Options:\n");
   arg_show_usage(fout, vp9_args);
+  fprintf(fout, "\nVizier Rate Control Options:\n");
+  arg_show_usage(fout, vizier_rc_args);
 #endif
   fprintf(fout,
           "\nStream timebase (--timebase):\n"
@@ -602,9 +675,6 @@ struct stream_config {
   struct vpx_codec_enc_cfg cfg;
   const char *out_fn;
   const char *stats_fn;
-#if CONFIG_FP_MB_STATS
-  const char *fpmb_stats_fn;
-#endif
   stereo_format_t stereo_fmt;
   int arg_ctrls[ARG_CTRL_CNT_MAX][2];
   int arg_ctrl_cnt;
@@ -632,9 +702,6 @@ struct stream_state {
   uint64_t cx_time;
   size_t nbytes;
   stats_io_t stats;
-#if CONFIG_FP_MB_STATS
-  stats_io_t fpmb_stats;
-#endif
   struct vpx_image *img;
   vpx_codec_ctx_t decoder;
   int mismatch_seen;
@@ -816,7 +883,7 @@ static struct stream_state *new_stream(struct VpxEncoderConfig *global,
 
     /* Default lag_in_frames is 0 in realtime mode CBR mode*/
     if (global->deadline == VPX_DL_REALTIME &&
-        stream->config.cfg.rc_end_usage == 1)
+        stream->config.cfg.rc_end_usage == VPX_CBR)
       stream->config.cfg.g_lag_in_frames = 0;
   }
 
@@ -871,10 +938,6 @@ static int parse_stream_params(struct VpxEncoderConfig *global,
       config->out_fn = arg.val;
     } else if (arg_match(&arg, &fpf_name, argi)) {
       config->stats_fn = arg.val;
-#if CONFIG_FP_MB_STATS
-    } else if (arg_match(&arg, &fpmbf_name, argi)) {
-      config->fpmb_stats_fn = arg.val;
-#endif
     } else if (arg_match(&arg, &use_webm, argi)) {
 #if CONFIG_WEBM_IO
       config->write_webm = 1;
@@ -971,6 +1034,40 @@ static int parse_stream_params(struct VpxEncoderConfig *global,
       config->cfg.kf_max_dist = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &kf_disabled, argi)) {
       config->cfg.kf_mode = VPX_KF_DISABLED;
+#if CONFIG_VP9_ENCODER
+    } else if (arg_match(&arg, &use_vizier_rc_params, argi)) {
+      config->cfg.use_vizier_rc_params = arg_parse_int(&arg);
+    } else if (arg_match(&arg, &active_wq_factor, argi)) {
+      config->cfg.active_wq_factor = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &err_per_mb_factor, argi)) {
+      config->cfg.err_per_mb_factor = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &sr_default_decay_limit, argi)) {
+      config->cfg.sr_default_decay_limit = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &sr_diff_factor, argi)) {
+      config->cfg.sr_diff_factor = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &kf_err_per_mb_factor, argi)) {
+      config->cfg.kf_err_per_mb_factor = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &kf_frame_min_boost_factor, argi)) {
+      config->cfg.kf_frame_min_boost_factor = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &kf_frame_max_boost_first_factor, argi)) {
+      config->cfg.kf_frame_max_boost_first_factor = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &kf_frame_max_boost_subs_factor, argi)) {
+      config->cfg.kf_frame_max_boost_subs_factor = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &kf_max_total_boost_factor, argi)) {
+      config->cfg.kf_max_total_boost_factor = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &gf_max_total_boost_factor, argi)) {
+      config->cfg.gf_max_total_boost_factor = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &gf_frame_max_boost_factor, argi)) {
+      config->cfg.gf_frame_max_boost_factor = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &zm_factor, argi)) {
+      config->cfg.zm_factor = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &rd_mult_inter_qp_fac, argi)) {
+      config->cfg.rd_mult_inter_qp_fac = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &rd_mult_arf_qp_fac, argi)) {
+      config->cfg.rd_mult_arf_qp_fac = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &rd_mult_key_qp_fac, argi)) {
+      config->cfg.rd_mult_key_qp_fac = arg_parse_rational(&arg);
+#endif
 #if CONFIG_VP9_HIGHBITDEPTH
     } else if (arg_match(&arg, &test16bitinternalarg, argi)) {
       if (strcmp(global->codec->name, "vp9") == 0) {
@@ -1063,17 +1160,6 @@ static void validate_stream_config(const struct stream_state *stream,
         fatal("Stream %d: duplicate stats file (from stream %d)",
               streami->index, stream->index);
     }
-
-#if CONFIG_FP_MB_STATS
-    /* Check for two streams sharing a mb stats file. */
-    if (streami != stream) {
-      const char *a = stream->config.fpmb_stats_fn;
-      const char *b = streami->config.fpmb_stats_fn;
-      if (a && b && !strcmp(a, b))
-        fatal("Stream %d: duplicate mb stats file (from stream %d)",
-              streami->index, stream->index);
-    }
-#endif
   }
 }
 
@@ -1165,6 +1251,10 @@ static void show_stream_config(struct stream_state *stream,
   SHOW(kf_mode);
   SHOW(kf_min_dist);
   SHOW(kf_max_dist);
+  // Temporary use for debug
+  SHOW(use_vizier_rc_params);
+  SHOW(active_wq_factor.num);
+  SHOW(active_wq_factor.den);
 }
 
 static void open_output_file(struct stream_state *stream,
@@ -1228,26 +1318,11 @@ static void setup_pass(struct stream_state *stream,
       fatal("Failed to open statistics store");
   }
 
-#if CONFIG_FP_MB_STATS
-  if (stream->config.fpmb_stats_fn) {
-    if (!stats_open_file(&stream->fpmb_stats, stream->config.fpmb_stats_fn,
-                         pass))
-      fatal("Failed to open mb statistics store");
-  } else {
-    if (!stats_open_mem(&stream->fpmb_stats, pass))
-      fatal("Failed to open mb statistics store");
-  }
-#endif
-
   stream->config.cfg.g_pass = global->passes == 2
                                   ? pass ? VPX_RC_LAST_PASS : VPX_RC_FIRST_PASS
                                   : VPX_RC_ONE_PASS;
   if (pass) {
     stream->config.cfg.rc_twopass_stats_in = stats_get(&stream->stats);
-#if CONFIG_FP_MB_STATS
-    stream->config.cfg.rc_firstpass_mb_stats_in =
-        stats_get(&stream->fpmb_stats);
-#endif
   }
 
   stream->cx_time = 0;
@@ -1459,13 +1534,6 @@ static void get_cx_data(struct stream_state *stream,
                     pkt->data.twopass_stats.sz);
         stream->nbytes += pkt->data.raw.sz;
         break;
-#if CONFIG_FP_MB_STATS
-      case VPX_CODEC_FPMB_STATS_PKT:
-        stats_write(&stream->fpmb_stats, pkt->data.firstpass_mb_stats.buf,
-                    pkt->data.firstpass_mb_stats.sz);
-        stream->nbytes += pkt->data.raw.sz;
-        break;
-#endif
       case VPX_CODEC_PSNR_PKT:
 
         if (global->show_psnr) {
@@ -1518,13 +1586,14 @@ static void test_decode(struct stream_state *stream,
   /* Get the internal reference frame */
   if (strcmp(codec->name, "vp8") == 0) {
     struct vpx_ref_frame ref_enc, ref_dec;
-    int width, height;
+    int aligned_width = (stream->config.cfg.g_w + 15) & ~15;
+    int aligned_height = (stream->config.cfg.g_h + 15) & ~15;
 
-    width = (stream->config.cfg.g_w + 15) & ~15;
-    height = (stream->config.cfg.g_h + 15) & ~15;
-    vpx_img_alloc(&ref_enc.img, VPX_IMG_FMT_I420, width, height, 1);
+    vpx_img_alloc(&ref_enc.img, VPX_IMG_FMT_I420, aligned_width, aligned_height,
+                  1);
     enc_img = ref_enc.img;
-    vpx_img_alloc(&ref_dec.img, VPX_IMG_FMT_I420, width, height, 1);
+    vpx_img_alloc(&ref_dec.img, VPX_IMG_FMT_I420, aligned_width, aligned_height,
+                  1);
     dec_img = ref_dec.img;
 
     ref_enc.frame_type = VP8_LAST_FRAME;
@@ -1624,6 +1693,7 @@ int main(int argc, const char **argv_) {
   int res = 0;
 
   memset(&input, 0, sizeof(input));
+  memset(&raw, 0, sizeof(raw));
   exec_name = argv_[0];
 
   /* Setup default input stream settings */
@@ -1637,6 +1707,10 @@ int main(int argc, const char **argv_) {
    * codec.
    */
   argv = argv_dup(argc - 1, argv_ + 1);
+  if (!argv) {
+    fprintf(stderr, "Error allocating argument list\n");
+    return EXIT_FAILURE;
+  }
   parse_global_config(&global, argv);
 
   if (argc < 3) usage_exit();
@@ -1769,14 +1843,10 @@ int main(int argc, const char **argv_) {
       FOREACH_STREAM(show_stream_config(stream, &global, &input));
 
     if (pass == (global.pass ? global.pass - 1 : 0)) {
-      if (input.file_type == FILE_TYPE_Y4M)
-        /*The Y4M reader does its own allocation.
-          Just initialize this here to avoid problems if we never read any
-           frames.*/
-        memset(&raw, 0, sizeof(raw));
-      else
+      // The Y4M reader does its own allocation.
+      if (input.file_type != FILE_TYPE_Y4M) {
         vpx_img_alloc(&raw, input.fmt, input.width, input.height, 32);
-
+      }
       FOREACH_STREAM(stream->rate_hist = init_rate_histogram(
                          &stream->config.cfg, &global.framerate));
     }
@@ -1900,10 +1970,9 @@ int main(int argc, const char **argv_) {
           } else {
             const int64_t input_pos = ftello(input.file);
             const int64_t input_pos_lagged = input_pos - lagged_count;
-            const int64_t limit = input.length;
 
             rate = cx_time ? input_pos_lagged * (int64_t)1000000 / cx_time : 0;
-            remaining = limit - input_pos + lagged_count;
+            remaining = input.length - input_pos + lagged_count;
           }
 
           average_rate =
@@ -1962,10 +2031,6 @@ int main(int argc, const char **argv_) {
 
     FOREACH_STREAM(stats_close(&stream->stats, global.passes - 1));
 
-#if CONFIG_FP_MB_STATS
-    FOREACH_STREAM(stats_close(&stream->fpmb_stats, global.passes - 1));
-#endif
-
     if (global.pass) break;
   }
 
diff --git a/vpxstats.c b/vpxstats.c
index 142e367bb4..c0dd14e450 100644
--- a/vpxstats.c
+++ b/vpxstats.c
@@ -41,7 +41,7 @@ int stats_open_file(stats_io_t *stats, const char *fpf, int pass) {
     stats->buf.buf = malloc(stats->buf_alloc_sz);
 
     if (!stats->buf.buf)
-      fatal("Failed to allocate first-pass stats buffer (%lu bytes)",
+      fatal("Failed to allocate first-pass stats buffer (%u bytes)",
             (unsigned int)stats->buf_alloc_sz);
 
     nbytes = fread(stats->buf.buf, 1, stats->buf.sz, stats->file);
diff --git a/warnings.c b/warnings.c
index a80da527f7..3e6e702536 100644
--- a/warnings.c
+++ b/warnings.c
@@ -98,7 +98,7 @@ void check_encoder_config(int disable_prompt,
   /* Count and print warnings. */
   for (warning = warning_list.warning_node; warning != NULL;
        warning = warning->next_warning, ++num_warnings) {
-    warn(warning->warning_string);
+    warn("%s", warning->warning_string);
   }
 
   free_warning_list(&warning_list);
diff --git a/webmdec.cc b/webmdec.cc
index d609075a93..f7671bb641 100644
--- a/webmdec.cc
+++ b/webmdec.cc
@@ -19,25 +19,25 @@
 namespace {
 
 void reset(struct WebmInputContext *const webm_ctx) {
-  if (webm_ctx->reader != NULL) {
+  if (webm_ctx->reader != nullptr) {
     mkvparser::MkvReader *const reader =
         reinterpret_cast<mkvparser::MkvReader *>(webm_ctx->reader);
     delete reader;
   }
-  if (webm_ctx->segment != NULL) {
+  if (webm_ctx->segment != nullptr) {
     mkvparser::Segment *const segment =
         reinterpret_cast<mkvparser::Segment *>(webm_ctx->segment);
     delete segment;
   }
-  if (webm_ctx->buffer != NULL) {
+  if (webm_ctx->buffer != nullptr) {
     delete[] webm_ctx->buffer;
   }
-  webm_ctx->reader = NULL;
-  webm_ctx->segment = NULL;
-  webm_ctx->buffer = NULL;
-  webm_ctx->cluster = NULL;
-  webm_ctx->block_entry = NULL;
-  webm_ctx->block = NULL;
+  webm_ctx->reader = nullptr;
+  webm_ctx->segment = nullptr;
+  webm_ctx->buffer = nullptr;
+  webm_ctx->cluster = nullptr;
+  webm_ctx->block_entry = nullptr;
+  webm_ctx->block = nullptr;
   webm_ctx->block_frame_index = 0;
   webm_ctx->video_track_index = 0;
   webm_ctx->timestamp_ns = 0;
@@ -84,7 +84,7 @@ int file_is_webm(struct WebmInputContext *webm_ctx,
   }
 
   const mkvparser::Tracks *const tracks = segment->GetTracks();
-  const mkvparser::VideoTrack *video_track = NULL;
+  const mkvparser::VideoTrack *video_track = nullptr;
   for (unsigned long i = 0; i < tracks->GetTracksCount(); ++i) {
     const mkvparser::Track *const track = tracks->GetTrackByIndex(i);
     if (track->GetType() == mkvparser::Track::kVideo) {
@@ -94,7 +94,7 @@ int file_is_webm(struct WebmInputContext *webm_ctx,
     }
   }
 
-  if (video_track == NULL || video_track->GetCodecId() == NULL) {
+  if (video_track == nullptr || video_track->GetCodecId() == nullptr) {
     rewind_and_reset(webm_ctx, vpx_ctx);
     return 0;
   }
@@ -137,12 +137,12 @@ int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer,
   do {
     long status = 0;
     bool get_new_block = false;
-    if (block_entry == NULL && !block_entry_eos) {
+    if (block_entry == nullptr && !block_entry_eos) {
       status = cluster->GetFirst(block_entry);
       get_new_block = true;
     } else if (block_entry_eos || block_entry->EOS()) {
       cluster = segment->GetNext(cluster);
-      if (cluster == NULL || cluster->EOS()) {
+      if (cluster == nullptr || cluster->EOS()) {
         *buffer_size = 0;
         webm_ctx->reached_eos = 1;
         return 1;
@@ -150,22 +150,22 @@ int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer,
       status = cluster->GetFirst(block_entry);
       block_entry_eos = false;
       get_new_block = true;
-    } else if (block == NULL ||
+    } else if (block == nullptr ||
                webm_ctx->block_frame_index == block->GetFrameCount() ||
                block->GetTrackNumber() != webm_ctx->video_track_index) {
       status = cluster->GetNext(block_entry, block_entry);
-      if (block_entry == NULL || block_entry->EOS()) {
+      if (block_entry == nullptr || block_entry->EOS()) {
         block_entry_eos = true;
         continue;
       }
       get_new_block = true;
     }
-    if (status || block_entry == NULL) {
+    if (status || block_entry == nullptr) {
       return -1;
     }
     if (get_new_block) {
       block = block_entry->GetBlock();
-      if (block == NULL) return -1;
+      if (block == nullptr) return -1;
       webm_ctx->block_frame_index = 0;
     }
   } while (block_entry_eos ||
@@ -181,7 +181,7 @@ int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer,
   if (frame.len > static_cast<long>(*buffer_size)) {
     delete[] * buffer;
     *buffer = new uint8_t[frame.len];
-    if (*buffer == NULL) {
+    if (*buffer == nullptr) {
       return -1;
     }
     webm_ctx->buffer = *buffer;
@@ -198,7 +198,7 @@ int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer,
 int webm_guess_framerate(struct WebmInputContext *webm_ctx,
                          struct VpxInputContext *vpx_ctx) {
   uint32_t i = 0;
-  uint8_t *buffer = NULL;
+  uint8_t *buffer = nullptr;
   size_t buffer_size = 0;
   while (webm_ctx->timestamp_ns < 1000000000 && i < 50) {
     if (webm_read_frame(webm_ctx, &buffer, &buffer_size)) {
@@ -210,10 +210,12 @@ int webm_guess_framerate(struct WebmInputContext *webm_ctx,
   vpx_ctx->framerate.denominator =
       static_cast<int>(webm_ctx->timestamp_ns / 1000);
   delete[] buffer;
+  // webm_ctx->buffer is assigned to the buffer pointer in webm_read_frame().
+  webm_ctx->buffer = nullptr;
 
   get_first_cluster(webm_ctx);
-  webm_ctx->block = NULL;
-  webm_ctx->block_entry = NULL;
+  webm_ctx->block = nullptr;
+  webm_ctx->block_entry = nullptr;
   webm_ctx->block_frame_index = 0;
   webm_ctx->timestamp_ns = 0;
   webm_ctx->reached_eos = 0;
diff --git a/webmdec.h b/webmdec.h
index d8618b07d6..6ae7ee16d0 100644
--- a/webmdec.h
+++ b/webmdec.h
@@ -27,7 +27,7 @@ struct WebmInputContext {
   const void *block;
   int block_frame_index;
   int video_track_index;
-  uint64_t timestamp_ns;
+  int64_t timestamp_ns;
   int is_key_frame;
   int reached_eos;
 };
diff --git a/webmenc.cc b/webmenc.cc
index 66606674b0..c718ab5a9f 100644
--- a/webmenc.cc
+++ b/webmenc.cc
@@ -90,6 +90,6 @@ void write_webm_file_footer(struct WebmOutputContext *webm_ctx) {
   segment->Finalize();
   delete segment;
   delete writer;
-  webm_ctx->writer = NULL;
-  webm_ctx->segment = NULL;
+  webm_ctx->writer = nullptr;
+  webm_ctx->segment = nullptr;
 }
diff --git a/y4menc.c b/y4menc.c
index 02b729e5bb..1877981279 100644
--- a/y4menc.c
+++ b/y4menc.c
@@ -17,39 +17,34 @@ int y4m_write_file_header(char *buf, size_t len, int width, int height,
   const char *color;
   switch (bit_depth) {
     case 8:
-      color = fmt == VPX_IMG_FMT_I444
-                  ? "C444\n"
-                  : fmt == VPX_IMG_FMT_I422 ? "C422\n" : "C420jpeg\n";
+      color = fmt == VPX_IMG_FMT_I444   ? "C444\n"
+              : fmt == VPX_IMG_FMT_I422 ? "C422\n"
+                                        : "C420jpeg\n";
       break;
     case 9:
-      color = fmt == VPX_IMG_FMT_I44416
-                  ? "C444p9 XYSCSS=444P9\n"
-                  : fmt == VPX_IMG_FMT_I42216 ? "C422p9 XYSCSS=422P9\n"
-                                              : "C420p9 XYSCSS=420P9\n";
+      color = fmt == VPX_IMG_FMT_I44416   ? "C444p9 XYSCSS=444P9\n"
+              : fmt == VPX_IMG_FMT_I42216 ? "C422p9 XYSCSS=422P9\n"
+                                          : "C420p9 XYSCSS=420P9\n";
       break;
     case 10:
-      color = fmt == VPX_IMG_FMT_I44416
-                  ? "C444p10 XYSCSS=444P10\n"
-                  : fmt == VPX_IMG_FMT_I42216 ? "C422p10 XYSCSS=422P10\n"
-                                              : "C420p10 XYSCSS=420P10\n";
+      color = fmt == VPX_IMG_FMT_I44416   ? "C444p10 XYSCSS=444P10\n"
+              : fmt == VPX_IMG_FMT_I42216 ? "C422p10 XYSCSS=422P10\n"
+                                          : "C420p10 XYSCSS=420P10\n";
       break;
     case 12:
-      color = fmt == VPX_IMG_FMT_I44416
-                  ? "C444p12 XYSCSS=444P12\n"
-                  : fmt == VPX_IMG_FMT_I42216 ? "C422p12 XYSCSS=422P12\n"
-                                              : "C420p12 XYSCSS=420P12\n";
+      color = fmt == VPX_IMG_FMT_I44416   ? "C444p12 XYSCSS=444P12\n"
+              : fmt == VPX_IMG_FMT_I42216 ? "C422p12 XYSCSS=422P12\n"
+                                          : "C420p12 XYSCSS=420P12\n";
       break;
     case 14:
-      color = fmt == VPX_IMG_FMT_I44416
-                  ? "C444p14 XYSCSS=444P14\n"
-                  : fmt == VPX_IMG_FMT_I42216 ? "C422p14 XYSCSS=422P14\n"
-                                              : "C420p14 XYSCSS=420P14\n";
+      color = fmt == VPX_IMG_FMT_I44416   ? "C444p14 XYSCSS=444P14\n"
+              : fmt == VPX_IMG_FMT_I42216 ? "C422p14 XYSCSS=422P14\n"
+                                          : "C420p14 XYSCSS=420P14\n";
       break;
     case 16:
-      color = fmt == VPX_IMG_FMT_I44416
-                  ? "C444p16 XYSCSS=444P16\n"
-                  : fmt == VPX_IMG_FMT_I42216 ? "C422p16 XYSCSS=422P16\n"
-                                              : "C420p16 XYSCSS=420P16\n";
+      color = fmt == VPX_IMG_FMT_I44416   ? "C444p16 XYSCSS=444P16\n"
+              : fmt == VPX_IMG_FMT_I42216 ? "C422p16 XYSCSS=422P16\n"
+                                          : "C420p16 XYSCSS=420P16\n";
       break;
     default: color = NULL; assert(0);
   }
diff --git a/y4minput.c b/y4minput.c
index 007bd9971b..210ce52fce 100644
--- a/y4minput.c
+++ b/y4minput.c
@@ -10,6 +10,7 @@
  *  Based on code from the OggTheora software codec source code,
  *  Copyright (C) 2002-2010 The Xiph.Org Foundation and contributors.
  */
+#include <assert.h>
 #include <errno.h>
 #include <stdlib.h>
 #include <string.h>
@@ -20,12 +21,13 @@
 // Reads 'size' bytes from 'file' into 'buf' with some fault tolerance.
 // Returns true on success.
 static int file_read(void *buf, size_t size, FILE *file) {
-  const int kMaxRetries = 5;
-  int retry_count = 0;
-  int file_error;
+  const int kMaxTries = 5;
+  int try_count = 0;
+  int file_error = 0;
   size_t len = 0;
-  do {
+  while (!feof(file) && len < size && try_count < kMaxTries) {
     const size_t n = fread((uint8_t *)buf + len, 1, size - len, file);
+    ++try_count;
     len += n;
     file_error = ferror(file);
     if (file_error) {
@@ -38,28 +40,21 @@ static int file_read(void *buf, size_t size, FILE *file) {
         return 0;
       }
     }
-  } while (!feof(file) && len < size && ++retry_count < kMaxRetries);
+  }
 
   if (!feof(file) && len != size) {
     fprintf(stderr,
             "Error reading file: %u of %u bytes read,"
-            " error: %d, retries: %d, %d: %s\n",
-            (uint32_t)len, (uint32_t)size, file_error, retry_count, errno,
+            " error: %d, tries: %d, %d: %s\n",
+            (uint32_t)len, (uint32_t)size, file_error, try_count, errno,
             strerror(errno));
   }
   return len == size;
 }
 
 static int y4m_parse_tags(y4m_input *_y4m, char *_tags) {
-  int got_w;
-  int got_h;
-  int got_fps;
-  int got_interlace;
-  int got_par;
-  int got_chroma;
   char *p;
   char *q;
-  got_w = got_h = got_fps = got_interlace = got_par = got_chroma = 0;
   for (p = _tags;; p = q) {
     /*Skip any leading spaces.*/
     while (*p == ' ') p++;
@@ -72,52 +67,119 @@ static int y4m_parse_tags(y4m_input *_y4m, char *_tags) {
     switch (p[0]) {
       case 'W': {
         if (sscanf(p + 1, "%d", &_y4m->pic_w) != 1) return -1;
-        got_w = 1;
         break;
       }
       case 'H': {
         if (sscanf(p + 1, "%d", &_y4m->pic_h) != 1) return -1;
-        got_h = 1;
         break;
       }
       case 'F': {
         if (sscanf(p + 1, "%d:%d", &_y4m->fps_n, &_y4m->fps_d) != 2) {
           return -1;
         }
-        got_fps = 1;
         break;
       }
       case 'I': {
         _y4m->interlace = p[1];
-        got_interlace = 1;
         break;
       }
       case 'A': {
         if (sscanf(p + 1, "%d:%d", &_y4m->par_n, &_y4m->par_d) != 2) {
           return -1;
         }
-        got_par = 1;
         break;
       }
       case 'C': {
         if (q - p > 16) return -1;
         memcpy(_y4m->chroma_type, p + 1, q - p - 1);
         _y4m->chroma_type[q - p - 1] = '\0';
-        got_chroma = 1;
         break;
       }
         /*Ignore unknown tags.*/
     }
   }
-  if (!got_w || !got_h || !got_fps) return -1;
-  if (!got_interlace) _y4m->interlace = '?';
-  if (!got_par) _y4m->par_n = _y4m->par_d = 0;
-  /*Chroma-type is not specified in older files, e.g., those generated by
-     mplayer.*/
-  if (!got_chroma) strcpy(_y4m->chroma_type, "420");
   return 0;
 }
 
+// Copy a single tag into the buffer, along with a null character.
+// Returns 0 if any file IO errors occur.
+static int copy_tag(char *buf, size_t buf_len, char *end_tag, FILE *file) {
+  size_t i;
+  assert(buf_len >= 1);
+  // Skip leading space characters.
+  do {
+    if (!file_read(buf, 1, file)) {
+      return 0;
+    }
+  } while (buf[0] == ' ');
+
+  // If we hit the newline, treat this as the "empty" tag.
+  if (buf[0] == '\n') {
+    buf[0] = '\0';
+    *end_tag = '\n';
+    return 1;
+  }
+
+  // Copy over characters until a space is hit, or the buffer is exhausted.
+  for (i = 1; i < buf_len; ++i) {
+    if (!file_read(buf + i, 1, file)) {
+      return 0;
+    }
+    if (buf[i] == ' ' || buf[i] == '\n') {
+      break;
+    }
+  }
+  if (i == buf_len) {
+    fprintf(stderr, "Error: Y4M header tags must be less than %lu characters\n",
+            (unsigned long)i);
+    return 0;
+  }
+  *end_tag = buf[i];
+  buf[i] = '\0';
+  return 1;
+}
+
+/* Returns 1 if tags were parsed successfully, 0 otherwise. */
+static int parse_tags(y4m_input *y4m_ctx, FILE *file) {
+  char tag[256];
+  char end; /* Character denoting the end of the tag, ' ' or '\n'. */
+  /* Set Y4M tags to defaults, updating them as processing occurs. Mandatory
+     fields are marked with -1 and will be checked after the tags are parsed. */
+  y4m_ctx->pic_w = -1;
+  y4m_ctx->pic_h = -1;
+  y4m_ctx->fps_n = -1; /* Also serves as marker for fps_d */
+  y4m_ctx->par_n = 0;
+  y4m_ctx->par_d = 0;
+  y4m_ctx->interlace = '?';
+  snprintf(y4m_ctx->chroma_type, sizeof(y4m_ctx->chroma_type), "420");
+
+  /* Find one tag at a time. */
+  do {
+    if (!copy_tag(tag, sizeof(tag), &end, file)) {
+      return 0;
+    }
+    /* y4m_parse_tags returns 0 on success. */
+    if (y4m_parse_tags(y4m_ctx, tag)) {
+      return 0;
+    }
+  } while (end != '\n');
+
+  /* Check the mandatory fields. */
+  if (y4m_ctx->pic_w == -1) {
+    fprintf(stderr, "Width field missing\n");
+    return 0;
+  }
+  if (y4m_ctx->pic_h == -1) {
+    fprintf(stderr, "Height field missing\n");
+    return 0;
+  }
+  if (y4m_ctx->fps_n == -1) {
+    fprintf(stderr, "FPS field missing\n");
+    return 0;
+  }
+  return 1;
+}
+
 /*All anti-aliasing filters in the following conversion functions are based on
    one of two window functions:
   The 6-tap Lanczos window (for down-sampling and shifts):
@@ -224,26 +286,6 @@ static void y4m_42xmpeg2_42xjpeg_helper(unsigned char *_dst,
   }
 }
 
-/*Handles both 422 and 420mpeg2 to 422jpeg and 420jpeg, respectively.*/
-static void y4m_convert_42xmpeg2_42xjpeg(y4m_input *_y4m, unsigned char *_dst,
-                                         unsigned char *_aux) {
-  int c_w;
-  int c_h;
-  int c_sz;
-  int pli;
-  /*Skip past the luma data.*/
-  _dst += _y4m->pic_w * _y4m->pic_h;
-  /*Compute the size of each chroma plane.*/
-  c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h;
-  c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v;
-  c_sz = c_w * c_h;
-  for (pli = 1; pli < 3; pli++) {
-    y4m_42xmpeg2_42xjpeg_helper(_dst, _aux, c_w, c_h);
-    _dst += c_sz;
-    _aux += c_sz;
-  }
-}
-
 /*This format is only used for interlaced content, but is included for
    completeness.
 
@@ -784,277 +826,277 @@ static void y4m_convert_null(y4m_input *_y4m, unsigned char *_dst,
   (void)_aux;
 }
 
-int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip,
-                   int only_420) {
-  char buffer[80] = { 0 };
-  int ret;
-  int i;
-  /*Read until newline, or 80 cols, whichever happens first.*/
-  for (i = 0; i < 79; i++) {
-    if (_nskip > 0) {
-      buffer[i] = *_skip++;
-      _nskip--;
-    } else {
-      if (!file_read(buffer + i, 1, _fin)) return -1;
-    }
-    if (buffer[i] == '\n') break;
+static const char TAG[] = "YUV4MPEG2";
+
+int y4m_input_open(y4m_input *y4m_ctx, FILE *file, char *skip_buffer,
+                   int num_skip, int only_420) {
+  // File must start with |TAG|.
+  char tag_buffer[9];  // 9 == strlen(TAG)
+  // Read as much as possible from |skip_buffer|, which were characters
+  // that were previously read from the file to do input-type detection.
+  assert(num_skip >= 0 && num_skip <= 8);
+  if (num_skip > 0) {
+    memcpy(tag_buffer, skip_buffer, num_skip);
   }
-  /*We skipped too much header data.*/
-  if (_nskip > 0) return -1;
-  if (i == 79) {
-    fprintf(stderr, "Error parsing header; not a YUV2MPEG2 file?\n");
+  // Start reading from the file now that the |skip_buffer| is depleted.
+  if (!file_read(tag_buffer + num_skip, 9 - num_skip, file)) {
     return -1;
   }
-  buffer[i] = '\0';
-  if (memcmp(buffer, "YUV4MPEG", 8)) {
-    fprintf(stderr, "Incomplete magic for YUV4MPEG file.\n");
+  if (memcmp(TAG, tag_buffer, 9) != 0) {
+    fprintf(stderr, "Error parsing header: must start with %s\n", TAG);
     return -1;
   }
-  if (buffer[8] != '2') {
-    fprintf(stderr, "Incorrect YUV input file version; YUV4MPEG2 required.\n");
+  // Next character must be a space.
+  if (!file_read(tag_buffer, 1, file) || tag_buffer[0] != ' ') {
+    fprintf(stderr, "Error parsing header: space must follow %s\n", TAG);
+    return -1;
   }
-  ret = y4m_parse_tags(_y4m, buffer + 5);
-  if (ret < 0) {
-    fprintf(stderr, "Error parsing YUV4MPEG2 header.\n");
-    return ret;
+  if (!parse_tags(y4m_ctx, file)) {
+    fprintf(stderr, "Error parsing %s header.\n", TAG);
   }
-  if (_y4m->interlace == '?') {
+  if (y4m_ctx->interlace == '?') {
     fprintf(stderr,
             "Warning: Input video interlacing format unknown; "
             "assuming progressive scan.\n");
-  } else if (_y4m->interlace != 'p') {
+  } else if (y4m_ctx->interlace != 'p') {
     fprintf(stderr,
             "Input video is interlaced; "
             "Only progressive scan handled.\n");
     return -1;
   }
-  _y4m->vpx_fmt = VPX_IMG_FMT_I420;
-  _y4m->bps = 12;
-  _y4m->bit_depth = 8;
-  if (strcmp(_y4m->chroma_type, "420") == 0 ||
-      strcmp(_y4m->chroma_type, "420jpeg") == 0) {
-    _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v =
-        _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz =
-        _y4m->pic_w * _y4m->pic_h +
-        2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2);
+  y4m_ctx->vpx_fmt = VPX_IMG_FMT_I420;
+  y4m_ctx->bps = 12;
+  y4m_ctx->bit_depth = 8;
+  y4m_ctx->aux_buf = NULL;
+  y4m_ctx->dst_buf = NULL;
+  if (strcmp(y4m_ctx->chroma_type, "420") == 0 ||
+      strcmp(y4m_ctx->chroma_type, "420jpeg") == 0 ||
+      strcmp(y4m_ctx->chroma_type, "420mpeg2") == 0) {
+    y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_v =
+        y4m_ctx->dst_c_dec_v = 2;
+    y4m_ctx->dst_buf_read_sz =
+        y4m_ctx->pic_w * y4m_ctx->pic_h +
+        2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2);
     /* Natively supported: no conversion required. */
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-    _y4m->convert = y4m_convert_null;
-  } else if (strcmp(_y4m->chroma_type, "420p10") == 0) {
-    _y4m->src_c_dec_h = 2;
-    _y4m->dst_c_dec_h = 2;
-    _y4m->src_c_dec_v = 2;
-    _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz =
-        2 * (_y4m->pic_w * _y4m->pic_h +
-             2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2));
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_null;
+  } else if (strcmp(y4m_ctx->chroma_type, "420p10") == 0) {
+    y4m_ctx->src_c_dec_h = 2;
+    y4m_ctx->dst_c_dec_h = 2;
+    y4m_ctx->src_c_dec_v = 2;
+    y4m_ctx->dst_c_dec_v = 2;
+    y4m_ctx->dst_buf_read_sz =
+        2 * (y4m_ctx->pic_w * y4m_ctx->pic_h +
+             2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2));
     /* Natively supported: no conversion required. */
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-    _y4m->convert = y4m_convert_null;
-    _y4m->bit_depth = 10;
-    _y4m->bps = 15;
-    _y4m->vpx_fmt = VPX_IMG_FMT_I42016;
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_null;
+    y4m_ctx->bit_depth = 10;
+    y4m_ctx->bps = 15;
+    y4m_ctx->vpx_fmt = VPX_IMG_FMT_I42016;
     if (only_420) {
       fprintf(stderr, "Unsupported conversion from 420p10 to 420jpeg\n");
       return -1;
     }
-  } else if (strcmp(_y4m->chroma_type, "420p12") == 0) {
-    _y4m->src_c_dec_h = 2;
-    _y4m->dst_c_dec_h = 2;
-    _y4m->src_c_dec_v = 2;
-    _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz =
-        2 * (_y4m->pic_w * _y4m->pic_h +
-             2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2));
+  } else if (strcmp(y4m_ctx->chroma_type, "420p12") == 0) {
+    y4m_ctx->src_c_dec_h = 2;
+    y4m_ctx->dst_c_dec_h = 2;
+    y4m_ctx->src_c_dec_v = 2;
+    y4m_ctx->dst_c_dec_v = 2;
+    y4m_ctx->dst_buf_read_sz =
+        2 * (y4m_ctx->pic_w * y4m_ctx->pic_h +
+             2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2));
     /* Natively supported: no conversion required. */
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-    _y4m->convert = y4m_convert_null;
-    _y4m->bit_depth = 12;
-    _y4m->bps = 18;
-    _y4m->vpx_fmt = VPX_IMG_FMT_I42016;
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_null;
+    y4m_ctx->bit_depth = 12;
+    y4m_ctx->bps = 18;
+    y4m_ctx->vpx_fmt = VPX_IMG_FMT_I42016;
     if (only_420) {
       fprintf(stderr, "Unsupported conversion from 420p12 to 420jpeg\n");
       return -1;
     }
-  } else if (strcmp(_y4m->chroma_type, "420mpeg2") == 0) {
-    _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v =
-        _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
-    /*Chroma filter required: read into the aux buf first.*/
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz =
-        2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2);
-    _y4m->convert = y4m_convert_42xmpeg2_42xjpeg;
-  } else if (strcmp(_y4m->chroma_type, "420paldv") == 0) {
-    _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v =
-        _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+  } else if (strcmp(y4m_ctx->chroma_type, "420paldv") == 0) {
+    y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_v =
+        y4m_ctx->dst_c_dec_v = 2;
+    y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h;
     /*Chroma filter required: read into the aux buf first.
       We need to make two filter passes, so we need some extra space in the
        aux buffer.*/
-    _y4m->aux_buf_sz = 3 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2);
-    _y4m->aux_buf_read_sz =
-        2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2);
-    _y4m->convert = y4m_convert_42xpaldv_42xjpeg;
-  } else if (strcmp(_y4m->chroma_type, "422jpeg") == 0) {
-    _y4m->src_c_dec_h = _y4m->dst_c_dec_h = 2;
-    _y4m->src_c_dec_v = 1;
-    _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+    y4m_ctx->aux_buf_sz =
+        3 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2);
+    y4m_ctx->aux_buf_read_sz =
+        2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2);
+    y4m_ctx->convert = y4m_convert_42xpaldv_42xjpeg;
+  } else if (strcmp(y4m_ctx->chroma_type, "422jpeg") == 0) {
+    y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = 2;
+    y4m_ctx->src_c_dec_v = 1;
+    y4m_ctx->dst_c_dec_v = 2;
+    y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h;
     /*Chroma filter required: read into the aux buf first.*/
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz =
-        2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
-    _y4m->convert = y4m_convert_422jpeg_420jpeg;
-  } else if (strcmp(_y4m->chroma_type, "422") == 0) {
-    _y4m->src_c_dec_h = 2;
-    _y4m->src_c_dec_v = 1;
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz =
+        2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h;
+    y4m_ctx->convert = y4m_convert_422jpeg_420jpeg;
+  } else if (strcmp(y4m_ctx->chroma_type, "422") == 0) {
+    y4m_ctx->src_c_dec_h = 2;
+    y4m_ctx->src_c_dec_v = 1;
     if (only_420) {
-      _y4m->dst_c_dec_h = 2;
-      _y4m->dst_c_dec_v = 2;
-      _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+      y4m_ctx->dst_c_dec_h = 2;
+      y4m_ctx->dst_c_dec_v = 2;
+      y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h;
       /*Chroma filter required: read into the aux buf first.
         We need to make two filter passes, so we need some extra space in the
          aux buffer.*/
-      _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
-      _y4m->aux_buf_sz =
-          _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
-      _y4m->convert = y4m_convert_422_420jpeg;
+      y4m_ctx->aux_buf_read_sz =
+          2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h;
+      y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz +
+                            ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h;
+      y4m_ctx->convert = y4m_convert_422_420jpeg;
     } else {
-      _y4m->vpx_fmt = VPX_IMG_FMT_I422;
-      _y4m->bps = 16;
-      _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
-      _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
-      _y4m->dst_buf_read_sz =
-          _y4m->pic_w * _y4m->pic_h + 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
+      y4m_ctx->vpx_fmt = VPX_IMG_FMT_I422;
+      y4m_ctx->bps = 16;
+      y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h;
+      y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v;
+      y4m_ctx->dst_buf_read_sz =
+          y4m_ctx->pic_w * y4m_ctx->pic_h +
+          2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h;
       /*Natively supported: no conversion required.*/
-      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-      _y4m->convert = y4m_convert_null;
+      y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+      y4m_ctx->convert = y4m_convert_null;
     }
-  } else if (strcmp(_y4m->chroma_type, "422p10") == 0) {
-    _y4m->src_c_dec_h = 2;
-    _y4m->src_c_dec_v = 1;
-    _y4m->vpx_fmt = VPX_IMG_FMT_I42216;
-    _y4m->bps = 20;
-    _y4m->bit_depth = 10;
-    _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
-    _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
-    _y4m->dst_buf_read_sz = 2 * (_y4m->pic_w * _y4m->pic_h +
-                                 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h);
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-    _y4m->convert = y4m_convert_null;
+  } else if (strcmp(y4m_ctx->chroma_type, "422p10") == 0) {
+    y4m_ctx->src_c_dec_h = 2;
+    y4m_ctx->src_c_dec_v = 1;
+    y4m_ctx->vpx_fmt = VPX_IMG_FMT_I42216;
+    y4m_ctx->bps = 20;
+    y4m_ctx->bit_depth = 10;
+    y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h;
+    y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v;
+    y4m_ctx->dst_buf_read_sz =
+        2 * (y4m_ctx->pic_w * y4m_ctx->pic_h +
+             2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h);
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_null;
     if (only_420) {
       fprintf(stderr, "Unsupported conversion from 422p10 to 420jpeg\n");
       return -1;
     }
-  } else if (strcmp(_y4m->chroma_type, "422p12") == 0) {
-    _y4m->src_c_dec_h = 2;
-    _y4m->src_c_dec_v = 1;
-    _y4m->vpx_fmt = VPX_IMG_FMT_I42216;
-    _y4m->bps = 24;
-    _y4m->bit_depth = 12;
-    _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
-    _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
-    _y4m->dst_buf_read_sz = 2 * (_y4m->pic_w * _y4m->pic_h +
-                                 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h);
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-    _y4m->convert = y4m_convert_null;
+  } else if (strcmp(y4m_ctx->chroma_type, "422p12") == 0) {
+    y4m_ctx->src_c_dec_h = 2;
+    y4m_ctx->src_c_dec_v = 1;
+    y4m_ctx->vpx_fmt = VPX_IMG_FMT_I42216;
+    y4m_ctx->bps = 24;
+    y4m_ctx->bit_depth = 12;
+    y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h;
+    y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v;
+    y4m_ctx->dst_buf_read_sz =
+        2 * (y4m_ctx->pic_w * y4m_ctx->pic_h +
+             2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h);
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_null;
     if (only_420) {
       fprintf(stderr, "Unsupported conversion from 422p12 to 420jpeg\n");
       return -1;
     }
-  } else if (strcmp(_y4m->chroma_type, "411") == 0) {
-    _y4m->src_c_dec_h = 4;
-    _y4m->dst_c_dec_h = 2;
-    _y4m->src_c_dec_v = 1;
-    _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+  } else if (strcmp(y4m_ctx->chroma_type, "411") == 0) {
+    y4m_ctx->src_c_dec_h = 4;
+    y4m_ctx->dst_c_dec_h = 2;
+    y4m_ctx->src_c_dec_v = 1;
+    y4m_ctx->dst_c_dec_v = 2;
+    y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h;
     /*Chroma filter required: read into the aux buf first.
       We need to make two filter passes, so we need some extra space in the
        aux buffer.*/
-    _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 3) / 4) * _y4m->pic_h;
-    _y4m->aux_buf_sz =
-        _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
-    _y4m->convert = y4m_convert_411_420jpeg;
+    y4m_ctx->aux_buf_read_sz = 2 * ((y4m_ctx->pic_w + 3) / 4) * y4m_ctx->pic_h;
+    y4m_ctx->aux_buf_sz =
+        y4m_ctx->aux_buf_read_sz + ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h;
+    y4m_ctx->convert = y4m_convert_411_420jpeg;
     fprintf(stderr, "Unsupported conversion from yuv 411\n");
     return -1;
-  } else if (strcmp(_y4m->chroma_type, "444") == 0) {
-    _y4m->src_c_dec_h = 1;
-    _y4m->src_c_dec_v = 1;
+  } else if (strcmp(y4m_ctx->chroma_type, "444") == 0) {
+    y4m_ctx->src_c_dec_h = 1;
+    y4m_ctx->src_c_dec_v = 1;
     if (only_420) {
-      _y4m->dst_c_dec_h = 2;
-      _y4m->dst_c_dec_v = 2;
-      _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+      y4m_ctx->dst_c_dec_h = 2;
+      y4m_ctx->dst_c_dec_v = 2;
+      y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h;
       /*Chroma filter required: read into the aux buf first.
         We need to make two filter passes, so we need some extra space in the
          aux buffer.*/
-      _y4m->aux_buf_read_sz = 2 * _y4m->pic_w * _y4m->pic_h;
-      _y4m->aux_buf_sz =
-          _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
-      _y4m->convert = y4m_convert_444_420jpeg;
+      y4m_ctx->aux_buf_read_sz = 2 * y4m_ctx->pic_w * y4m_ctx->pic_h;
+      y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz +
+                            ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h;
+      y4m_ctx->convert = y4m_convert_444_420jpeg;
     } else {
-      _y4m->vpx_fmt = VPX_IMG_FMT_I444;
-      _y4m->bps = 24;
-      _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
-      _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
-      _y4m->dst_buf_read_sz = 3 * _y4m->pic_w * _y4m->pic_h;
+      y4m_ctx->vpx_fmt = VPX_IMG_FMT_I444;
+      y4m_ctx->bps = 24;
+      y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h;
+      y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v;
+      y4m_ctx->dst_buf_read_sz = 3 * y4m_ctx->pic_w * y4m_ctx->pic_h;
       /*Natively supported: no conversion required.*/
-      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-      _y4m->convert = y4m_convert_null;
+      y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+      y4m_ctx->convert = y4m_convert_null;
     }
-  } else if (strcmp(_y4m->chroma_type, "444p10") == 0) {
-    _y4m->src_c_dec_h = 1;
-    _y4m->src_c_dec_v = 1;
-    _y4m->vpx_fmt = VPX_IMG_FMT_I44416;
-    _y4m->bps = 30;
-    _y4m->bit_depth = 10;
-    _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
-    _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
-    _y4m->dst_buf_read_sz = 2 * 3 * _y4m->pic_w * _y4m->pic_h;
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-    _y4m->convert = y4m_convert_null;
+  } else if (strcmp(y4m_ctx->chroma_type, "444p10") == 0) {
+    y4m_ctx->src_c_dec_h = 1;
+    y4m_ctx->src_c_dec_v = 1;
+    y4m_ctx->vpx_fmt = VPX_IMG_FMT_I44416;
+    y4m_ctx->bps = 30;
+    y4m_ctx->bit_depth = 10;
+    y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h;
+    y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v;
+    y4m_ctx->dst_buf_read_sz = 2 * 3 * y4m_ctx->pic_w * y4m_ctx->pic_h;
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_null;
     if (only_420) {
       fprintf(stderr, "Unsupported conversion from 444p10 to 420jpeg\n");
       return -1;
     }
-  } else if (strcmp(_y4m->chroma_type, "444p12") == 0) {
-    _y4m->src_c_dec_h = 1;
-    _y4m->src_c_dec_v = 1;
-    _y4m->vpx_fmt = VPX_IMG_FMT_I44416;
-    _y4m->bps = 36;
-    _y4m->bit_depth = 12;
-    _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
-    _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
-    _y4m->dst_buf_read_sz = 2 * 3 * _y4m->pic_w * _y4m->pic_h;
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-    _y4m->convert = y4m_convert_null;
+  } else if (strcmp(y4m_ctx->chroma_type, "444p12") == 0) {
+    y4m_ctx->src_c_dec_h = 1;
+    y4m_ctx->src_c_dec_v = 1;
+    y4m_ctx->vpx_fmt = VPX_IMG_FMT_I44416;
+    y4m_ctx->bps = 36;
+    y4m_ctx->bit_depth = 12;
+    y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h;
+    y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v;
+    y4m_ctx->dst_buf_read_sz = 2 * 3 * y4m_ctx->pic_w * y4m_ctx->pic_h;
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_null;
     if (only_420) {
       fprintf(stderr, "Unsupported conversion from 444p12 to 420jpeg\n");
       return -1;
     }
-  } else if (strcmp(_y4m->chroma_type, "mono") == 0) {
-    _y4m->src_c_dec_h = _y4m->src_c_dec_v = 0;
-    _y4m->dst_c_dec_h = _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+  } else if (strcmp(y4m_ctx->chroma_type, "mono") == 0) {
+    y4m_ctx->src_c_dec_h = y4m_ctx->src_c_dec_v = 0;
+    y4m_ctx->dst_c_dec_h = y4m_ctx->dst_c_dec_v = 2;
+    y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h;
     /*No extra space required, but we need to clear the chroma planes.*/
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-    _y4m->convert = y4m_convert_mono_420jpeg;
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_mono_420jpeg;
   } else {
-    fprintf(stderr, "Unknown chroma sampling type: %s\n", _y4m->chroma_type);
+    fprintf(stderr, "Unknown chroma sampling type: %s\n", y4m_ctx->chroma_type);
     return -1;
   }
   /*The size of the final frame buffers is always computed from the
      destination chroma decimation type.*/
-  _y4m->dst_buf_sz =
-      _y4m->pic_w * _y4m->pic_h +
-      2 * ((_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h) *
-          ((_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v);
-  if (_y4m->bit_depth == 8)
-    _y4m->dst_buf = (unsigned char *)malloc(_y4m->dst_buf_sz);
+  y4m_ctx->dst_buf_sz =
+      y4m_ctx->pic_w * y4m_ctx->pic_h +
+      2 * ((y4m_ctx->pic_w + y4m_ctx->dst_c_dec_h - 1) / y4m_ctx->dst_c_dec_h) *
+          ((y4m_ctx->pic_h + y4m_ctx->dst_c_dec_v - 1) / y4m_ctx->dst_c_dec_v);
+  if (y4m_ctx->bit_depth == 8)
+    y4m_ctx->dst_buf = (unsigned char *)malloc(y4m_ctx->dst_buf_sz);
   else
-    _y4m->dst_buf = (unsigned char *)malloc(2 * _y4m->dst_buf_sz);
+    y4m_ctx->dst_buf = (unsigned char *)malloc(2 * y4m_ctx->dst_buf_sz);
+  if (!y4m_ctx->dst_buf) return -1;
 
-  if (_y4m->aux_buf_sz > 0)
-    _y4m->aux_buf = (unsigned char *)malloc(_y4m->aux_buf_sz);
+  if (y4m_ctx->aux_buf_sz > 0) {
+    y4m_ctx->aux_buf = (unsigned char *)malloc(y4m_ctx->aux_buf_sz);
+    if (!y4m_ctx->aux_buf) {
+      free(y4m_ctx->dst_buf);
+      return -1;
+    }
+  }
   return 0;
 }
 
@@ -1106,6 +1148,7 @@ int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, vpx_image_t *_img) {
   _img->fmt = _y4m->vpx_fmt;
   _img->w = _img->d_w = _y4m->pic_w;
   _img->h = _img->d_h = _y4m->pic_h;
+  _img->bit_depth = _y4m->bit_depth;
   _img->x_chroma_shift = _y4m->dst_c_dec_h >> 1;
   _img->y_chroma_shift = _y4m->dst_c_dec_v >> 1;
   _img->bps = _y4m->bps;
diff --git a/y4minput.h b/y4minput.h
index a4a8b18dc5..573750d749 100644
--- a/y4minput.h
+++ b/y4minput.h
@@ -56,8 +56,16 @@ struct y4m_input {
   unsigned int bit_depth;
 };
 
-int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip,
-                   int only_420);
+/**
+ * Open the input file, treating it as Y4M. |y4m_ctx| is filled in after
+ * reading it. The |skip_buffer| indicates bytes that were previously read
+ * from |file|, to do input-type detection; this buffer will be read before
+ * the |file| is read. It is of size |num_skip|, which *must* be 8 or less.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int y4m_input_open(y4m_input *y4m_ctx, FILE *file, char *skip_buffer,
+                   int num_skip, int only_420);
 void y4m_input_close(y4m_input *_y4m);
 int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, vpx_image_t *img);